Merge tag 'idmapped-mounts-v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 23 Feb 2021 21:39:45 +0000 (13:39 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 23 Feb 2021 21:39:45 +0000 (13:39 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Feb 2021 21:39:45 +0000 (13:39 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 23 Feb 2021 21:39:45 +0000 (13:39 -0800)
diff --combined Documentation/filesystems/porting.rst

index 1f8cf8e,de1dcec..6336102
--- 1/Documentation/filesystems/porting.rst
--- 2/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@@ -717,6 -717,8 +717,8 @@@ be removed.  Switch while you still can
   **mandatory**
   
   ->setxattr() and xattr_handler.set() get dentry and inode passed separately.
+ The xattr_handler.set() gets passed the user namespace of the mount the inode
+ is seen from so filesystems can idmap the i_uid and i_gid accordingly.
   dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
   in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
   called before we attach dentry to inode and !@#!@##!@$!$#!@#$!@$!@$ smack
@@@ -865,19 -867,3 +867,19 @@@ no matter what.  Everything is handled 
   
   clone_private_mount() returns a longterm mount now, so the proper destructor of
   its result is kern_unmount() or kern_unmount_array().
+ +
+ +---
+ +
+ +**mandatory**
+ +
+ +Zero-length bvec segments are disallowed; they must be filtered out before
+ +being passed on to an iterator.
+ +
+ +---
+ +
+ +**mandatory**
+ +
+ +For bvec based iterators bio_iov_iter_get_pages() now doesn't copy bvecs but
+ +uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and
+ +page references stay until I/O has completed, i.e. until ->ki_complete() has
+ +been called or returned with non -EIOCBQUEUED code.
diff --combined Documentation/filesystems/vfs.rst

index a4d64b1,98290ef..2049bbf
--- 1/Documentation/filesystems/vfs.rst
--- 2/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@@ -112,7 -112,7 +112,7 @@@ members are defined
   
   .. code-block:: c
   
- -      struct file_system_operations {
+ +      struct file_system_type {
                 const char *name;
                 int fs_flags;
                 struct dentry *(*mount) (struct file_system_type *, int,
@@@ -270,10 -270,7 +270,10 @@@ or bottom half)
         ->alloc_inode.
   
   ``dirty_inode``
- -      this method is called by the VFS to mark an inode dirty.
+ +      this method is called by the VFS when an inode is marked dirty.
+ +      This is specifically for the inode itself being marked dirty,
+ +      not its data.  If the update needs to be persisted by fdatasync(),
+ +      then I_DIRTY_DATASYNC will be set in the flags argument.
   
   ``write_inode``
         this method is called when the VFS needs to write an inode to
@@@ -418,28 -415,29 +418,29 @@@ As of kernel 2.6.22, the following memb
   .. code-block:: c
   
         struct inode_operations {
-               int (*create) (struct inode *,struct dentry *, umode_t, bool);
+               int (*create) (struct user_namespace *, struct inode *,struct dentry *, umode_t, bool);
                 struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
                 int (*link) (struct dentry *,struct inode *,struct dentry *);
                 int (*unlink) (struct inode *,struct dentry *);
-               int (*symlink) (struct inode *,struct dentry *,const char *);
-               int (*mkdir) (struct inode *,struct dentry *,umode_t);
+               int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,const char *);
+               int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,umode_t);
                 int (*rmdir) (struct inode *,struct dentry *);
-               int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
-               int (*rename) (struct inode *, struct dentry *,
+               int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,umode_t,dev_t);
+               int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
                                struct inode *, struct dentry *, unsigned int);
                 int (*readlink) (struct dentry *, char __user *,int);
                 const char *(*get_link) (struct dentry *, struct inode *,
                                          struct delayed_call *);
-               int (*permission) (struct inode *, int);
+               int (*permission) (struct user_namespace *, struct inode *, int);
                 int (*get_acl)(struct inode *, int);
-               int (*setattr) (struct dentry *, struct iattr *);
-               int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+               int (*setattr) (struct user_namespace *, struct dentry *, struct iattr *);
+               int (*getattr) (struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int);
                 ssize_t (*listxattr) (struct dentry *, char *, size_t);
                 void (*update_time)(struct inode *, struct timespec *, int);
                 int (*atomic_open)(struct inode *, struct dentry *, struct file *,
                                    unsigned open_flag, umode_t create_mode);
-               int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+               int (*tmpfile) (struct user_namespace *, struct inode *, struct dentry *, umode_t);
+               int (*set_acl)(struct user_namespace *, struct inode *, struct posix_acl *, int);
         };
   
   Again, all methods are called without any locks being held, unless
diff --combined arch/powerpc/kernel/syscalls/syscall.tbl

index 96b2157,72e5aa6..0b2480c
--- 1/arch/powerpc/kernel/syscalls/syscall.tbl
--- 2/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@@ -9,7 -9,9 +9,7 @@@
   #
   0     nospu   restart_syscall                 sys_restart_syscall
   1     nospu   exit                            sys_exit
- -2     32      fork                            ppc_fork                        sys_fork
- -2     64      fork                            sys_fork
- -2     spu     fork                            sys_ni_syscall
+ +2     nospu   fork                            sys_fork
   3     common  read                            sys_read
   4     common  write                           sys_write
   5     common  open                            sys_open                        compat_sys_open
@@@ -158,7 -160,9 +158,7 @@@
   119   32      sigreturn                       sys_sigreturn                   compat_sys_sigreturn
   119   64      sigreturn                       sys_ni_syscall
   119   spu     sigreturn                       sys_ni_syscall
- -120   32      clone                           ppc_clone                       sys_clone
- -120   64      clone                           sys_clone
- -120   spu     clone                           sys_ni_syscall
+ +120   nospu   clone                           sys_clone
   121   common  setdomainname                   sys_setdomainname
   122   common  uname                           sys_newuname
   123   common  modify_ldt                      sys_ni_syscall
@@@ -240,7 -244,9 +240,7 @@@
   186   spu     sendfile                        sys_sendfile64
   187   common  getpmsg                         sys_ni_syscall
   188   common  putpmsg                         sys_ni_syscall
- -189   32      vfork                           ppc_vfork                       sys_vfork
- -189   64      vfork                           sys_vfork
- -189   spu     vfork                           sys_ni_syscall
+ +189   nospu   vfork                           sys_vfork
   190   common  ugetrlimit                      sys_getrlimit                   compat_sys_getrlimit
   191   common  readahead                       sys_readahead                   compat_sys_readahead
   192   32      mmap2                           sys_mmap2                       compat_sys_mmap2
@@@ -316,7 -322,9 +316,7 @@@
   248   32      clock_nanosleep                 sys_clock_nanosleep_time32
   248   64      clock_nanosleep                 sys_clock_nanosleep
   248   spu     clock_nanosleep                 sys_clock_nanosleep
- -249   32      swapcontext                     ppc_swapcontext                 compat_sys_swapcontext
- -249   64      swapcontext                     sys_swapcontext
- -249   spu     swapcontext                     sys_ni_syscall
+ +249   nospu   swapcontext                     sys_swapcontext                 compat_sys_swapcontext
   250   common  tgkill                          sys_tgkill
   251   32      utimes                          sys_utimes_time32
   251   64      utimes                          sys_utimes
@@@ -514,10 -522,13 +514,11 @@@
   432   common  fsmount                         sys_fsmount
   433   common  fspick                          sys_fspick
   434   common  pidfd_open                      sys_pidfd_open
- -435   32      clone3                          ppc_clone3                      sys_clone3
- -435   64      clone3                          sys_clone3
- -435   spu     clone3                          sys_ni_syscall
+ +435   nospu   clone3                          sys_clone3
   436   common  close_range                     sys_close_range
   437   common  openat2                         sys_openat2
   438   common  pidfd_getfd                     sys_pidfd_getfd
   439   common  faccessat2                      sys_faccessat2
   440   common  process_madvise                 sys_process_madvise
   441   common  epoll_pwait2                    sys_epoll_pwait2                compat_sys_epoll_pwait2
+ 442   common  mount_setattr                   sys_mount_setattr
diff --combined fs/affs/namei.c

index 5400a87,9ad22be..bcab189
--- 1/fs/affs/namei.c
--- 2/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@@ -242,7 -242,8 +242,8 @@@ affs_unlink(struct inode *dir, struct d
   }
   
   int
- affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+ affs_create(struct user_namespace *mnt_userns, struct inode *dir,
+           struct dentry *dentry, umode_t mode, bool excl)
   {
         struct super_block *sb = dir->i_sb;
         struct inode    *inode;
@@@ -273,7 -274,8 +274,8 @@@
   }
   
   int
- affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+          struct dentry *dentry, umode_t mode)
   {
         struct inode            *inode;
         int                      error;
@@@ -311,7 -313,8 +313,8 @@@ affs_rmdir(struct inode *dir, struct de
   }
   
   int
- affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+ affs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+            struct dentry *dentry, const char *symname)
   {
         struct super_block      *sb = dir->i_sb;
         struct buffer_head      *bh;
@@@ -460,10 -463,8 +463,10 @@@ affs_xrename(struct inode *old_dir, str
                 return -EIO;
   
         bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino);
- -      if (!bh_new)
+ +      if (!bh_new) {
+ +              affs_brelse(bh_old);
                 return -EIO;
+ +      }
   
         /* Remove old header from its parent directory. */
         affs_lock_dir(old_dir);
@@@ -500,9 -501,9 +503,9 @@@ done
         return retval;
   }
   
- int affs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-                       struct inode *new_dir, struct dentry *new_dentry,
-                       unsigned int flags)
+ int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+                struct dentry *old_dentry, struct inode *new_dir,
+                struct dentry *new_dentry, unsigned int flags)
   {
   
         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
diff --combined fs/btrfs/ctree.h

index 3bc00ae,9c0b438..bd65935
--- 1/fs/btrfs/ctree.h
--- 2/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@@ -298,8 -298,7 +298,8 @@@ struct btrfs_super_block 
          BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |       \
          BTRFS_FEATURE_INCOMPAT_NO_HOLES        |       \
          BTRFS_FEATURE_INCOMPAT_METADATA_UUID   |       \
- -       BTRFS_FEATURE_INCOMPAT_RAID1C34)
+ +       BTRFS_FEATURE_INCOMPAT_RAID1C34        |       \
+ +       BTRFS_FEATURE_INCOMPAT_ZONED)
   
   #define BTRFS_FEATURE_INCOMPAT_SAFE_SET                       \
         (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@@ -564,9 -563,6 +564,9 @@@ enum 
   
         /* Indicate that we need to cleanup space cache v1 */
         BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
+ +
+ +      /* Indicate that we can't trust the free space tree for caching yet */
+ +      BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
   };
   
   /*
@@@ -798,7 -794,7 +798,7 @@@ struct btrfs_fs_info 
         /* used to keep from writing metadata until there is a nice batch */
         struct percpu_counter dirty_metadata_bytes;
         struct percpu_counter delalloc_bytes;
- -      struct percpu_counter dio_bytes;
+ +      struct percpu_counter ordered_bytes;
         s32 dirty_metadata_batch;
         s32 delalloc_batch;
   
@@@ -934,7 -930,6 +934,7 @@@
         /* Used to reclaim the metadata space in the background. */
         struct work_struct async_reclaim_work;
         struct work_struct async_data_reclaim_work;
+ +      struct work_struct preempt_reclaim_work;
   
         spinlock_t unused_bgs_lock;
         struct list_head unused_bgs;
@@@ -976,9 -971,6 +976,9 @@@
   
         /* Max size to emit ZONE_APPEND write command */
         u64 max_zone_append_size;
+ +      struct mutex zoned_meta_io_lock;
+ +      spinlock_t treelog_bg_lock;
+ +      u64 treelog_bg;
   
   #ifdef CONFIG_BTRFS_FS_REF_VERIFY
         spinlock_t ref_verify_lock;
@@@ -1109,7 -1101,7 +1109,7 @@@ struct btrfs_root 
   
         u32 type;
   
- -      u64 highest_objectid;
+ +      u64 free_objectid;
   
         struct btrfs_key defrag_progress;
         struct btrfs_key defrag_max;
@@@ -2745,7 -2737,6 +2745,7 @@@ enum btrfs_flush_state 
         ALLOC_CHUNK_FORCE       =       8,
         RUN_DELAYED_IPUTS       =       9,
         COMMIT_TRANS            =       10,
+ +      FORCE_COMMIT_TRANS      =       11,
   };
   
   int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
@@@ -3106,14 -3097,15 +3106,14 @@@ int btrfs_truncate_inode_items(struct b
                                u32 min_type);
   
   int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
- -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
                                bool in_reclaim_context);
   int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                               unsigned int extra_bits,
                               struct extent_state **cached_state);
   int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                              struct btrfs_root *new_root,
- -                           struct btrfs_root *parent_root,
- -                           u64 new_dirid);
+ +                           struct btrfs_root *parent_root);
    void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
                                unsigned *bits);
   void btrfs_clear_delalloc_extent(struct inode *inode,
@@@ -3124,8 -3116,6 +3124,8 @@@ void btrfs_split_delalloc_extent(struc
                                  struct extent_state *orig, u64 split);
   int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
                              unsigned long bio_flags);
+ +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+ +                                    unsigned int size);
   void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
   vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
   int btrfs_readpage(struct file *file, struct page *page);
@@@ -3635,7 -3625,8 +3635,8 @@@ static inline int __btrfs_fs_compat_ro(
   /* acl.c */
   #ifdef CONFIG_BTRFS_FS_POSIX_ACL
   struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
- int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+                 struct posix_acl *acl, int type);
   int btrfs_init_acl(struct btrfs_trans_handle *trans,
                    struct inode *inode, struct inode *dir);
   #else
diff --combined fs/btrfs/inode.c

index 535abf8,c0b11db..2e1c282
--- 1/fs/btrfs/inode.c
--- 2/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -50,7 -50,6 +50,7 @@@
   #include "delalloc-space.h"
   #include "block-group.h"
   #include "space-info.h"
+ +#include "zoned.h"
   
   struct btrfs_iget_args {
         u64 ino;
@@@ -693,7 -692,8 +693,7 @@@ cont
                                                      NULL,
                                                      clear_flags,
                                                      PAGE_UNLOCK |
- -                                                   PAGE_CLEAR_DIRTY |
- -                                                   PAGE_SET_WRITEBACK |
+ +                                                   PAGE_START_WRITEBACK |
                                                      page_error_op |
                                                      PAGE_END_WRITEBACK);
   
@@@ -917,6 -917,7 +917,6 @@@ retry
                                                 ins.objectid,
                                                 async_extent->ram_size,
                                                 ins.offset,
- -                                              BTRFS_ORDERED_COMPRESSED,
                                                 async_extent->compress_type);
                 if (ret) {
                         btrfs_drop_extent_cache(inode, async_extent->start,
@@@ -933,7 -934,8 +933,7 @@@
                                 async_extent->start +
                                 async_extent->ram_size - 1,
                                 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- -                              PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- -                              PAGE_SET_WRITEBACK);
+ +                              PAGE_UNLOCK | PAGE_START_WRITEBACK);
                 if (btrfs_submit_compressed_write(inode, async_extent->start,
                                     async_extent->ram_size,
                                     ins.objectid,
@@@ -969,8 -971,9 +969,8 @@@ out_free
                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                      EXTENT_DELALLOC_NEW |
                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
- -                                   PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- -                                   PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- -                                   PAGE_SET_ERROR);
+ +                                   PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ +                                   PAGE_END_WRITEBACK | PAGE_SET_ERROR);
         free_async_extent_pages(async_extent);
         kfree(async_extent);
         goto again;
@@@ -1068,7 -1071,8 +1068,7 @@@ static noinline int cow_file_range(stru
                                      EXTENT_LOCKED | EXTENT_DELALLOC |
                                      EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                                      EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- -                                   PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- -                                   PAGE_END_WRITEBACK);
+ +                                   PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
                         *nr_written = *nr_written +
                              (end - start + PAGE_SIZE) / PAGE_SIZE;
                         *page_started = 1;
@@@ -1123,8 -1127,7 +1123,8 @@@
                 free_extent_map(em);
   
                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- -                                             ram_size, cur_alloc_size, 0);
+ +                                             ram_size, cur_alloc_size,
+ +                                             BTRFS_ORDERED_REGULAR);
                 if (ret)
                         goto out_drop_extent_cache;
   
@@@ -1191,7 -1194,8 +1191,7 @@@ out_reserve
   out_unlock:
         clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
- -      page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
- -              PAGE_END_WRITEBACK;
+ +      page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
         /*
          * If we reserved an extent for our delalloc range (or a subrange) and
          * failed to create the respective ordered extent, then it means that
@@@ -1316,8 -1320,9 +1316,8 @@@ static int cow_file_range_async(struct 
                 unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
                         EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
                         EXTENT_DO_ACCOUNTING;
- -              unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- -                      PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
- -                      PAGE_SET_ERROR;
+ +              unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
+ +                                       PAGE_END_WRITEBACK | PAGE_SET_ERROR;
   
                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                              clear_bits, page_ops);
@@@ -1394,29 -1399,6 +1394,29 @@@
         return 0;
   }
   
+ +static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+ +                                     struct page *locked_page, u64 start,
+ +                                     u64 end, int *page_started,
+ +                                     unsigned long *nr_written)
+ +{
+ +      int ret;
+ +
+ +      ret = cow_file_range(inode, locked_page, start, end, page_started,
+ +                           nr_written, 0);
+ +      if (ret)
+ +              return ret;
+ +
+ +      if (*page_started)
+ +              return 0;
+ +
+ +      __set_page_dirty_nobuffers(locked_page);
+ +      account_page_redirty(locked_page);
+ +      extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ +      *page_started = 1;
+ +
+ +      return 0;
+ +}
+ +
   static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
                                         u64 bytenr, u64 num_bytes)
   {
@@@ -1537,7 -1519,8 +1537,7 @@@ static noinline int run_delalloc_nocow(
                                              EXTENT_LOCKED | EXTENT_DELALLOC |
                                              EXTENT_DO_ACCOUNTING |
                                              EXTENT_DEFRAG, PAGE_UNLOCK |
- -                                           PAGE_CLEAR_DIRTY |
- -                                           PAGE_SET_WRITEBACK |
+ +                                           PAGE_START_WRITEBACK |
                                              PAGE_END_WRITEBACK);
                 return -ENOMEM;
         }
@@@ -1859,7 -1842,8 +1859,7 @@@ error
                                              locked_page, EXTENT_LOCKED |
                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
- -                                           PAGE_CLEAR_DIRTY |
- -                                           PAGE_SET_WRITEBACK |
+ +                                           PAGE_START_WRITEBACK |
                                              PAGE_END_WRITEBACK);
         btrfs_free_path(path);
         return ret;
@@@ -1894,24 -1878,17 +1894,24 @@@ int btrfs_run_delalloc_range(struct btr
   {
         int ret;
         int force_cow = need_force_cow(inode, start, end);
+ +      const bool zoned = btrfs_is_zoned(inode->root->fs_info);
   
         if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ +              ASSERT(!zoned);
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 1, nr_written);
         } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
+ +              ASSERT(!zoned);
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 0, nr_written);
         } else if (!inode_can_compress(inode) ||
                    !inode_need_compress(inode, start, end)) {
- -              ret = cow_file_range(inode, locked_page, start, end,
- -                                   page_started, nr_written, 1);
+ +              if (zoned)
+ +                      ret = run_delalloc_zoned(inode, locked_page, start, end,
+ +                                               page_started, nr_written);
+ +              else
+ +                      ret = cow_file_range(inode, locked_page, start, end,
+ +                                           page_started, nr_written, 1);
         } else {
                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
                 ret = cow_file_range_async(inode, wbc, locked_page, start, end,
@@@ -2206,10 -2183,9 +2206,10 @@@ int btrfs_bio_fits_in_stripe(struct pag
         struct inode *inode = page->mapping->host;
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         u64 logical = bio->bi_iter.bi_sector << 9;
+ +      struct extent_map *em;
         u64 length = 0;
         u64 map_length;
- -      int ret;
+ +      int ret = 0;
         struct btrfs_io_geometry geom;
   
         if (bio_flags & EXTENT_BIO_COMPRESSED)
@@@ -2217,19 -2193,14 +2217,19 @@@
   
         length = bio->bi_iter.bi_size;
         map_length = length;
- -      ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
- -                                  &geom);
+ +      em = btrfs_get_chunk_map(fs_info, logical, map_length);
+ +      if (IS_ERR(em))
+ +              return PTR_ERR(em);
+ +      ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical,
+ +                                  map_length, &geom);
         if (ret < 0)
- -              return ret;
+ +              goto out;
   
         if (geom.len < length + size)
- -              return 1;
- -      return 0;
+ +              ret = 1;
+ +out:
+ +      free_extent_map(em);
+ +      return ret;
   }
   
   /*
@@@ -2246,119 -2217,6 +2246,119 @@@ static blk_status_t btrfs_submit_bio_st
         return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
   }
   
+ +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
+ +                                    unsigned int size)
+ +{
+ +      struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ +      struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ +      struct btrfs_ordered_extent *ordered;
+ +      u64 len = bio->bi_iter.bi_size + size;
+ +      bool ret = true;
+ +
+ +      ASSERT(btrfs_is_zoned(fs_info));
+ +      ASSERT(fs_info->max_zone_append_size > 0);
+ +      ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND);
+ +
+ +      /* Ordered extent not yet created, so we're good */
+ +      ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
+ +      if (!ordered)
+ +              return ret;
+ +
+ +      if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len >
+ +          ordered->disk_bytenr + ordered->disk_num_bytes)
+ +              ret = false;
+ +
+ +      btrfs_put_ordered_extent(ordered);
+ +
+ +      return ret;
+ +}
+ +
+ +static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
+ +                                         struct bio *bio, loff_t file_offset)
+ +{
+ +      struct btrfs_ordered_extent *ordered;
+ +      struct extent_map *em = NULL, *em_new = NULL;
+ +      struct extent_map_tree *em_tree = &inode->extent_tree;
+ +      u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ +      u64 len = bio->bi_iter.bi_size;
+ +      u64 end = start + len;
+ +      u64 ordered_end;
+ +      u64 pre, post;
+ +      int ret = 0;
+ +
+ +      ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+ +      if (WARN_ON_ONCE(!ordered))
+ +              return BLK_STS_IOERR;
+ +
+ +      /* No need to split */
+ +      if (ordered->disk_num_bytes == len)
+ +              goto out;
+ +
+ +      /* We cannot split once end_bio'd ordered extent */
+ +      if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
+ +      /* We cannot split a compressed ordered extent */
+ +      if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
+ +      ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
+ +      /* bio must be in one ordered extent */
+ +      if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
+ +      /* Checksum list should be empty */
+ +      if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
+ +      pre = start - ordered->disk_bytenr;
+ +      post = ordered_end - end;
+ +
+ +      ret = btrfs_split_ordered_extent(ordered, pre, post);
+ +      if (ret)
+ +              goto out;
+ +
+ +      read_lock(&em_tree->lock);
+ +      em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
+ +      if (!em) {
+ +              read_unlock(&em_tree->lock);
+ +              ret = -EIO;
+ +              goto out;
+ +      }
+ +      read_unlock(&em_tree->lock);
+ +
+ +      ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ +      /*
+ +       * We cannot reuse em_new here but have to create a new one, as
+ +       * unpin_extent_cache() expects the start of the extent map to be the
+ +       * logical offset of the file, which does not hold true anymore after
+ +       * splitting.
+ +       */
+ +      em_new = create_io_em(inode, em->start + pre, len,
+ +                            em->start + pre, em->block_start + pre, len,
+ +                            len, len, BTRFS_COMPRESS_NONE,
+ +                            BTRFS_ORDERED_REGULAR);
+ +      if (IS_ERR(em_new)) {
+ +              ret = PTR_ERR(em_new);
+ +              goto out;
+ +      }
+ +      free_extent_map(em_new);
+ +
+ +out:
+ +      free_extent_map(em);
+ +      btrfs_put_ordered_extent(ordered);
+ +
+ +      return errno_to_blk_status(ret);
+ +}
+ +
   /*
    * extent_io.c submission hook. This does the right thing for csum calculation
    * on write, or reading the csums from the tree before a read.
@@@ -2394,16 -2252,7 +2394,16 @@@ blk_status_t btrfs_submit_data_bio(stru
         if (btrfs_is_free_space_inode(BTRFS_I(inode)))
                 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
   
- -      if (bio_op(bio) != REQ_OP_WRITE) {
+ +      if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ +              struct page *page = bio_first_bvec_all(bio)->bv_page;
+ +              loff_t file_offset = page_offset(page);
+ +
+ +              ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+ +              if (ret)
+ +                      goto out;
+ +      }
+ +
+ +      if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
                 ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
                 if (ret)
                         goto out;
@@@ -2905,9 -2754,6 +2905,9 @@@ static int btrfs_finish_ordered_io(stru
                 goto out;
         }
   
+ +      if (ordered_extent->disk)
+ +              btrfs_rewrite_logical_zoned(ordered_extent);
+ +
         btrfs_free_io_failure_record(inode, start, end);
   
         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
@@@ -3257,16 -3103,14 +3257,16 @@@ void btrfs_run_delayed_iputs(struct btr
   }
   
   /**
- - * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
- - * @fs_info - the fs_info for this fs
- - * @return - EINTR if we were killed, 0 if nothing's pending
+ + * Wait for flushing all delayed iputs
+ + *
+ + * @fs_info:  the filesystem
    *
    * This will wait on any delayed iputs that are currently running with KILLABLE
    * set.  Once they are all done running we will return, unless we are killed in
    * which case we return EINTR. This helps in user operations like fallocate etc
    * that might get blocked on the iputs.
+ + *
+ + * Return EINTR if we were killed, 0 if nothing's pending
    */
   int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
   {
@@@ -4876,9 -4720,6 +4876,9 @@@ again
                 ret = -ENOMEM;
                 goto out;
         }
+ +      ret = set_page_extent_mapped(page);
+ +      if (ret < 0)
+ +              goto out_unlock;
   
         if (!PageUptodate(page)) {
                 ret = btrfs_readpage(NULL, page);
@@@ -4896,6 -4737,7 +4896,6 @@@
         wait_on_page_writeback(page);
   
         lock_extent_bits(io_tree, block_start, block_end, &cached_state);
- -      set_page_extent_mapped(page);
   
         ordered = btrfs_lookup_ordered_extent(inode, block_start);
         if (ordered) {
@@@ -5169,15 -5011,6 +5169,15 @@@ static int btrfs_setsize(struct inode *
                 btrfs_drew_write_unlock(&root->snapshot_lock);
                 btrfs_end_transaction(trans);
         } else {
+ +              struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ +
+ +              if (btrfs_is_zoned(fs_info)) {
+ +                      ret = btrfs_wait_ordered_range(inode,
+ +                                      ALIGN(newsize, fs_info->sectorsize),
+ +                                      (u64)-1);
+ +                      if (ret)
+ +                              return ret;
+ +              }
   
                 /*
                  * We're truncating a file that used to have good data down to
@@@ -5212,7 -5045,8 +5212,8 @@@
         return ret;
   }
   
- static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                        struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@@ -5221,7 -5055,7 +5222,7 @@@
         if (btrfs_root_readonly(root))
                 return -EROFS;
   
-       err = setattr_prepare(dentry, attr);
+       err = setattr_prepare(&init_user_ns, dentry, attr);
         if (err)
                 return err;
   
@@@ -5232,12 -5066,13 +5233,13 @@@
         }
   
         if (attr->ia_valid) {
-               setattr_copy(inode, attr);
+               setattr_copy(&init_user_ns, inode, attr);
                 inode_inc_iversion(inode);
                 err = btrfs_dirty_inode(inode);
   
                 if (!err && attr->ia_valid & ATTR_MODE)
-                       err = posix_acl_chmod(inode, inode->i_mode);
+                       err = posix_acl_chmod(&init_user_ns, inode,
+                                             inode->i_mode);
         }
   
         return err;
@@@ -6357,7 -6192,7 +6359,7 @@@ static struct inode *btrfs_new_inode(st
         if (ret != 0)
                 goto fail_unlock;
   
-       inode_init_owner(inode, dir, mode);
+       inode_init_owner(&init_user_ns, inode, dir, mode);
         inode_set_bytes(inode, 0);
   
         inode->i_mtime = current_time(inode);
@@@ -6518,8 -6353,8 +6520,8 @@@ static int btrfs_add_nondir(struct btrf
         return err;
   }
   
- static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
-                       umode_t mode, dev_t rdev)
+ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+                      struct dentry *dentry, umode_t mode, dev_t rdev)
   {
         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
         struct btrfs_trans_handle *trans;
@@@ -6538,7 -6373,7 +6540,7 @@@
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
   
- -      err = btrfs_find_free_objectid(root, &objectid);
+ +      err = btrfs_get_free_objectid(root, &objectid);
         if (err)
                 goto out_unlock;
   
@@@ -6582,8 -6417,8 +6584,8 @@@ out_unlock
         return err;
   }
   
- static int btrfs_create(struct inode *dir, struct dentry *dentry,
-                       umode_t mode, bool excl)
+ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+                       struct dentry *dentry, umode_t mode, bool excl)
   {
         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
         struct btrfs_trans_handle *trans;
@@@ -6602,7 -6437,7 +6604,7 @@@
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
   
- -      err = btrfs_find_free_objectid(root, &objectid);
+ +      err = btrfs_get_free_objectid(root, &objectid);
         if (err)
                 goto out_unlock;
   
@@@ -6727,7 -6562,8 +6729,8 @@@ fail
         return err;
   }
   
- static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+                      struct dentry *dentry, umode_t mode)
   {
         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
         struct inode *inode = NULL;
@@@ -6746,7 -6582,7 +6749,7 @@@
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
   
- -      err = btrfs_find_free_objectid(root, &objectid);
+ +      err = btrfs_get_free_objectid(root, &objectid);
         if (err)
                 goto out_fail;
   
@@@ -7270,6 -7106,9 +7273,6 @@@ static struct extent_map *btrfs_new_ext
    * @strict:   if true, omit optimizations that might force us into unnecessary
    *            cow. e.g., don't trust generation number.
    *
- - * This function will flush ordered extents in the range to ensure proper
- - * nocow checks for (nowait == false) case.
- - *
    * Return:
    * >0 and update @len if we can do nocow write
    *  0 if we can't do nocow write
@@@ -7777,9 -7616,6 +7780,9 @@@ static int btrfs_dio_iomap_begin(struc
         iomap->bdev = fs_info->fs_devices->latest_bdev;
         iomap->length = len;
   
+ +      if (write && btrfs_use_zone_append(BTRFS_I(inode), em))
+ +              iomap->flags |= IOMAP_F_ZONE_APPEND;
+ +
         free_extent_map(em);
   
         return 0;
@@@ -7849,7 -7685,7 +7852,7 @@@ static void btrfs_dio_private_put(struc
         if (!refcount_dec_and_test(&dip->refs))
                 return;
   
- -      if (bio_op(dip->dio_bio) == REQ_OP_WRITE) {
+ +      if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
                 __endio_write_update_ordered(BTRFS_I(dip->inode),
                                              dip->logical_offset,
                                              dip->bytes,
@@@ -7964,8 -7800,10 +7967,8 @@@ static void __endio_write_update_ordere
                                         NULL);
                         btrfs_queue_work(wq, &ordered->work);
                 }
- -              /*
- -               * If btrfs_dec_test_ordered_pending does not find any ordered
- -               * extent in the range, we can exit.
- -               */
+ +
+ +              /* No ordered extent found in the range, exit */
                 if (ordered_offset == last_offset)
                         return;
                 /*
@@@ -8006,8 -7844,6 +8009,8 @@@ static void btrfs_end_dio_bio(struct bi
         if (err)
                 dip->dio_bio->bi_status = err;
   
+ +      btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+ +
         bio_put(bio);
         btrfs_dio_private_put(dip);
   }
@@@ -8017,7 -7853,7 +8020,7 @@@ static inline blk_status_t btrfs_submit
   {
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         struct btrfs_dio_private *dip = bio->bi_private;
- -      bool write = bio_op(bio) == REQ_OP_WRITE;
+ +      bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
         blk_status_t ret;
   
         /* Check btrfs_submit_bio_hook() for rules about async submit. */
@@@ -8067,7 -7903,7 +8070,7 @@@ static struct btrfs_dio_private *btrfs_
                                                           struct inode *inode,
                                                           loff_t file_offset)
   {
- -      const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ +      const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
         const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
         size_t dip_size;
         struct btrfs_dio_private *dip;
@@@ -8097,7 -7933,7 +8100,7 @@@
   static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
                 struct bio *dio_bio, loff_t file_offset)
   {
- -      const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ +      const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
         const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
                              BTRFS_BLOCK_GROUP_RAID56_MASK);
@@@ -8108,12 -7944,10 +8111,12 @@@
         u64 submit_len;
         int clone_offset = 0;
         int clone_len;
+ +      u64 logical;
         int ret;
         blk_status_t status;
         struct btrfs_io_geometry geom;
         struct btrfs_dio_data *dio_data = iomap->private;
+ +      struct extent_map *em = NULL;
   
         dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
         if (!dip) {
@@@ -8142,18 -7976,12 +8145,18 @@@
         submit_len = dio_bio->bi_iter.bi_size;
   
         do {
- -              ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio),
- -                                          start_sector << 9, submit_len,
- -                                          &geom);
+ +              logical = start_sector << 9;
+ +              em = btrfs_get_chunk_map(fs_info, logical, submit_len);
+ +              if (IS_ERR(em)) {
+ +                      status = errno_to_blk_status(PTR_ERR(em));
+ +                      em = NULL;
+ +                      goto out_err_em;
+ +              }
+ +              ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
+ +                                          logical, submit_len, &geom);
                 if (ret) {
                         status = errno_to_blk_status(ret);
- -                      goto out_err;
+ +                      goto out_err_em;
                 }
                 ASSERT(geom.len <= INT_MAX);
   
@@@ -8168,19 -7996,6 +8171,19 @@@
                 bio->bi_end_io = btrfs_end_dio_bio;
                 btrfs_io_bio(bio)->logical = file_offset;
   
+ +              WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
+ +                           fs_info->max_zone_append_size &&
+ +                           bio_op(bio) != REQ_OP_ZONE_APPEND);
+ +
+ +              if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ +                      status = extract_ordered_extent(BTRFS_I(inode), bio,
+ +                                                      file_offset);
+ +                      if (status) {
+ +                              bio_put(bio);
+ +                              goto out_err;
+ +                      }
+ +              }
+ +
                 ASSERT(submit_len >= clone_len);
                 submit_len -= clone_len;
   
@@@ -8211,24 -8026,19 +8214,24 @@@
                         bio_put(bio);
                         if (submit_len > 0)
                                 refcount_dec(&dip->refs);
- -                      goto out_err;
+ +                      goto out_err_em;
                 }
   
                 dio_data->submitted += clone_len;
                 clone_offset += clone_len;
                 start_sector += clone_len >> 9;
                 file_offset += clone_len;
+ +
+ +              free_extent_map(em);
         } while (submit_len > 0);
         return BLK_QC_T_NONE;
   
+ +out_err_em:
+ +      free_extent_map(em);
   out_err:
         dip->dio_bio->bi_status = status;
         btrfs_dio_private_put(dip);
+ +
         return BLK_QC_T_NONE;
   }
   
@@@ -8310,7 -8120,7 +8313,7 @@@ static int __btrfs_releasepage(struct p
   {
         int ret = try_release_extent_mapping(page, gfp_flags);
         if (ret == 1)
- -              detach_page_private(page);
+ +              clear_page_extent_mapped(page);
         return ret;
   }
   
@@@ -8379,9 -8189,8 +8382,9 @@@ static void btrfs_invalidatepage(struc
   
         if (!inode_evicting)
                 lock_extent_bits(tree, page_start, page_end, &cached_state);
- -again:
+ +
         start = page_start;
+ +again:
         ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
         if (ordered) {
                 found_ordered = true;
@@@ -8470,7 -8279,7 +8473,7 @@@
         }
   
         ClearPageChecked(page);
- -      detach_page_private(page);
+ +      clear_page_extent_mapped(page);
   }
   
   /*
@@@ -8549,12 -8358,7 +8552,12 @@@ again
         wait_on_page_writeback(page);
   
         lock_extent_bits(io_tree, page_start, page_end, &cached_state);
- -      set_page_extent_mapped(page);
+ +      ret2 = set_page_extent_mapped(page);
+ +      if (ret2 < 0) {
+ +              ret = vmf_error(ret2);
+ +              unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ +              goto out_unlock;
+ +      }
   
         /*
          * we can't set the delalloc bits if there are pending ordered
@@@ -8791,18 -8595,15 +8794,18 @@@ out
    */
   int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                              struct btrfs_root *new_root,
- -                           struct btrfs_root *parent_root,
- -                           u64 new_dirid)
+ +                           struct btrfs_root *parent_root)
   {
         struct inode *inode;
         int err;
         u64 index = 0;
+ +      u64 ino;
+ +
+ +      err = btrfs_get_free_objectid(new_root, &ino);
+ +      if (err < 0)
+ +              return err;
   
- -      inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
- -                              new_dirid, new_dirid,
+ +      inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino,
                                 S_IFDIR | (~current_umask() & S_IRWXUGO),
                                 &index);
         if (IS_ERR(inode))
@@@ -9017,7 -8818,8 +9020,8 @@@ fail
         return -ENOMEM;
   }
   
- static int btrfs_getattr(const struct path *path, struct kstat *stat,
+ static int btrfs_getattr(struct user_namespace *mnt_userns,
+                        const struct path *path, struct kstat *stat,
                          u32 request_mask, unsigned int flags)
   {
         u64 delalloc_bytes;
@@@ -9043,7 -8845,7 +9047,7 @@@
                                   STATX_ATTR_IMMUTABLE |
                                   STATX_ATTR_NODUMP);
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         stat->dev = BTRFS_I(inode)->root->anon_dev;
   
         spin_lock(&BTRFS_I(inode)->lock);
@@@ -9281,7 -9083,7 +9285,7 @@@ static int btrfs_whiteout_for_rename(st
         u64 objectid;
         u64 index;
   
- -      ret = btrfs_find_free_objectid(root, &objectid);
+ +      ret = btrfs_get_free_objectid(root, &objectid);
         if (ret)
                 return ret;
   
@@@ -9534,9 -9336,9 +9538,9 @@@ out_notrans
         return ret;
   }
   
- static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
-                        struct inode *new_dir, struct dentry *new_dentry,
-                        unsigned int flags)
+ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
+                        struct dentry *old_dentry, struct inode *new_dir,
+                        struct dentry *new_dentry, unsigned int flags)
   {
         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                 return -EINVAL;
@@@ -9688,11 -9490,11 +9692,11 @@@ int btrfs_start_delalloc_snapshot(struc
         return start_delalloc_inodes(root, &wbc, true, false);
   }
   
- -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+ +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
                                bool in_reclaim_context)
   {
         struct writeback_control wbc = {
- -              .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
+ +              .nr_to_write = nr,
                 .sync_mode = WB_SYNC_NONE,
                 .range_start = 0,
                 .range_end = LLONG_MAX,
@@@ -9709,12 -9511,12 +9713,12 @@@
         mutex_lock(&fs_info->delalloc_root_mutex);
         spin_lock(&fs_info->delalloc_root_lock);
         list_splice_init(&fs_info->delalloc_roots, &splice);
- -      while (!list_empty(&splice) && nr) {
+ +      while (!list_empty(&splice)) {
                 /*
                  * Reset nr_to_write here so we know that we're doing a full
                  * flush.
                  */
- -              if (nr == U64_MAX)
+ +              if (nr == LONG_MAX)
                         wbc.nr_to_write = LONG_MAX;
   
                 root = list_first_entry(&splice, struct btrfs_root,
@@@ -9744,8 -9546,8 +9748,8 @@@ out
         return ret;
   }
   
- static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
-                        const char *symname)
+ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+                        struct dentry *dentry, const char *symname)
   {
         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
         struct btrfs_trans_handle *trans;
@@@ -9777,7 -9579,7 +9781,7 @@@
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
   
- -      err = btrfs_find_free_objectid(root, &objectid);
+ +      err = btrfs_get_free_objectid(root, &objectid);
         if (err)
                 goto out_unlock;
   
@@@ -10079,7 -9881,8 +10083,8 @@@ static int btrfs_set_page_dirty(struct 
         return __set_page_dirty_nobuffers(page);
   }
   
- static int btrfs_permission(struct inode *inode, int mask)
+ static int btrfs_permission(struct user_namespace *mnt_userns,
+                           struct inode *inode, int mask)
   {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         umode_t mode = inode->i_mode;
@@@ -10091,10 -9894,11 +10096,11 @@@
                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
                         return -EACCES;
         }
-       return generic_permission(inode, mask);
+       return generic_permission(&init_user_ns, inode, mask);
   }
   
- static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+                        struct dentry *dentry, umode_t mode)
   {
         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
         struct btrfs_trans_handle *trans;
@@@ -10111,7 -9915,7 +10117,7 @@@
         if (IS_ERR(trans))
                 return PTR_ERR(trans);
   
- -      ret = btrfs_find_free_objectid(root, &objectid);
+ +      ret = btrfs_get_free_objectid(root, &objectid);
         if (ret)
                 goto out;
   
diff --combined fs/btrfs/ioctl.c

index a8c60d4,56f53d6..072e777
--- 1/fs/btrfs/ioctl.c
--- 2/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@@ -213,7 -213,7 +213,7 @@@ static int btrfs_ioctl_setflags(struct 
         const char *comp = NULL;
         u32 binode_flags;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EPERM;
   
         if (btrfs_root_readonly(root))
@@@ -429,7 -429,7 +429,7 @@@ static int btrfs_ioctl_fssetxattr(struc
         unsigned old_i_flags;
         int ret = 0;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EPERM;
   
         if (btrfs_root_readonly(root))
@@@ -528,14 -528,6 +528,14 @@@ static noinline int btrfs_ioctl_fitrim(
                 return -EPERM;
   
         /*
+ +       * btrfs_trim_block_group() depends on the space cache, which is not
+ +       * available on a zoned filesystem. So, disallow fitrim on a zoned
+ +       * filesystem for now.
+ +       */
+ +      if (btrfs_is_zoned(fs_info))
+ +              return -EOPNOTSUPP;
+ +
+ +      /*
          * If the fs is mounted with nologreplay, which requires it to be
          * mounted in RO mode as well, we can not allow discard on free space
          * inside block groups, because log trees refer to extents that are not
@@@ -614,13 -606,14 +614,13 @@@ static noinline int create_subvol(struc
         int err;
         dev_t anon_dev = 0;
         u64 objectid;
- -      u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
         u64 index = 0;
   
         root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
         if (!root_item)
                 return -ENOMEM;
   
- -      ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
+ +      ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
         if (ret)
                 goto fail_free;
   
@@@ -700,7 -693,7 +700,7 @@@
         free_extent_buffer(leaf);
         leaf = NULL;
   
- -      btrfs_set_root_dirid(root_item, new_dirid);
+ +      btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
   
         key.objectid = objectid;
         key.offset = 0;
@@@ -723,7 -716,7 +723,7 @@@
   
         btrfs_record_root_in_trans(trans, new_root);
   
- -      ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
+ +      ret = btrfs_create_subvol_root(trans, new_root, root);
         btrfs_put_root(new_root);
         if (ret) {
                 /* We potentially lose an unused inode item here */
@@@ -731,6 -724,10 +731,6 @@@
                 goto fail;
         }
   
- -      mutex_lock(&new_root->objectid_mutex);
- -      new_root->highest_objectid = new_dirid;
- -      mutex_unlock(&new_root->objectid_mutex);
- -
         /*
          * insert the directory item
          */
@@@ -925,13 -922,14 +925,14 @@@ static int btrfs_may_delete(struct inod
         BUG_ON(d_inode(victim->d_parent) != dir);
         audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
   
-       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
         if (error)
                 return error;
         if (IS_APPEND(dir))
                 return -EPERM;
-       if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
-           IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
+       if (check_sticky(&init_user_ns, dir, d_inode(victim)) ||
+           IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
+           IS_SWAPFILE(d_inode(victim)))
                 return -EPERM;
         if (isdir) {
                 if (!d_is_dir(victim))
@@@ -954,7 -952,7 +955,7 @@@ static inline int btrfs_may_create(stru
                 return -EEXIST;
         if (IS_DEADDIR(dir))
                 return -ENOENT;
-       return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
   }
   
   /*
@@@ -1322,13 -1320,6 +1323,13 @@@ again
                 if (!page)
                         break;
   
+ +              ret = set_page_extent_mapped(page);
+ +              if (ret < 0) {
+ +                      unlock_page(page);
+ +                      put_page(page);
+ +                      break;
+ +              }
+ +
                 page_start = page_offset(page);
                 page_end = page_start + PAGE_SIZE - 1;
                 while (1) {
@@@ -1450,6 -1441,7 +1451,6 @@@
         for (i = 0; i < i_done; i++) {
                 clear_page_dirty_for_io(pages[i]);
                 ClearPageChecked(pages[i]);
- -              set_page_extent_mapped(pages[i]);
                 set_page_dirty(pages[i]);
                 unlock_page(pages[i]);
                 put_page(pages[i]);
@@@ -1871,7 -1863,7 +1872,7 @@@ static noinline int __btrfs_ioctl_snap_
                         btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
                                    "Snapshot src from another FS");
                         ret = -EXDEV;
-               } else if (!inode_owner_or_capable(src_inode)) {
+               } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) {
                         /*
                          * Subvolume creation is not restricted, but snapshots
                          * are limited to own subvolumes only
@@@ -1991,7 -1983,7 +1992,7 @@@ static noinline int btrfs_ioctl_subvol_
         u64 flags;
         int ret = 0;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EPERM;
   
         ret = mnt_want_write_file(file);
@@@ -2547,7 -2539,8 +2548,8 @@@ static int btrfs_search_path_in_tree_us
                                 ret = PTR_ERR(temp_inode);
                                 goto out_put;
                         }
-                       ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
+                       ret = inode_permission(&init_user_ns, temp_inode,
+                                              MAY_READ | MAY_EXEC);
                         iput(temp_inode);
                         if (ret) {
                                 ret = -EACCES;
@@@ -3077,7 -3070,8 +3079,8 @@@ static noinline int btrfs_ioctl_snap_de
                 if (root == dest)
                         goto out_dput;
   
-               err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
+               err = inode_permission(&init_user_ns, inode,
+                                      MAY_WRITE | MAY_EXEC);
                 if (err)
                         goto out_dput;
         }
@@@ -3148,7 -3142,7 +3151,7 @@@ static int btrfs_ioctl_defrag(struct fi
                  * running and allows defrag on files open in read-only mode.
                  */
                 if (!capable(CAP_SYS_ADMIN) &&
-                   inode_permission(inode, MAY_WRITE)) {
+                   inode_permission(&init_user_ns, inode, MAY_WRITE)) {
                         ret = -EPERM;
                         goto out;
                 }
@@@ -4460,7 -4454,7 +4463,7 @@@ static long _btrfs_ioctl_set_received_s
         int ret = 0;
         int received_uuid_changed;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EPERM;
   
         ret = mnt_want_write_file(file);
@@@ -4960,7 -4954,7 +4963,7 @@@ long btrfs_ioctl(struct file *file, uns
         case BTRFS_IOC_SYNC: {
                 int ret;
   
- -              ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
+ +              ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
                 if (ret)
                         return ret;
                 ret = btrfs_sync_fs(inode->i_sb, 1);
diff --combined fs/ceph/inode.c

index 5d20a62,d6ece56..156f849
--- 1/fs/ceph/inode.c
--- 2/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@@ -1816,17 -1816,60 +1816,17 @@@ void ceph_async_iput(struct inode *inod
         }
   }
   
- -/*
- - * Write back inode data in a worker thread.  (This can't be done
- - * in the message handler context.)
- - */
- -void ceph_queue_writeback(struct inode *inode)
- -{
- -      struct ceph_inode_info *ci = ceph_inode(inode);
- -      set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask);
- -
- -      ihold(inode);
- -      if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- -                     &ci->i_work)) {
- -              dout("ceph_queue_writeback %p\n", inode);
- -      } else {
- -              dout("ceph_queue_writeback %p already queued, mask=%lx\n",
- -                   inode, ci->i_work_mask);
- -              iput(inode);
- -      }
- -}
- -
- -/*
- - * queue an async invalidation
- - */
- -void ceph_queue_invalidate(struct inode *inode)
- -{
- -      struct ceph_inode_info *ci = ceph_inode(inode);
- -      set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask);
- -
- -      ihold(inode);
- -      if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- -                     &ceph_inode(inode)->i_work)) {
- -              dout("ceph_queue_invalidate %p\n", inode);
- -      } else {
- -              dout("ceph_queue_invalidate %p already queued, mask=%lx\n",
- -                   inode, ci->i_work_mask);
- -              iput(inode);
- -      }
- -}
- -
- -/*
- - * Queue an async vmtruncate.  If we fail to queue work, we will handle
- - * the truncation the next time we call __ceph_do_pending_vmtruncate.
- - */
- -void ceph_queue_vmtruncate(struct inode *inode)
+ +void ceph_queue_inode_work(struct inode *inode, int work_bit)
   {
+ +      struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         struct ceph_inode_info *ci = ceph_inode(inode);
- -      set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask);
+ +      set_bit(work_bit, &ci->i_work_mask);
   
         ihold(inode);
- -      if (queue_work(ceph_inode_to_client(inode)->inode_wq,
- -                     &ci->i_work)) {
- -              dout("ceph_queue_vmtruncate %p\n", inode);
+ +      if (queue_work(fsc->inode_wq, &ci->i_work)) {
+ +              dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
         } else {
- -              dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n",
+ +              dout("queue_inode_work %p already queued, mask=%lx\n",
                      inode, ci->i_work_mask);
                 iput(inode);
         }
@@@ -1965,12 -2008,6 +1965,12 @@@ static void ceph_inode_work(struct work
         if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
                 __ceph_do_pending_vmtruncate(inode);
   
+ +      if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
+ +              ceph_check_caps(ci, 0, NULL);
+ +
+ +      if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
+ +              ceph_flush_snaps(ci, NULL);
+ +
         iput(inode);
   }
   
@@@ -2201,7 -2238,8 +2201,8 @@@ int __ceph_setattr(struct inode *inode
   /*
    * setattr
    */
- int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@@ -2210,7 -2248,7 +2211,7 @@@
         if (ceph_snap(inode) != CEPH_NOSNAP)
                 return -EROFS;
   
-       err = setattr_prepare(dentry, attr);
+       err = setattr_prepare(&init_user_ns, dentry, attr);
         if (err != 0)
                 return err;
   
@@@ -2225,7 -2263,7 +2226,7 @@@
         err = __ceph_setattr(inode, attr);
   
         if (err >= 0 && (attr->ia_valid & ATTR_MODE))
-               err = posix_acl_chmod(inode, attr->ia_mode);
+               err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
   
         return err;
   }
@@@ -2284,7 -2322,8 +2285,8 @@@ int __ceph_do_getattr(struct inode *ino
    * Check inode permissions.  We verify we have a valid value for
    * the AUTH cap, then call the generic handler.
    */
- int ceph_permission(struct inode *inode, int mask)
+ int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode,
+                   int mask)
   {
         int err;
   
@@@ -2294,7 -2333,7 +2296,7 @@@
         err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
   
         if (!err)
-               err = generic_permission(inode, mask);
+               err = generic_permission(&init_user_ns, inode, mask);
         return err;
   }
   
@@@ -2331,8 -2370,8 +2333,8 @@@ static int statx_to_caps(u32 want, umod
    * Get all the attributes. If we have sufficient caps for the requested attrs,
    * then we can avoid talking to the MDS at all.
    */
- int ceph_getattr(const struct path *path, struct kstat *stat,
-                u32 request_mask, unsigned int flags)
+ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                struct kstat *stat, u32 request_mask, unsigned int flags)
   {
         struct inode *inode = d_inode(path->dentry);
         struct ceph_inode_info *ci = ceph_inode(inode);
@@@ -2348,7 -2387,7 +2350,7 @@@
                         return err;
         }
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         stat->ino = ceph_present_inode(inode);
   
         /*
diff --combined fs/ceph/super.h

index 13b0288,1ef0a2a..c48bb30
--- 1/fs/ceph/super.h
--- 2/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@@ -562,11 -562,9 +562,11 @@@ static inline struct inode *ceph_find_i
   /*
    * Masks of ceph inode work.
    */
- -#define CEPH_I_WORK_WRITEBACK         0 /* writeback */
- -#define CEPH_I_WORK_INVALIDATE_PAGES  1 /* invalidate pages */
- -#define CEPH_I_WORK_VMTRUNCATE                2 /* vmtruncate */
+ +#define CEPH_I_WORK_WRITEBACK         0
+ +#define CEPH_I_WORK_INVALIDATE_PAGES  1
+ +#define CEPH_I_WORK_VMTRUNCATE                2
+ +#define CEPH_I_WORK_CHECK_CAPS                3
+ +#define CEPH_I_WORK_FLUSH_SNAPS               4
   
   /*
    * We set the ERROR_WRITE bit when we start seeing write errors on an inode
@@@ -964,46 -962,24 +964,49 @@@ extern int ceph_inode_holds_cap(struct 
   
   extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
   extern void __ceph_do_pending_vmtruncate(struct inode *inode);
- -extern void ceph_queue_vmtruncate(struct inode *inode);
- -extern void ceph_queue_invalidate(struct inode *inode);
- -extern void ceph_queue_writeback(struct inode *inode);
+ +
   extern void ceph_async_iput(struct inode *inode);
   
+ +void ceph_queue_inode_work(struct inode *inode, int work_bit);
+ +
+ +static inline void ceph_queue_vmtruncate(struct inode *inode)
+ +{
+ +      ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
+ +}
+ +
+ +static inline void ceph_queue_invalidate(struct inode *inode)
+ +{
+ +      ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
+ +}
+ +
+ +static inline void ceph_queue_writeback(struct inode *inode)
+ +{
+ +      ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
+ +}
+ +
+ +static inline void ceph_queue_check_caps(struct inode *inode)
+ +{
+ +      ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS);
+ +}
+ +
+ +static inline void ceph_queue_flush_snaps(struct inode *inode)
+ +{
+ +      ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS);
+ +}
+ +
   extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
                              int mask, bool force);
   static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
   {
         return __ceph_do_getattr(inode, NULL, mask, force);
   }
- extern int ceph_permission(struct inode *inode, int mask);
+ extern int ceph_permission(struct user_namespace *mnt_userns,
+                          struct inode *inode, int mask);
   extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
- extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
- extern int ceph_getattr(const struct path *path, struct kstat *stat,
+ extern int ceph_setattr(struct user_namespace *mnt_userns,
+                       struct dentry *dentry, struct iattr *attr);
+ extern int ceph_getattr(struct user_namespace *mnt_userns,
+                       const struct path *path, struct kstat *stat,
                         u32 request_mask, unsigned int flags);
   
   /* xattr.c */
@@@ -1064,7 -1040,8 +1067,8 @@@ void ceph_release_acl_sec_ctx(struct ce
   #ifdef CONFIG_CEPH_FS_POSIX_ACL
   
   struct posix_acl *ceph_get_acl(struct inode *, int);
- int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+ int ceph_set_acl(struct user_namespace *mnt_userns,
+                struct inode *inode, struct posix_acl *acl, int type);
   int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
                        struct ceph_acl_sec_ctx *as_ctx);
   void ceph_init_inode_acls(struct inode *inode,
@@@ -1132,7 -1109,6 +1136,7 @@@ extern void ceph_take_cap_refs(struct c
                                 bool snap_rwsem_locked);
   extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
   extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+ +extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
   extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
                                             int had);
   extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
diff --combined fs/cifs/cifsfs.c

index ab883e8,39e51dc..38534e0
--- 1/fs/cifs/cifsfs.c
--- 2/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@@ -305,7 -305,8 +305,8 @@@ static long cifs_fallocate(struct file 
         return -EOPNOTSUPP;
   }
   
- static int cifs_permission(struct inode *inode, int mask)
+ static int cifs_permission(struct user_namespace *mnt_userns,
+                          struct inode *inode, int mask)
   {
         struct cifs_sb_info *cifs_sb;
   
@@@ -320,7 -321,7 +321,7 @@@
                 on the client (above and beyond ACL on servers) for
                 servers which do not support setting and viewing mode bits,
                 so allowing client to check permissions is useful */
-               return generic_permission(inode, mask);
+               return generic_permission(&init_user_ns, inode, mask);
   }
   
   static struct kmem_cache *cifs_inode_cachep;
@@@ -469,7 -470,7 +470,7 @@@ cifs_show_cache_flavor(struct seq_file 
   static int cifs_show_devname(struct seq_file *m, struct dentry *root)
   {
         struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
- -      char *devname = kstrdup(cifs_sb->ctx->UNC, GFP_KERNEL);
+ +      char *devname = kstrdup(cifs_sb->ctx->source, GFP_KERNEL);
   
         if (devname == NULL)
                 seq_puts(m, "none");
@@@ -822,7 -823,7 +823,7 @@@ cifs_smb3_do_mount(struct file_system_t
                 goto out;
         }
   
- -      rc = cifs_setup_volume_info(cifs_sb->ctx);
+ +      rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC);
         if (rc) {
                 root = ERR_PTR(rc);
                 goto out;
diff --combined fs/cifs/dir.c

index 97ac363,68f4f85..a3fb81e
--- 1/fs/cifs/dir.c
--- 2/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@@ -567,8 -567,8 +567,8 @@@ out_free_xid
         return rc;
   }
   
- int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
-               bool excl)
+ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode,
+               struct dentry *direntry, umode_t mode, bool excl)
   {
         int rc;
         unsigned int xid = get_xid();
@@@ -611,8 -611,8 +611,8 @@@ out_free_xid
         return rc;
   }
   
- int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
-               dev_t device_number)
+ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode,
+              struct dentry *direntry, umode_t mode, dev_t device_number)
   {
         int rc = -EPERM;
         unsigned int xid;
@@@ -737,7 -737,6 +737,7 @@@ static in
   cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
   {
         struct inode *inode;
+ +      int rc;
   
         if (flags & LOOKUP_RCU)
                 return -ECHILD;
@@@ -747,25 -746,8 +747,25 @@@
                 if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode)))
                         CIFS_I(inode)->time = 0; /* force reval */
   
- -              if (cifs_revalidate_dentry(direntry))
- -                      return 0;
+ +              rc = cifs_revalidate_dentry(direntry);
+ +              if (rc) {
+ +                      cifs_dbg(FYI, "cifs_revalidate_dentry failed with rc=%d", rc);
+ +                      switch (rc) {
+ +                      case -ENOENT:
+ +                      case -ESTALE:
+ +                              /*
+ +                               * Those errors mean the dentry is invalid
+ +                               * (file was deleted or recreated)
+ +                               */
+ +                              return 0;
+ +                      default:
+ +                              /*
+ +                               * Otherwise some unexpected error happened;
+ +                               * report it as-is to the VFS layer.
+ +                               */
+ +                              return rc;
+ +                      }
+ +              }
                 else {
                         /*
                          * If the inode wasn't known to be a dfs entry when
diff --combined fs/ecryptfs/inode.c

index 58d0f71,55da9a9..18e9285
--- 1/fs/ecryptfs/inode.c
--- 2/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@@ -141,7 -141,8 +141,8 @@@ static int ecryptfs_do_unlink(struct in
         else if (d_unhashed(lower_dentry))
                 rc = -EINVAL;
         else
-               rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+               rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry,
+                               NULL);
         if (rc) {
                 printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
                 goto out_unlock;
@@@ -180,7 -181,8 +181,8 @@@ ecryptfs_do_create(struct inode *direct
   
         lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
         lower_dir_dentry = lock_parent(lower_dentry);
-       rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
+       rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+                       mode, true);
         if (rc) {
                 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
                        "rc = [%d]\n", __func__, rc);
@@@ -190,7 -192,8 +192,8 @@@
         inode = __ecryptfs_get_inode(d_inode(lower_dentry),
                                      directory_inode->i_sb);
         if (IS_ERR(inode)) {
-               vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
+               vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry),
+                          lower_dentry, NULL);
                 goto out_lock;
         }
         fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
@@@ -254,7 -257,8 +257,8 @@@ out
    * Returns zero on success; non-zero on error condition
    */
   static int
- ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
+ ecryptfs_create(struct user_namespace *mnt_userns,
+               struct inode *directory_inode, struct dentry *ecryptfs_dentry,
                 umode_t mode, bool excl)
   {
         struct inode *ecryptfs_inode;
@@@ -436,8 -440,8 +440,8 @@@ static int ecryptfs_link(struct dentry 
         dget(lower_old_dentry);
         dget(lower_new_dentry);
         lower_dir_dentry = lock_parent(lower_new_dentry);
-       rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
-                     lower_new_dentry, NULL);
+       rc = vfs_link(lower_old_dentry, &init_user_ns,
+                     d_inode(lower_dir_dentry), lower_new_dentry, NULL);
         if (rc || d_really_is_negative(lower_new_dentry))
                 goto out_lock;
         rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
@@@ -460,7 -464,8 +464,8 @@@ static int ecryptfs_unlink(struct inod
         return ecryptfs_do_unlink(dir, dentry, d_inode(dentry));
   }
   
- static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
+ static int ecryptfs_symlink(struct user_namespace *mnt_userns,
+                           struct inode *dir, struct dentry *dentry,
                             const char *symname)
   {
         int rc;
@@@ -481,7 -486,7 +486,7 @@@
                                                   strlen(symname));
         if (rc)
                 goto out_lock;
-       rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry,
+       rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
                          encoded_symname);
         kfree(encoded_symname);
         if (rc || d_really_is_negative(lower_dentry))
@@@ -499,7 -504,8 +504,8 @@@ out_lock
         return rc;
   }
   
- static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+                         struct dentry *dentry, umode_t mode)
   {
         int rc;
         struct dentry *lower_dentry;
@@@ -507,7 -513,8 +513,8 @@@
   
         lower_dentry = ecryptfs_dentry_to_lower(dentry);
         lower_dir_dentry = lock_parent(lower_dentry);
-       rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
+       rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+                      mode);
         if (rc || d_really_is_negative(lower_dentry))
                 goto out;
         rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@@ -541,7 -548,7 +548,7 @@@ static int ecryptfs_rmdir(struct inode 
         else if (d_unhashed(lower_dentry))
                 rc = -EINVAL;
         else
-               rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+               rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry);
         if (!rc) {
                 clear_nlink(d_inode(dentry));
                 fsstack_copy_attr_times(dir, lower_dir_inode);
@@@ -555,7 -562,8 +562,8 @@@
   }
   
   static int
- ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+              struct dentry *dentry, umode_t mode, dev_t dev)
   {
         int rc;
         struct dentry *lower_dentry;
@@@ -563,7 -571,8 +571,8 @@@
   
         lower_dentry = ecryptfs_dentry_to_lower(dentry);
         lower_dir_dentry = lock_parent(lower_dentry);
-       rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
+       rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry,
+                      mode, dev);
         if (rc || d_really_is_negative(lower_dentry))
                 goto out;
         rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@@ -579,9 -588,9 +588,9 @@@ out
   }
   
   static int
- ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-               struct inode *new_dir, struct dentry *new_dentry,
-               unsigned int flags)
+ ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+               struct dentry *old_dentry, struct inode *new_dir,
+               struct dentry *new_dentry, unsigned int flags)
   {
         int rc;
         struct dentry *lower_old_dentry;
@@@ -590,6 -599,7 +599,7 @@@
         struct dentry *lower_new_dir_dentry;
         struct dentry *trap;
         struct inode *target_inode;
+       struct renamedata rd = {};
   
         if (flags)
                 return -EINVAL;
@@@ -619,9 -629,14 +629,14 @@@
                 rc = -ENOTEMPTY;
                 goto out_lock;
         }
-       rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
-                       d_inode(lower_new_dir_dentry), lower_new_dentry,
-                       NULL, 0);
+ 
+       rd.old_mnt_userns       = &init_user_ns;
+       rd.old_dir              = d_inode(lower_old_dir_dentry);
+       rd.old_dentry           = lower_old_dentry;
+       rd.new_mnt_userns       = &init_user_ns;
+       rd.new_dir              = d_inode(lower_new_dir_dentry);
+       rd.new_dentry           = lower_new_dentry;
+       rc = vfs_rename(&rd);
         if (rc)
                 goto out_lock;
         if (target_inode)
@@@ -855,16 -870,19 +870,19 @@@ int ecryptfs_truncate(struct dentry *de
                 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
   
                 inode_lock(d_inode(lower_dentry));
-               rc = notify_change(lower_dentry, &lower_ia, NULL);
+               rc = notify_change(&init_user_ns, lower_dentry,
+                                  &lower_ia, NULL);
                 inode_unlock(d_inode(lower_dentry));
         }
         return rc;
   }
   
   static int
- ecryptfs_permission(struct inode *inode, int mask)
+ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+                   int mask)
   {
-       return inode_permission(ecryptfs_inode_to_lower(inode), mask);
+       return inode_permission(&init_user_ns,
+                               ecryptfs_inode_to_lower(inode), mask);
   }
   
   /**
@@@ -879,7 -897,8 +897,8 @@@
    * All other metadata changes will be passed right to the lower filesystem,
    * and we will just update our inode to look like the lower.
    */
- static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
+ static int ecryptfs_setattr(struct user_namespace *mnt_userns,
+                           struct dentry *dentry, struct iattr *ia)
   {
         int rc = 0;
         struct dentry *lower_dentry;
@@@ -933,7 -952,7 +952,7 @@@
         }
         mutex_unlock(&crypt_stat->cs_mutex);
   
-       rc = setattr_prepare(dentry, ia);
+       rc = setattr_prepare(&init_user_ns, dentry, ia);
         if (rc)
                 goto out;
         if (ia->ia_valid & ATTR_SIZE) {
@@@ -959,14 -978,15 +978,15 @@@
                 lower_ia.ia_valid &= ~ATTR_MODE;
   
         inode_lock(d_inode(lower_dentry));
-       rc = notify_change(lower_dentry, &lower_ia, NULL);
+       rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL);
         inode_unlock(d_inode(lower_dentry));
   out:
         fsstack_copy_attr_all(inode, lower_inode);
         return rc;
   }
   
- static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat,
+ static int ecryptfs_getattr_link(struct user_namespace *mnt_userns,
+                                const struct path *path, struct kstat *stat,
                                  u32 request_mask, unsigned int flags)
   {
         struct dentry *dentry = path->dentry;
@@@ -975,7 -995,7 +995,7 @@@
   
         mount_crypt_stat = &ecryptfs_superblock_to_private(
                                                 dentry->d_sb)->mount_crypt_stat;
-       generic_fillattr(d_inode(dentry), stat);
+       generic_fillattr(&init_user_ns, d_inode(dentry), stat);
         if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
                 char *target;
                 size_t targetsiz;
@@@ -991,7 -1011,8 +1011,8 @@@
         return rc;
   }
   
- static int ecryptfs_getattr(const struct path *path, struct kstat *stat,
+ static int ecryptfs_getattr(struct user_namespace *mnt_userns,
+                           const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int flags)
   {
         struct dentry *dentry = path->dentry;
@@@ -1003,7 -1024,7 +1024,7 @@@
         if (!rc) {
                 fsstack_copy_attr_all(d_inode(dentry),
                                       ecryptfs_inode_to_lower(d_inode(dentry)));
-               generic_fillattr(d_inode(dentry), stat);
+               generic_fillattr(&init_user_ns, d_inode(dentry), stat);
                 stat->blocks = lower_stat.blocks;
         }
         return rc;
@@@ -1016,19 -1037,16 +1037,19 @@@ ecryptfs_setxattr(struct dentry *dentry
   {
         int rc;
         struct dentry *lower_dentry;
+ +      struct inode *lower_inode;
   
         lower_dentry = ecryptfs_dentry_to_lower(dentry);
- -      if (!(d_inode(lower_dentry)->i_opflags & IOP_XATTR)) {
+ +      lower_inode = d_inode(lower_dentry);
+ +      if (!(lower_inode->i_opflags & IOP_XATTR)) {
                 rc = -EOPNOTSUPP;
                 goto out;
         }
- -      rc = vfs_setxattr(&init_user_ns, lower_dentry, name, value, size,
- -                        flags);
+ +      inode_lock(lower_inode);
-       rc = __vfs_setxattr_locked(lower_dentry, name, value, size, flags, NULL);
++      rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL);
+ +      inode_unlock(lower_inode);
         if (!rc && inode)
- -              fsstack_copy_attr_all(inode, d_inode(lower_dentry));
+ +              fsstack_copy_attr_all(inode, lower_inode);
   out:
         return rc;
   }
@@@ -1091,7 -1109,7 +1112,7 @@@ static int ecryptfs_removexattr(struct 
                 goto out;
         }
         inode_lock(lower_inode);
-       rc = __vfs_removexattr(lower_dentry, name);
+       rc = __vfs_removexattr(&init_user_ns, lower_dentry, name);
         inode_unlock(lower_inode);
   out:
         return rc;
@@@ -1135,6 -1153,7 +1156,7 @@@ static int ecryptfs_xattr_get(const str
   }
   
   static int ecryptfs_xattr_set(const struct xattr_handler *handler,
+                             struct user_namespace *mnt_userns,
                               struct dentry *dentry, struct inode *inode,
                               const char *name, const void *value, size_t size,
                               int flags)
diff --combined fs/exec.c

index 5a853f0,48d1e8b..6f3c020
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -708,7 -708,7 +708,7 @@@ static int shift_arg_pages(struct vm_ar
                 return -ENOMEM;
   
         lru_add_drain();
- -      tlb_gather_mmu(&tlb, mm, old_start, old_end);
+ +      tlb_gather_mmu(&tlb, mm);
         if (new_end > old_start) {
                 /*
                  * when the old and new regions overlap clear from new_end.
@@@ -725,7 -725,7 +725,7 @@@
                 free_pgd_range(&tlb, old_start, old_end, new_end,
                         vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
         }
- -      tlb_finish_mmu(&tlb, old_start, old_end);
+ +      tlb_finish_mmu(&tlb);
   
         /*
          * Shrink the vma to just the new range.  Always succeeds.
@@@ -1404,14 -1404,15 +1404,15 @@@ EXPORT_SYMBOL(begin_new_exec)
   void would_dump(struct linux_binprm *bprm, struct file *file)
   {
         struct inode *inode = file_inode(file);
-       if (inode_permission(inode, MAY_READ) < 0) {
+       struct user_namespace *mnt_userns = file_mnt_user_ns(file);
+       if (inode_permission(mnt_userns, inode, MAY_READ) < 0) {
                 struct user_namespace *old, *user_ns;
                 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
   
                 /* Ensure mm->user_ns contains the executable */
                 user_ns = old = bprm->mm->user_ns;
                 while ((user_ns != &init_user_ns) &&
-                      !privileged_wrt_inode_uidgid(user_ns, inode))
+                      !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode))
                         user_ns = user_ns->parent;
   
                 if (old != user_ns) {
@@@ -1579,6 -1580,7 +1580,7 @@@ static void check_unsafe_exec(struct li
   static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
   {
         /* Handle suid and sgid on files */
+       struct user_namespace *mnt_userns;
         struct inode *inode;
         unsigned int mode;
         kuid_t uid;
@@@ -1595,13 -1597,15 +1597,15 @@@
         if (!(mode & (S_ISUID|S_ISGID)))
                 return;
   
+       mnt_userns = file_mnt_user_ns(file);
+ 
         /* Be careful if suid/sgid is set */
         inode_lock(inode);
   
         /* reload atomically mode/uid/gid now that lock held */
         mode = inode->i_mode;
-       uid = inode->i_uid;
-       gid = inode->i_gid;
+       uid = i_uid_into_mnt(mnt_userns, inode);
+       gid = i_gid_into_mnt(mnt_userns, inode);
         inode_unlock(inode);
   
         /* We ignore suid/sgid if there are no mappings for them in the ns */
diff --combined fs/exfat/exfat_fs.h

index 764bc64,d905bb9..fa21421
--- 1/fs/exfat/exfat_fs.h
--- 2/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@@ -408,7 -408,7 +408,7 @@@ int exfat_count_num_clusters(struct sup
   int exfat_load_bitmap(struct super_block *sb);
   void exfat_free_bitmap(struct exfat_sb_info *sbi);
   int exfat_set_bitmap(struct inode *inode, unsigned int clu);
- -void exfat_clear_bitmap(struct inode *inode, unsigned int clu);
+ +void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
   unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu);
   int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
   
@@@ -416,9 -416,11 +416,11 @@@
   extern const struct file_operations exfat_file_operations;
   int __exfat_truncate(struct inode *inode, loff_t new_size);
   void exfat_truncate(struct inode *inode, loff_t size);
- int exfat_setattr(struct dentry *dentry, struct iattr *attr);
- int exfat_getattr(const struct path *path, struct kstat *stat,
-               unsigned int request_mask, unsigned int query_flags);
+ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                 struct iattr *attr);
+ int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                 struct kstat *stat, unsigned int request_mask,
+                 unsigned int query_flags);
   int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
   
   /* namei.c */
diff --combined fs/exfat/file.c

index 183ffdf,3aa6eb4..f783cf3
--- 1/fs/exfat/file.c
--- 2/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@@ -267,13 -267,14 +267,14 @@@ write_size
         mutex_unlock(&sbi->s_lock);
   }
   
- int exfat_getattr(const struct path *path, struct kstat *stat,
-               unsigned int request_mask, unsigned int query_flags)
+ int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                 struct kstat *stat, unsigned int request_mask,
+                 unsigned int query_flags)
   {
         struct inode *inode = d_backing_inode(path->dentry);
         struct exfat_inode_info *ei = EXFAT_I(inode);
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         exfat_truncate_atime(&stat->atime);
         stat->result_mask |= STATX_BTIME;
         stat->btime.tv_sec = ei->i_crtime.tv_sec;
@@@ -282,7 -283,8 +283,8 @@@
         return 0;
   }
   
- int exfat_setattr(struct dentry *dentry, struct iattr *attr)
+ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                 struct iattr *attr)
   {
         struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb);
         struct inode *inode = dentry->d_inode;
@@@ -305,7 -307,7 +307,7 @@@
                                 ATTR_TIMES_SET);
         }
   
-       error = setattr_prepare(dentry, attr);
+       error = setattr_prepare(&init_user_ns, dentry, attr);
         attr->ia_valid = ia_valid;
         if (error)
                 goto out;
@@@ -340,7 -342,7 +342,7 @@@
                 up_write(&EXFAT_I(inode)->truncate_lock);
         }
   
-       setattr_copy(inode, attr);
+       setattr_copy(&init_user_ns, inode, attr);
         exfat_truncate_atime(&inode->i_atime);
         mark_inode_dirty(inode);
   
@@@ -361,7 -363,7 +363,7 @@@ int exfat_file_fsync(struct file *filp
         if (err)
                 return err;
   
- -      return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ +      return blkdev_issue_flush(inode->i_sb->s_bdev);
   }
   
   const struct file_operations exfat_file_operations = {
diff --combined fs/ext4/ialloc.c

index 20f2fcb,bf90289..633ae7b
--- 1/fs/ext4/ialloc.c
--- 2/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@@ -919,7 -919,8 +919,8 @@@ static int ext4_xattr_credits_for_new_i
    * For other inodes, search forward from the parent directory's block
    * group to find a free inode.
    */
- struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+ struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
+                              handle_t *handle, struct inode *dir,
                                umode_t mode, const struct qstr *qstr,
                                __u32 goal, uid_t *owner, __u32 i_flags,
                                int handle_type, unsigned int line_no,
@@@ -969,10 -970,10 +970,10 @@@
                 i_gid_write(inode, owner[1]);
         } else if (test_opt(sb, GRPID)) {
                 inode->i_mode = mode;
-               inode->i_uid = current_fsuid();
+               inode->i_uid = fsuid_into_mnt(mnt_userns);
                 inode->i_gid = dir->i_gid;
         } else
-               inode_init_owner(inode, dir, mode);
+               inode_init_owner(mnt_userns, inode, dir, mode);
   
         if (ext4_has_feature_project(sb) &&
             ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
@@@ -1583,7 -1584,7 +1584,7 @@@ int ext4_init_inode_table(struct super_
         if (ret < 0)
                 goto err_out;
         if (barrier)
- -              blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
+ +              blkdev_issue_flush(sb->s_bdev);
   
   skip_zeroout:
         ext4_lock_group(sb, group);
diff --combined fs/ext4/inode.c

index de79052,8fbf85b..650c5ac
--- 1/fs/ext4/inode.c
--- 2/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@@ -20,6 -20,7 +20,7 @@@
    */
   
   #include <linux/fs.h>
+ #include <linux/mount.h>
   #include <linux/time.h>
   #include <linux/highuid.h>
   #include <linux/pagemap.h>
@@@ -4961,11 -4962,15 +4962,11 @@@ static void __ext4_update_other_inode_t
         if (!inode)
                 return;
   
- -      if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- -                             I_DIRTY_INODE)) ||
- -          ((inode->i_state & I_DIRTY_TIME) == 0))
+ +      if (!inode_is_dirtytime_only(inode))
                 return;
   
         spin_lock(&inode->i_lock);
- -      if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
- -                              I_DIRTY_INODE)) == 0) &&
- -          (inode->i_state & I_DIRTY_TIME)) {
+ +      if (inode_is_dirtytime_only(inode)) {
                 struct ext4_inode_info  *ei = EXT4_I(inode);
   
                 inode->i_state &= ~I_DIRTY_TIME;
@@@ -5315,7 -5320,8 +5316,8 @@@ static void ext4_wait_for_tail_page_com
    *
    * Called with inode->i_mutex down.
    */
- int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         int error, rc = 0;
@@@ -5333,7 -5339,7 +5335,7 @@@
                                   ATTR_GID | ATTR_TIMES_SET))))
                 return -EPERM;
   
-       error = setattr_prepare(dentry, attr);
+       error = setattr_prepare(mnt_userns, dentry, attr);
         if (error)
                 return error;
   
@@@ -5508,7 -5514,7 +5510,7 @@@ out_mmap_sem
         }
   
         if (!error) {
-               setattr_copy(inode, attr);
+               setattr_copy(mnt_userns, inode, attr);
                 mark_inode_dirty(inode);
         }
   
@@@ -5520,7 -5526,7 +5522,7 @@@
                 ext4_orphan_del(NULL, inode);
   
         if (!error && (ia_valid & ATTR_MODE))
-               rc = posix_acl_chmod(inode, inode->i_mode);
+               rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
   
   err_out:
         if  (error)
@@@ -5531,8 -5537,8 +5533,8 @@@
         return error;
   }
   
- int ext4_getattr(const struct path *path, struct kstat *stat,
-                u32 request_mask, unsigned int query_flags)
+ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                struct kstat *stat, u32 request_mask, unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
         struct ext4_inode *raw_inode;
@@@ -5567,17 -5573,18 +5569,18 @@@
                                   STATX_ATTR_NODUMP |
                                   STATX_ATTR_VERITY);
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(mnt_userns, inode, stat);
         return 0;
   }
   
- int ext4_file_getattr(const struct path *path, struct kstat *stat,
+ int ext4_file_getattr(struct user_namespace *mnt_userns,
+                     const struct path *path, struct kstat *stat,
                       u32 request_mask, unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
         u64 delalloc_blocks;
   
-       ext4_getattr(path, stat, request_mask, query_flags);
+       ext4_getattr(mnt_userns, path, stat, request_mask, query_flags);
   
         /*
          * If there is inline data in the inode, the inode will normally not
@@@ -5933,16 -5940,26 +5936,16 @@@ out
    * If the inode is marked synchronous, we don't honour that here - doing
    * so would cause a commit on atime updates, which we don't bother doing.
    * We handle synchronous inodes at the highest possible level.
- - *
- - * If only the I_DIRTY_TIME flag is set, we can skip everything.  If
- - * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
- - * to copy into the on-disk inode structure are the timestamp files.
    */
   void ext4_dirty_inode(struct inode *inode, int flags)
   {
         handle_t *handle;
   
- -      if (flags == I_DIRTY_TIME)
- -              return;
         handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
         if (IS_ERR(handle))
- -              goto out;
- -
+ +              return;
         ext4_mark_inode_dirty(handle, inode);
- -
         ext4_journal_stop(handle);
- -out:
- -      return;
   }
   
   int ext4_change_inode_journal_flag(struct inode *inode, int val)
diff --combined fs/ext4/ioctl.c

index 713b1ae,56ad9c4..a2cf350
--- 1/fs/ext4/ioctl.c
--- 2/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@@ -107,10 -107,12 +107,12 @@@ void ext4_reset_inode_seed(struct inod
    * important fields of the inodes.
    *
    * @sb:         the super block of the filesystem
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode:      the inode to swap with EXT4_BOOT_LOADER_INO
    *
    */
   static long swap_inode_boot_loader(struct super_block *sb,
+                               struct user_namespace *mnt_userns,
                                 struct inode *inode)
   {
         handle_t *handle;
@@@ -139,7 -141,8 +141,8 @@@
         }
   
         if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
-           !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+           !inode_owner_or_capable(mnt_userns, inode) ||
+           !capable(CAP_SYS_ADMIN)) {
                 err = -EPERM;
                 goto journal_err_out;
         }
@@@ -814,6 -817,7 +817,7 @@@ static long __ext4_ioctl(struct file *f
         struct inode *inode = file_inode(filp);
         struct super_block *sb = inode->i_sb;
         struct ext4_inode_info *ei = EXT4_I(inode);
+       struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
         unsigned int flags;
   
         ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
@@@ -829,7 -833,7 +833,7 @@@
         case FS_IOC_SETFLAGS: {
                 int err;
   
-               if (!inode_owner_or_capable(inode))
+               if (!inode_owner_or_capable(mnt_userns, inode))
                         return -EACCES;
   
                 if (get_user(flags, (int __user *) arg))
@@@ -871,7 -875,7 +875,7 @@@
                 __u32 generation;
                 int err;
   
-               if (!inode_owner_or_capable(inode))
+               if (!inode_owner_or_capable(mnt_userns, inode))
                         return -EPERM;
   
                 if (ext4_has_metadata_csum(inode->i_sb)) {
@@@ -1010,7 -1014,7 +1014,7 @@@ mext_out
         case EXT4_IOC_MIGRATE:
         {
                 int err;
-               if (!inode_owner_or_capable(inode))
+               if (!inode_owner_or_capable(mnt_userns, inode))
                         return -EACCES;
   
                 err = mnt_want_write_file(filp);
@@@ -1032,7 -1036,7 +1036,7 @@@
         case EXT4_IOC_ALLOC_DA_BLKS:
         {
                 int err;
-               if (!inode_owner_or_capable(inode))
+               if (!inode_owner_or_capable(mnt_userns, inode))
                         return -EACCES;
   
                 err = mnt_want_write_file(filp);
@@@ -1051,7 -1055,7 +1055,7 @@@
                 err = mnt_want_write_file(filp);
                 if (err)
                         return err;
-               err = swap_inode_boot_loader(sb, inode);
+               err = swap_inode_boot_loader(sb, mnt_userns, inode);
                 mnt_drop_write_file(filp);
                 return err;
         }
@@@ -1217,7 -1221,7 +1221,7 @@@ resizefs_out
   
         case EXT4_IOC_CLEAR_ES_CACHE:
         {
-               if (!inode_owner_or_capable(inode))
+               if (!inode_owner_or_capable(mnt_userns, inode))
                         return -EACCES;
                 ext4_clear_inode_es(inode);
                 return 0;
@@@ -1263,7 -1267,7 +1267,7 @@@
                         return -EFAULT;
   
                 /* Make sure caller has proper permission */
-               if (!inode_owner_or_capable(inode))
+               if (!inode_owner_or_capable(mnt_userns, inode))
                         return -EACCES;
   
                 if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS)
@@@ -1309,12 -1313,6 +1313,12 @@@ out
                         return -EOPNOTSUPP;
                 return fsverity_ioctl_measure(filp, (void __user *)arg);
   
+ +      case FS_IOC_READ_VERITY_METADATA:
+ +              if (!ext4_has_feature_verity(sb))
+ +                      return -EOPNOTSUPP;
+ +              return fsverity_ioctl_read_metadata(filp,
+ +                                                  (const void __user *)arg);
+ +
         default:
                 return -ENOTTY;
         }
@@@ -1397,7 -1395,6 +1401,7 @@@ long ext4_compat_ioctl(struct file *fil
         case FS_IOC_GETFSMAP:
         case FS_IOC_ENABLE_VERITY:
         case FS_IOC_MEASURE_VERITY:
+ +      case FS_IOC_READ_VERITY_METADATA:
         case EXT4_IOC_CLEAR_ES_CACHE:
         case EXT4_IOC_GETSTATE:
         case EXT4_IOC_GET_ES_CACHE:
diff --combined fs/ext4/super.c

index fb59851,a77fbb7..802bd26
--- 1/fs/ext4/super.c
--- 2/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@@ -5709,7 -5709,7 +5709,7 @@@ static int ext4_sync_fs(struct super_bl
                 needs_barrier = true;
         if (needs_barrier) {
                 int err;
- -              err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+ +              err = blkdev_issue_flush(sb->s_bdev);
                 if (!ret)
                         ret = err;
         }
@@@ -6654,7 -6654,7 +6654,7 @@@ static struct file_system_type ext4_fs_
         .name           = "ext4",
         .mount          = ext4_mount,
         .kill_sb        = kill_block_super,
-       .fs_flags       = FS_REQUIRES_DEV,
+       .fs_flags       = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
   };
   MODULE_ALIAS_FS("ext4");
   
diff --combined fs/f2fs/acl.c

index 732ec10,a19e86c..965037a
--- 1/fs/f2fs/acl.c
--- 2/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@@ -200,27 -200,6 +200,27 @@@ struct posix_acl *f2fs_get_acl(struct i
         return __f2fs_get_acl(inode, type, NULL);
   }
   
-       if (!in_group_p(inode->i_gid) &&
-           !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+ +static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
+ +                        struct posix_acl **acl)
+ +{
+ +      umode_t mode = inode->i_mode;
+ +      int error;
+ +
+ +      if (is_inode_flag_set(inode, FI_ACL_MODE))
+ +              mode = F2FS_I(inode)->i_acl_mode;
+ +
+ +      error = posix_acl_equiv_mode(*acl, &mode);
+ +      if (error < 0)
+ +              return error;
+ +      if (error == 0)
+ +              *acl = NULL;
++      if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
++          !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ +              mode &= ~S_ISGID;
+ +      *mode_p = mode;
+ +      return 0;
+ +}
+ +
   static int __f2fs_set_acl(struct inode *inode, int type,
                         struct posix_acl *acl, struct page *ipage)
   {
@@@ -234,7 -213,8 +234,7 @@@
         case ACL_TYPE_ACCESS:
                 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
                 if (acl && !ipage) {
- -                      error = posix_acl_update_mode(&init_user_ns, inode,
- -                                                    &mode, &acl);
+ +                      error = f2fs_acl_update_mode(inode, &mode, &acl);
                         if (error)
                                 return error;
                         set_acl_inode(inode, mode);
@@@ -269,7 -249,8 +269,8 @@@
         return error;
   }
   
- int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+                struct posix_acl *acl, int type)
   {
         if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
                 return -EIO;
diff --combined fs/f2fs/f2fs.h

index 506c801,c9002b1..e2d302a
--- 1/fs/f2fs/f2fs.h
--- 2/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@@ -43,6 -43,7 +43,6 @@@ enum 
         FAULT_KVMALLOC,
         FAULT_PAGE_ALLOC,
         FAULT_PAGE_GET,
- -      FAULT_ALLOC_BIO,
         FAULT_ALLOC_NID,
         FAULT_ORPHAN,
         FAULT_BLOCK,
@@@ -96,7 -97,6 +96,7 @@@ extern const char *f2fs_fault_name[FAUL
   #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
   #define F2FS_MOUNT_NORECOVERY         0x04000000
   #define F2FS_MOUNT_ATGC                       0x08000000
+ +#define F2FS_MOUNT_MERGE_CHECKPOINT   0x10000000
   
   #define F2FS_OPTION(sbi)      ((sbi)->mount_opt)
   #define clear_opt(sbi, option)        (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@@ -146,7 -146,6 +146,7 @@@ struct f2fs_mount_info 
         /* For compression */
         unsigned char compress_algorithm;       /* algorithm type */
         unsigned char compress_log_size;        /* cluster log size */
+ +      unsigned char compress_level;           /* compress level */
         bool compress_chksum;                   /* compressed data chksum */
         unsigned char compress_ext_cnt;         /* extension count */
         int compress_mode;                      /* compression mode */
@@@ -267,26 -266,6 +267,26 @@@ struct fsync_node_entry 
         unsigned int seq_id;    /* sequence id */
   };
   
+ +struct ckpt_req {
+ +      struct completion wait;         /* completion for checkpoint done */
+ +      struct llist_node llnode;       /* llist_node to be linked in wait queue */
+ +      int ret;                        /* return code of checkpoint */
+ +      ktime_t queue_time;             /* request queued time */
+ +};
+ +
+ +struct ckpt_req_control {
+ +      struct task_struct *f2fs_issue_ckpt;    /* checkpoint task */
+ +      int ckpt_thread_ioprio;                 /* checkpoint merge thread ioprio */
+ +      wait_queue_head_t ckpt_wait_queue;      /* waiting queue for wake-up */
+ +      atomic_t issued_ckpt;           /* # of actually issued ckpts */
+ +      atomic_t total_ckpt;            /* # of total ckpts */
+ +      atomic_t queued_ckpt;           /* # of queued ckpts */
+ +      struct llist_head issue_list;   /* list for command issue */
+ +      spinlock_t stat_lock;           /* lock for below checkpoint time stats */
+ +      unsigned int cur_time;          /* cur wait time in msec for currently issued checkpoint */
+ +      unsigned int peak_time;         /* peak wait time in msec until now */
+ +};
+ +
   /* for the bitmap indicate blocks to be discarded */
   struct discard_entry {
         struct list_head list;  /* list head */
@@@ -738,6 -717,7 +738,6 @@@ struct f2fs_inode_info 
         struct list_head inmem_pages;   /* inmemory pages managed by f2fs */
         struct task_struct *inmem_task; /* store inmemory task */
         struct mutex inmem_lock;        /* lock for inmemory pages */
- -      pgoff_t ra_offset;              /* ongoing readahead offset */
         struct extent_tree *extent_tree;        /* cached extent_tree entry */
   
         /* avoid racing between foreground op and gc */
@@@ -755,7 -735,6 +755,7 @@@
         atomic_t i_compr_blocks;                /* # of compressed blocks */
         unsigned char i_compress_algorithm;     /* algorithm type */
         unsigned char i_log_cluster_size;       /* log of cluster size */
+ +      unsigned char i_compress_level;         /* compress level (lz4hc,zstd) */
         unsigned short i_compress_flag;         /* compress flag */
         unsigned int i_cluster_size;            /* cluster size */
   };
@@@ -1331,8 -1310,6 +1331,8 @@@ struct compress_data 
   
   #define F2FS_COMPRESSED_PAGE_MAGIC    0xF5F2C000
   
+ +#define       COMPRESS_LEVEL_OFFSET   8
+ +
   /* compress context */
   struct compress_ctx {
         struct inode *inode;            /* inode the context belong to */
@@@ -1360,7 -1337,7 +1360,7 @@@ struct compress_io_ctx 
         atomic_t pending_pages;         /* in-flight compressed page count */
   };
   
- -/* decompress io context for read IO path */
+ +/* Context for decompressing one cluster on the read IO path */
   struct decompress_io_ctx {
         u32 magic;                      /* magic number to indicate page is compressed */
         struct inode *inode;            /* inode the context belong to */
@@@ -1376,37 -1353,11 +1376,37 @@@
         struct compress_data *cbuf;     /* virtual mapped address on cpages */
         size_t rlen;                    /* valid data length in rbuf */
         size_t clen;                    /* valid data length in cbuf */
- -      atomic_t pending_pages;         /* in-flight compressed page count */
- -      atomic_t verity_pages;          /* in-flight page count for verity */
- -      bool failed;                    /* indicate IO error during decompression */
+ +
+ +      /*
+ +       * The number of compressed pages remaining to be read in this cluster.
+ +       * This is initially nr_cpages.  It is decremented by 1 each time a page
+ +       * has been read (or failed to be read).  When it reaches 0, the cluster
+ +       * is decompressed (or an error is reported).
+ +       *
+ +       * If an error occurs before all the pages have been submitted for I/O,
+ +       * then this will never reach 0.  In this case the I/O submitter is
+ +       * responsible for calling f2fs_decompress_end_io() instead.
+ +       */
+ +      atomic_t remaining_pages;
+ +
+ +      /*
+ +       * Number of references to this decompress_io_ctx.
+ +       *
+ +       * One reference is held for I/O completion.  This reference is dropped
+ +       * after the pagecache pages are updated and unlocked -- either after
+ +       * decompression (and verity if enabled), or after an error.
+ +       *
+ +       * In addition, each compressed page holds a reference while it is in a
+ +       * bio.  These references are necessary to prevent compressed pages from
+ +       * being freed while they are still in a bio.
+ +       */
+ +      refcount_t refcnt;
+ +
+ +      bool failed;                    /* IO error occurred before decompression? */
+ +      bool need_verity;               /* need fs-verity verification after decompression? */
         void *private;                  /* payload buffer for specified decompression algorithm */
         void *private2;                 /* extra payload buffer */
+ +      struct work_struct verity_work; /* work to verify the decompressed pages */
   };
   
   #define NULL_CLUSTER                  ((unsigned int)(~0))
@@@ -1453,7 -1404,6 +1453,7 @@@ struct f2fs_sb_info 
         wait_queue_head_t cp_wait;
         unsigned long last_time[MAX_TIME];      /* to store time in jiffies */
         long interval_time[MAX_TIME];           /* to store thresholds */
+ +      struct ckpt_req_control cprc_info;      /* for checkpoint request control */
   
         struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
   
@@@ -1494,6 -1444,7 +1494,6 @@@
         unsigned int total_sections;            /* total section count */
         unsigned int total_node_count;          /* total node block count */
         unsigned int total_valid_node_count;    /* valid node block count */
- -      loff_t max_file_blocks;                 /* max block index of file */
         int dir_level;                          /* directory level */
         int readdir_ra;                         /* readahead inode in readdir */
         u64 max_io_bytes;                       /* max io bytes to merge IOs */
@@@ -1590,12 -1541,9 +1590,12 @@@
         unsigned int node_io_flag;
   
         /* For sysfs suppport */
- -      struct kobject s_kobj;
+ +      struct kobject s_kobj;                  /* /sys/fs/f2fs/<devname> */
         struct completion s_kobj_unregister;
   
+ +      struct kobject s_stat_kobj;             /* /sys/fs/f2fs/<devname>/stat */
+ +      struct completion s_stat_kobj_unregister;
+ +
         /* For shrinker support */
         struct list_head s_list;
         int s_ndevs;                            /* number of devices */
@@@ -3187,9 -3135,10 +3187,10 @@@ void f2fs_truncate_data_blocks(struct d
   int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock);
   int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock);
   int f2fs_truncate(struct inode *inode);
- int f2fs_getattr(const struct path *path, struct kstat *stat,
-                       u32 request_mask, unsigned int flags);
- int f2fs_setattr(struct dentry *dentry, struct iattr *attr);
+ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                struct kstat *stat, u32 request_mask, unsigned int flags);
+ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                struct iattr *attr);
   int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end);
   void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
   int f2fs_precache_extents(struct inode *inode);
@@@ -3284,7 -3233,6 +3285,7 @@@ int f2fs_inode_dirtied(struct inode *in
   void f2fs_inode_synced(struct inode *inode);
   int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
   int f2fs_quota_sync(struct super_block *sb, int type);
+ +loff_t max_file_blocks(struct inode *inode);
   void f2fs_quota_off_umount(struct super_block *sb);
   int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
   int f2fs_sync_fs(struct super_block *sb, int sync);
@@@ -3471,16 -3419,13 +3472,16 @@@ int f2fs_write_checkpoint(struct f2fs_s
   void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
   int __init f2fs_create_checkpoint_caches(void);
   void f2fs_destroy_checkpoint_caches(void);
+ +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi);
+ +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi);
+ +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi);
+ +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
   
   /*
    * data.c
    */
   int __init f2fs_init_bioset(void);
   void f2fs_destroy_bioset(void);
- -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
   int f2fs_init_bio_entry_cache(void);
   void f2fs_destroy_bio_entry_cache(void);
   void f2fs_submit_bio(struct f2fs_sb_info *sbi,
@@@ -3525,7 -3470,7 +3526,7 @@@ int f2fs_write_single_data_page(struct 
                                 struct bio **bio, sector_t *last_block,
                                 struct writeback_control *wbc,
                                 enum iostat_type io_type,
- -                              int compr_blocks);
+ +                              int compr_blocks, bool allow_balance);
   void f2fs_invalidate_page(struct page *page, unsigned int offset,
                         unsigned int length);
   int f2fs_release_page(struct page *page, gfp_t wait);
@@@ -3586,8 -3531,6 +3587,8 @@@ struct f2fs_stat_info 
         int nr_discarding, nr_discarded;
         int nr_discard_cmd;
         unsigned int undiscard_blks;
+ +      int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt;
+ +      unsigned int cur_ckpt_time, peak_ckpt_time;
         int inline_xattr, inline_inode, inline_dir, append, update, orphans;
         int compr_inode;
         unsigned long long compr_blocks;
@@@ -3773,6 -3716,8 +3774,6 @@@ void f2fs_update_sit_info(struct f2fs_s
   #define stat_dec_compr_inode(inode)                   do { } while (0)
   #define stat_add_compr_blocks(inode, blocks)          do { } while (0)
   #define stat_sub_compr_blocks(inode, blocks)          do { } while (0)
- -#define stat_inc_atomic_write(inode)                  do { } while (0)
- -#define stat_dec_atomic_write(inode)                  do { } while (0)
   #define stat_update_max_atomic_write(inode)           do { } while (0)
   #define stat_inc_volatile_write(inode)                        do { } while (0)
   #define stat_dec_volatile_write(inode)                        do { } while (0)
@@@ -3932,7 -3877,7 +3933,7 @@@ void f2fs_compress_write_end_io(struct 
   bool f2fs_is_compress_backend_ready(struct inode *inode);
   int f2fs_init_compress_mempool(void);
   void f2fs_destroy_compress_mempool(void);
- -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
+ +void f2fs_end_read_compressed_page(struct page *page, bool failed);
   bool f2fs_cluster_is_empty(struct compress_ctx *cc);
   bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
   void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
@@@ -3945,8 -3890,9 +3946,8 @@@ int f2fs_read_multi_pages(struct compre
                                 unsigned nr_pages, sector_t *last_block_in_bio,
                                 bool is_readahead, bool for_write);
   struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
- -void f2fs_free_dic(struct decompress_io_ctx *dic);
- -void f2fs_decompress_end_io(struct page **rpages,
- -                      unsigned int cluster_size, bool err, bool verity);
+ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
+ +void f2fs_put_page_dic(struct page *page);
   int f2fs_init_compress_ctx(struct compress_ctx *cc);
   void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
   void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@@@ -3970,14 -3916,6 +3971,14 @@@ static inline struct page *f2fs_compres
   }
   static inline int f2fs_init_compress_mempool(void) { return 0; }
   static inline void f2fs_destroy_compress_mempool(void) { }
+ +static inline void f2fs_end_read_compressed_page(struct page *page, bool failed)
+ +{
+ +      WARN_ON_ONCE(1);
+ +}
+ +static inline void f2fs_put_page_dic(struct page *page)
+ +{
+ +      WARN_ON_ONCE(1);
+ +}
   static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
   static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
   static inline int __init f2fs_init_compress_cache(void) { return 0; }
@@@ -3997,11 -3935,6 +3998,11 @@@ static inline void set_compress_context
                                 1 << COMPRESS_CHKSUM : 0;
         F2FS_I(inode)->i_cluster_size =
                         1 << F2FS_I(inode)->i_log_cluster_size;
+ +      if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 &&
+ +                      F2FS_OPTION(sbi).compress_level)
+ +              F2FS_I(inode)->i_compress_flag |=
+ +                              F2FS_OPTION(sbi).compress_level <<
+ +                              COMPRESS_LEVEL_OFFSET;
         F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
         set_inode_flag(inode, FI_COMPRESSED_FILE);
         stat_inc_compr_inode(inode);
@@@ -4182,12 -4115,6 +4183,12 @@@ static inline bool f2fs_force_buffered_
         return false;
   }
   
+ +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+ +{
+ +      return fsverity_active(inode) &&
+ +             idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+ +}
+ +
   #ifdef CONFIG_F2FS_FAULT_INJECTION
   extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
                                                         unsigned int type);
diff --combined fs/f2fs/file.c

index 471a6ff,8f1e97e..d26ff2a
--- 1/fs/f2fs/file.c
--- 2/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@@ -29,6 -29,7 +29,6 @@@
   #include "xattr.h"
   #include "acl.h"
   #include "gc.h"
- -#include "trace.h"
   #include <trace/events/f2fs.h>
   #include <uapi/linux/f2fs.h>
   
@@@ -59,9 -60,6 +59,9 @@@ static vm_fault_t f2fs_vm_page_mkwrite(
         bool need_alloc = true;
         int err = 0;
   
+ +      if (unlikely(IS_IMMUTABLE(inode)))
+ +              return VM_FAULT_SIGBUS;
+ +
         if (unlikely(f2fs_cp_error(sbi))) {
                 err = -EIO;
                 goto err;
@@@ -72,10 -70,6 +72,10 @@@
                 goto err;
         }
   
+ +      err = f2fs_convert_inline_inode(inode);
+ +      if (err)
+ +              goto err;
+ +
   #ifdef CONFIG_F2FS_FS_COMPRESSION
         if (f2fs_compressed_file(inode)) {
                 int ret = f2fs_is_compressed_cluster(inode, page->index);
@@@ -372,6 -366,7 +372,6 @@@ flush_out
         f2fs_update_time(sbi, REQ_TIME);
   out:
         trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
- -      f2fs_trace_ios(NULL, 1);
         return ret;
   }
   
@@@ -488,9 -483,6 +488,9 @@@ static loff_t f2fs_llseek(struct file *
         struct inode *inode = file->f_mapping->host;
         loff_t maxbytes = inode->i_sb->s_maxbytes;
   
+ +      if (f2fs_compressed_file(inode))
+ +              maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+ +
         switch (whence) {
         case SEEK_SET:
         case SEEK_CUR:
@@@ -510,6 -502,7 +510,6 @@@
   static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
   {
         struct inode *inode = file_inode(file);
- -      int err;
   
         if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
                 return -EIO;
@@@ -517,6 -510,11 +517,6 @@@
         if (!f2fs_is_compress_backend_ready(inode))
                 return -EOPNOTSUPP;
   
- -      /* we don't need to use inline_data strictly */
- -      err = f2fs_convert_inline_inode(inode);
- -      if (err)
- -              return err;
- -
         file_accessed(file);
         vma->vm_ops = &f2fs_file_vm_ops;
         set_inode_flag(inode, FI_MMAP_FILE);
@@@ -669,7 -667,7 +669,7 @@@ int f2fs_do_truncate_blocks(struct inod
   
         free_from = (pgoff_t)F2FS_BLK_ALIGN(from);
   
- -      if (free_from >= sbi->max_file_blocks)
+ +      if (free_from >= max_file_blocks(inode))
                 goto free_partial;
   
         if (lock)
@@@ -769,10 -767,6 +769,10 @@@ int f2fs_truncate(struct inode *inode
                 return -EIO;
         }
   
+ +      err = dquot_initialize(inode);
+ +      if (err)
+ +              return err;
+ +
         /* we should check inline_data size */
         if (!f2fs_may_inline_data(inode)) {
                 err = f2fs_convert_inline_inode(inode);
@@@ -789,8 -783,8 +789,8 @@@
         return 0;
   }
   
- int f2fs_getattr(const struct path *path, struct kstat *stat,
-                u32 request_mask, unsigned int query_flags)
+ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                struct kstat *stat, u32 request_mask, unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
         struct f2fs_inode_info *fi = F2FS_I(inode);
@@@ -826,7 -820,7 +826,7 @@@
                                   STATX_ATTR_NODUMP |
                                   STATX_ATTR_VERITY);
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
   
         /* we need to show initial sectors used for inline_data/dentries */
         if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
@@@ -837,7 -831,8 +837,8 @@@
   }
   
   #ifdef CONFIG_F2FS_FS_POSIX_ACL
- static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+ static void __setattr_copy(struct user_namespace *mnt_userns,
+                          struct inode *inode, const struct iattr *attr)
   {
         unsigned int ia_valid = attr->ia_valid;
   
@@@ -853,9 -848,9 +854,9 @@@
                 inode->i_ctime = attr->ia_ctime;
         if (ia_valid & ATTR_MODE) {
                 umode_t mode = attr->ia_mode;
+               kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
   
-               if (!in_group_p(inode->i_gid) &&
-                       !capable_wrt_inode_uidgid(inode, CAP_FSETID))
- -              if (!in_group_p(kgid) && !capable(CAP_FSETID))
++              if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
                         mode &= ~S_ISGID;
                 set_acl_inode(inode, mode);
         }
@@@ -864,7 -859,8 +865,8 @@@
   #define __setattr_copy setattr_copy
   #endif
   
- int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         int err;
@@@ -872,19 -868,11 +874,19 @@@
         if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
                 return -EIO;
   
+ +      if (unlikely(IS_IMMUTABLE(inode)))
+ +              return -EPERM;
+ +
+ +      if (unlikely(IS_APPEND(inode) &&
+ +                      (attr->ia_valid & (ATTR_MODE | ATTR_UID |
+ +                                ATTR_GID | ATTR_TIMES_SET))))
+ +              return -EPERM;
+ +
         if ((attr->ia_valid & ATTR_SIZE) &&
                 !f2fs_is_compress_backend_ready(inode))
                 return -EOPNOTSUPP;
   
-       err = setattr_prepare(dentry, attr);
+       err = setattr_prepare(&init_user_ns, dentry, attr);
         if (err)
                 return err;
   
@@@ -960,14 -948,13 +962,14 @@@
                 spin_unlock(&F2FS_I(inode)->i_size_lock);
         }
   
-       __setattr_copy(inode, attr);
+       __setattr_copy(&init_user_ns, inode, attr);
   
         if (attr->ia_valid & ATTR_MODE) {
-               err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
- -              err = posix_acl_chmod(&init_user_ns, inode,
- -                                    f2fs_get_inode_mode(inode));
- -              if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
- -                      inode->i_mode = F2FS_I(inode)->i_acl_mode;
++              err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode));
+ +
+ +              if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ +                      if (!err)
+ +                              inode->i_mode = F2FS_I(inode)->i_acl_mode;
                         clear_inode_flag(inode, FI_ACL_MODE);
                 }
         }
@@@ -1978,7 -1965,7 +1980,7 @@@ static int f2fs_ioc_setflags(struct fil
         u32 iflags;
         int ret;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         if (get_user(fsflags, (int __user *)arg))
@@@ -2025,7 -2012,7 +2027,7 @@@ static int f2fs_ioc_start_atomic_write(
         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
         int ret;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         if (!S_ISREG(inode->i_mode))
@@@ -2092,7 -2079,7 +2094,7 @@@ static int f2fs_ioc_commit_atomic_write
         struct inode *inode = file_inode(filp);
         int ret;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         ret = mnt_want_write_file(filp);
@@@ -2134,7 -2121,7 +2136,7 @@@ static int f2fs_ioc_start_volatile_writ
         struct inode *inode = file_inode(filp);
         int ret;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         if (!S_ISREG(inode->i_mode))
@@@ -2169,7 -2156,7 +2171,7 @@@ static int f2fs_ioc_release_volatile_wr
         struct inode *inode = file_inode(filp);
         int ret;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         ret = mnt_want_write_file(filp);
@@@ -2198,7 -2185,7 +2200,7 @@@ static int f2fs_ioc_abort_volatile_writ
         struct inode *inode = file_inode(filp);
         int ret;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         ret = mnt_want_write_file(filp);
@@@ -2747,7 -2734,7 +2749,7 @@@ static int f2fs_ioc_defragment(struct f
                 return -EINVAL;
   
         if (unlikely((range.start + range.len) >> PAGE_SHIFT >
- -                                      sbi->max_file_blocks))
+ +                                      max_file_blocks(inode)))
                 return -EINVAL;
   
         err = mnt_want_write_file(filp);
@@@ -3175,7 -3162,7 +3177,7 @@@ static int f2fs_ioc_fssetxattr(struct f
                 return -EFAULT;
   
         /* Make sure caller has proper permission */
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS)
@@@ -3310,7 -3297,7 +3312,7 @@@ int f2fs_precache_extents(struct inode 
         map.m_next_extent = &m_next_extent;
         map.m_seg_type = NO_CHECK_TYPE;
         map.m_may_create = false;
- -      end = F2FS_I_SB(inode)->max_file_blocks;
+ +      end = max_file_blocks(inode);
   
         while (map.m_lblk < end) {
                 map.m_len = end - map.m_lblk;
@@@ -3374,14 -3361,6 +3376,14 @@@ static int f2fs_ioc_measure_verity(stru
         return fsverity_ioctl_measure(filp, (void __user *)arg);
   }
   
+ +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg)
+ +{
+ +      if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+ +              return -EOPNOTSUPP;
+ +
+ +      return fsverity_ioctl_read_metadata(filp, (const void __user *)arg);
+ +}
+ +
   static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
   {
         struct inode *inode = file_inode(filp);
@@@ -4068,10 -4047,8 +4070,10 @@@ static int redirty_blocks(struct inode 
   
         for (i = 0; i < page_len; i++, redirty_idx++) {
                 page = find_lock_page(mapping, redirty_idx);
- -              if (!page)
- -                      ret = -ENOENT;
+ +              if (!page) {
+ +                      ret = -ENOMEM;
+ +                      break;
+ +              }
                 set_page_dirty(page);
                 f2fs_put_page(page, 1);
                 f2fs_put_page(page, 0);
@@@ -4299,8 -4276,6 +4301,8 @@@ static long __f2fs_ioctl(struct file *f
                 return f2fs_ioc_enable_verity(filp, arg);
         case FS_IOC_MEASURE_VERITY:
                 return f2fs_ioc_measure_verity(filp, arg);
+ +      case FS_IOC_READ_VERITY_METADATA:
+ +              return f2fs_ioc_read_verity_metadata(filp, arg);
         case FS_IOC_GETFSLABEL:
                 return f2fs_ioc_getfslabel(filp, arg);
         case FS_IOC_SETFSLABEL:
@@@ -4378,11 -4353,6 +4380,11 @@@ static ssize_t f2fs_file_write_iter(str
                 inode_lock(inode);
         }
   
+ +      if (unlikely(IS_IMMUTABLE(inode))) {
+ +              ret = -EPERM;
+ +              goto unlock;
+ +      }
+ +
         ret = generic_write_checks(iocb, from);
         if (ret > 0) {
                 bool preallocated = false;
@@@ -4447,7 -4417,6 +4449,7 @@@ write
                 if (ret > 0)
                         f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
         }
+ +unlock:
         inode_unlock(inode);
   out:
         trace_f2fs_file_write_iter(inode, iocb->ki_pos,
@@@ -4558,7 -4527,6 +4560,7 @@@ long f2fs_compat_ioctl(struct file *fil
         case F2FS_IOC_RESIZE_FS:
         case FS_IOC_ENABLE_VERITY:
         case FS_IOC_MEASURE_VERITY:
+ +      case FS_IOC_READ_VERITY_METADATA:
         case FS_IOC_GETFSLABEL:
         case FS_IOC_SETFSLABEL:
         case F2FS_IOC_GET_COMPRESS_BLOCKS:
diff --combined fs/f2fs/namei.c

index 8878049,c061a67..17bd072
--- 1/fs/f2fs/namei.c
--- 2/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@@ -46,7 -46,7 +46,7 @@@ static struct inode *f2fs_new_inode(str
   
         nid_free = true;
   
-       inode_init_owner(inode, dir, mode);
+       inode_init_owner(&init_user_ns, inode, dir, mode);
   
         inode->i_ino = ino;
         inode->i_blocks = 0;
@@@ -314,8 -314,8 +314,8 @@@ static void set_compress_inode(struct f
         }
   }
   
- static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-                                               bool excl)
+ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
+                      struct dentry *dentry, umode_t mode, bool excl)
   {
         struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
         struct inode *inode;
@@@ -637,8 -637,8 +637,8 @@@ static const char *f2fs_get_link(struc
         return link;
   }
   
- static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
-                                       const char *symname)
+ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+                       struct dentry *dentry, const char *symname)
   {
         struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
         struct inode *inode;
@@@ -717,7 -717,8 +717,8 @@@ out_free_encrypted_link
         return err;
   }
   
- static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+                     struct dentry *dentry, umode_t mode)
   {
         struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
         struct inode *inode;
@@@ -770,8 -771,8 +771,8 @@@ static int f2fs_rmdir(struct inode *dir
         return -ENOTEMPTY;
   }
   
- static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
-                               umode_t mode, dev_t rdev)
+ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+                     struct dentry *dentry, umode_t mode, dev_t rdev)
   {
         struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
         struct inode *inode;
@@@ -855,11 -856,7 +856,11 @@@ static int __f2fs_tmpfile(struct inode 
   
         if (whiteout) {
                 f2fs_i_links_write(inode, false);
+ +
+ +              spin_lock(&inode->i_lock);
                 inode->i_state |= I_LINKABLE;
+ +              spin_unlock(&inode->i_lock);
+ +
                 *whiteout = inode;
         } else {
                 d_tmpfile(dentry, inode);
@@@ -878,7 -875,8 +879,8 @@@ out
         return err;
   }
   
- static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+                       struct dentry *dentry, umode_t mode)
   {
         struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
   
@@@ -1045,11 -1043,7 +1047,11 @@@ static int f2fs_rename(struct inode *ol
                 err = f2fs_add_link(old_dentry, whiteout);
                 if (err)
                         goto put_out_dir;
+ +
+ +              spin_lock(&whiteout->i_lock);
                 whiteout->i_state &= ~I_LINKABLE;
+ +              spin_unlock(&whiteout->i_lock);
+ +
                 iput(whiteout);
         }
   
@@@ -1255,7 -1249,8 +1257,8 @@@ out
         return err;
   }
   
- static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ static int f2fs_rename2(struct user_namespace *mnt_userns,
+                       struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
   {
diff --combined fs/f2fs/xattr.c

index 8159fae,10081bf..490f843
--- 1/fs/f2fs/xattr.c
--- 2/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@@ -64,6 -64,7 +64,7 @@@ static int f2fs_xattr_generic_get(cons
   }
   
   static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+               struct user_namespace *mnt_userns,
                 struct dentry *unused, struct inode *inode,
                 const char *name, const void *value,
                 size_t size, int flags)
@@@ -107,6 -108,7 +108,7 @@@ static int f2fs_xattr_advise_get(const 
   }
   
   static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+               struct user_namespace *mnt_userns,
                 struct dentry *unused, struct inode *inode,
                 const char *name, const void *value,
                 size_t size, int flags)
@@@ -114,7 -116,7 +116,7 @@@
         unsigned char old_advise = F2FS_I(inode)->i_advise;
         unsigned char new_advise;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EPERM;
         if (value == NULL)
                 return -EINVAL;
@@@ -327,7 -329,7 +329,7 @@@ static int lookup_all_xattrs(struct ino
         void *last_addr = NULL;
         nid_t xnid = F2FS_I(inode)->i_xattr_nid;
         unsigned int inline_size = inline_xattr_size(inode);
- -      int err = 0;
+ +      int err;
   
         if (!xnid && !inline_size)
                 return -ENODATA;
@@@ -515,7 -517,7 +517,7 @@@ int f2fs_getxattr(struct inode *inode, 
                 void *buffer, size_t buffer_size, struct page *ipage)
   {
         struct f2fs_xattr_entry *entry = NULL;
- -      int error = 0;
+ +      int error;
         unsigned int size, len;
         void *base_addr = NULL;
         int base_size;
@@@ -562,7 -564,7 +564,7 @@@ ssize_t f2fs_listxattr(struct dentry *d
         struct inode *inode = d_inode(dentry);
         struct f2fs_xattr_entry *entry;
         void *base_addr, *last_base_addr;
- -      int error = 0;
+ +      int error;
         size_t rest = buffer_size;
   
         down_read(&F2FS_I(inode)->i_xattr_sem);
@@@ -632,7 -634,7 +634,7 @@@ static int __f2fs_setxattr(struct inod
         int found, newsize;
         size_t len;
         __u32 new_hsize;
- -      int error = 0;
+ +      int error;
   
         if (name == NULL)
                 return -EINVAL;
@@@ -673,7 -675,7 +675,7 @@@
                 }
   
                 if (value && f2fs_xattr_value_same(here, value, size))
- -                      goto exit;
+ +                      goto same;
         } else if ((flags & XATTR_REPLACE)) {
                 error = -ENODATA;
                 goto exit;
@@@ -738,20 -740,17 +740,20 @@@
         if (error)
                 goto exit;
   
- -      if (is_inode_flag_set(inode, FI_ACL_MODE)) {
- -              inode->i_mode = F2FS_I(inode)->i_acl_mode;
- -              inode->i_ctime = current_time(inode);
- -              clear_inode_flag(inode, FI_ACL_MODE);
- -      }
         if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
                         !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
                 f2fs_set_encrypted_inode(inode);
         f2fs_mark_inode_dirty_sync(inode, true);
         if (!error && S_ISDIR(inode->i_mode))
                 set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
+ +
+ +same:
+ +      if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+ +              inode->i_mode = F2FS_I(inode)->i_acl_mode;
+ +              inode->i_ctime = current_time(inode);
+ +              clear_inode_flag(inode, FI_ACL_MODE);
+ +      }
+ +
   exit:
         kfree(base_addr);
         return error;
diff --combined fs/fat/file.c

index 5fee74f,da7c562..13855ba
--- 1/fs/fat/file.c
--- 2/fs/fat/file.c
+++ b/fs/fat/file.c
@@@ -95,7 -95,7 +95,7 @@@ static int fat_ioctl_set_attributes(str
                 goto out_unlock_inode;
   
         /* This MUST be done before doing anything irreversible... */
-       err = fat_setattr(file->f_path.dentry, &ia);
+       err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia);
         if (err)
                 goto out_unlock_inode;
   
@@@ -195,7 -195,7 +195,7 @@@ int fat_file_fsync(struct file *filp, l
         if (err)
                 return err;
   
- -      return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ +      return blkdev_issue_flush(inode->i_sb->s_bdev);
   }
   
   
@@@ -394,11 -394,11 +394,11 @@@ void fat_truncate_blocks(struct inode *
         fat_flush_inodes(inode->i_sb, inode, NULL);
   }
   
- int fat_getattr(const struct path *path, struct kstat *stat,
-               u32 request_mask, unsigned int flags)
+ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path,
+               struct kstat *stat, u32 request_mask, unsigned int flags)
   {
         struct inode *inode = d_inode(path->dentry);
-       generic_fillattr(inode, stat);
+       generic_fillattr(mnt_userns, inode, stat);
         stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
   
         if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) {
@@@ -447,12 -447,13 +447,13 @@@ static int fat_sanitize_mode(const stru
         return 0;
   }
   
- static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
+ static int fat_allow_set_time(struct user_namespace *mnt_userns,
+                             struct msdos_sb_info *sbi, struct inode *inode)
   {
         umode_t allow_utime = sbi->options.allow_utime;
   
-       if (!uid_eq(current_fsuid(), inode->i_uid)) {
-               if (in_group_p(inode->i_gid))
+       if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
+               if (in_group_p(i_gid_into_mnt(mnt_userns, inode)))
                         allow_utime >>= 3;
                 if (allow_utime & MAY_WRITE)
                         return 1;
@@@ -466,7 -467,8 +467,8 @@@
   /* valid file mode bits */
   #define FAT_VALID_MODE        (S_IFREG | S_IFDIR | S_IRWXUGO)
   
- int fat_setattr(struct dentry *dentry, struct iattr *attr)
+ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+               struct iattr *attr)
   {
         struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
         struct inode *inode = d_inode(dentry);
@@@ -476,11 -478,11 +478,11 @@@
         /* Check for setting the inode time. */
         ia_valid = attr->ia_valid;
         if (ia_valid & TIMES_SET_FLAGS) {
-               if (fat_allow_set_time(sbi, inode))
+               if (fat_allow_set_time(mnt_userns, sbi, inode))
                         attr->ia_valid &= ~TIMES_SET_FLAGS;
         }
   
-       error = setattr_prepare(dentry, attr);
+       error = setattr_prepare(mnt_userns, dentry, attr);
         attr->ia_valid = ia_valid;
         if (error) {
                 if (sbi->options.quiet)
@@@ -550,7 -552,7 +552,7 @@@
                 fat_truncate_time(inode, &attr->ia_mtime, S_MTIME);
         attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME);
   
-       setattr_copy(inode, attr);
+       setattr_copy(mnt_userns, inode, attr);
         mark_inode_dirty(inode);
   out:
         return error;
diff --combined fs/fcntl.c

index 483ef88,f6ac528..dfc72f1
--- 1/fs/fcntl.c
--- 2/fs/fcntl.c
+++ b/fs/fcntl.c
@@@ -25,6 -25,7 +25,7 @@@
   #include <linux/user_namespace.h>
   #include <linux/memfd.h>
   #include <linux/compat.h>
+ #include <linux/mount.h>
   
   #include <linux/poll.h>
   #include <asm/siginfo.h>
@@@ -46,7 -47,7 +47,7 @@@ static int setfl(int fd, struct file * 
   
         /* O_NOATIME can only be set by the owner or superuser */
         if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
-               if (!inode_owner_or_capable(inode))
+               if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode))
                         return -EPERM;
   
         /* required for strict SunOS emulation */
@@@ -148,15 -149,11 +149,15 @@@ void f_delown(struct file *filp
   
   pid_t f_getown(struct file *filp)
   {
- -      pid_t pid;
+ +      pid_t pid = 0;
         read_lock(&filp->f_owner.lock);
- -      pid = pid_vnr(filp->f_owner.pid);
- -      if (filp->f_owner.pid_type == PIDTYPE_PGID)
- -              pid = -pid;
+ +      rcu_read_lock();
+ +      if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
+ +              pid = pid_vnr(filp->f_owner.pid);
+ +              if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ +                      pid = -pid;
+ +      }
+ +      rcu_read_unlock();
         read_unlock(&filp->f_owner.lock);
         return pid;
   }
@@@ -204,14 -201,11 +205,14 @@@ static int f_setown_ex(struct file *fil
   static int f_getown_ex(struct file *filp, unsigned long arg)
   {
         struct f_owner_ex __user *owner_p = (void __user *)arg;
- -      struct f_owner_ex owner;
+ +      struct f_owner_ex owner = {};
         int ret = 0;
   
         read_lock(&filp->f_owner.lock);
- -      owner.pid = pid_vnr(filp->f_owner.pid);
+ +      rcu_read_lock();
+ +      if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
+ +              owner.pid = pid_vnr(filp->f_owner.pid);
+ +      rcu_read_unlock();
         switch (filp->f_owner.pid_type) {
         case PIDTYPE_PID:
                 owner.type = F_OWNER_TID;
diff --combined fs/gfs2/file.c

index 07f49e5,8f55238..95bbdd4
--- 1/fs/gfs2/file.c
--- 2/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@@ -238,7 -238,7 +238,7 @@@ static int do_gfs2_set_flags(struct fil
                 goto out;
   
         error = -EACCES;
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 goto out;
   
         error = 0;
@@@ -256,7 -256,7 +256,7 @@@
             !capable(CAP_LINUX_IMMUTABLE))
                 goto out;
         if (!IS_IMMUTABLE(inode)) {
-               error = gfs2_permission(inode, MAY_WRITE);
+               error = gfs2_permission(&init_user_ns, inode, MAY_WRITE);
                 if (error)
                         goto out;
         }
@@@ -749,7 -749,7 +749,7 @@@ static int gfs2_fsync(struct file *file
   {
         struct address_space *mapping = file->f_mapping;
         struct inode *inode = mapping->host;
- -      int sync_state = inode->i_state & I_DIRTY_ALL;
+ +      int sync_state = inode->i_state & I_DIRTY;
         struct gfs2_inode *ip = GFS2_I(inode);
         int ret = 0, ret1 = 0;
   
@@@ -762,7 -762,7 +762,7 @@@
         if (!gfs2_is_jdata(ip))
                 sync_state &= ~I_DIRTY_PAGES;
         if (datasync)
- -              sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
+ +              sync_state &= ~I_DIRTY_SYNC;
   
         if (sync_state) {
                 ret = sync_inode_metadata(inode, 1);
@@@ -797,7 -797,9 +797,7 @@@ static ssize_t gfs2_file_direct_read(st
         if (ret)
                 goto out_uninit;
   
- -      ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
- -                         is_sync_kiocb(iocb));
- -
+ +      ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
         gfs2_glock_dq(gh);
   out_uninit:
         gfs2_holder_uninit(gh);
@@@ -831,7 -833,8 +831,7 @@@ static ssize_t gfs2_file_direct_write(s
         if (offset + len > i_size_read(&ip->i_inode))
                 goto out;
   
- -      ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
- -                         is_sync_kiocb(iocb));
+ +      ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
         if (ret == -ENOTBLK)
                 ret = 0;
   out:
diff --combined fs/hfsplus/inode.c

index ca46432,7a937de..078c5c8
--- 1/fs/hfsplus/inode.c
--- 2/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@@ -241,12 -241,13 +241,13 @@@ static int hfsplus_file_release(struct 
         return 0;
   }
   
- static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
+ static int hfsplus_setattr(struct user_namespace *mnt_userns,
+                          struct dentry *dentry, struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         int error;
   
-       error = setattr_prepare(dentry, attr);
+       error = setattr_prepare(&init_user_ns, dentry, attr);
         if (error)
                 return error;
   
@@@ -264,14 -265,15 +265,15 @@@
                 inode->i_mtime = inode->i_ctime = current_time(inode);
         }
   
-       setattr_copy(inode, attr);
+       setattr_copy(&init_user_ns, inode, attr);
         mark_inode_dirty(inode);
   
         return 0;
   }
   
- int hfsplus_getattr(const struct path *path, struct kstat *stat,
-                   u32 request_mask, unsigned int query_flags)
+ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                   struct kstat *stat, u32 request_mask,
+                   unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
         struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
@@@ -286,7 -288,7 +288,7 @@@
         stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP;
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         return 0;
   }
   
@@@ -340,7 -342,7 +342,7 @@@ int hfsplus_file_fsync(struct file *fil
         }
   
         if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
- -              blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ +              blkdev_issue_flush(inode->i_sb->s_bdev);
   
         inode_unlock(inode);
   
@@@ -376,7 -378,7 +378,7 @@@ struct inode *hfsplus_new_inode(struct 
                 return NULL;
   
         inode->i_ino = sbi->next_cnid++;
-       inode_init_owner(inode, dir, mode);
+       inode_init_owner(&init_user_ns, inode, dir, mode);
         set_nlink(inode, 1);
         inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
   
diff --combined fs/hostfs/hostfs_kern.c

index 4a5beca,7c918cd..29e4077
--- 1/fs/hostfs/hostfs_kern.c
--- 2/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@@ -34,8 -34,6 +34,8 @@@ static inline struct hostfs_inode_info 
   
   #define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
   
+ +static struct kmem_cache *hostfs_inode_cache;
+ +
   /* Changed in hostfs_args before the kernel starts running */
   static char *root_ino = "";
   static int append = 0;
@@@ -223,7 -221,7 +223,7 @@@ static struct inode *hostfs_alloc_inode
   {
         struct hostfs_inode_info *hi;
   
- -      hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
+ +      hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
         if (hi == NULL)
                 return NULL;
         hi->fd = -1;
@@@ -245,7 -243,7 +245,7 @@@ static void hostfs_evict_inode(struct i
   
   static void hostfs_free_inode(struct inode *inode)
   {
- -      kfree(HOSTFS_I(inode));
+ +      kmem_cache_free(hostfs_inode_cache, HOSTFS_I(inode));
   }
   
   static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
@@@ -557,8 -555,8 +557,8 @@@ static int read_name(struct inode *ino
         return 0;
   }
   
- static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-                        bool excl)
+ static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+                        struct dentry *dentry, umode_t mode, bool excl)
   {
         struct inode *inode;
         char *name;
@@@ -656,8 -654,8 +656,8 @@@ static int hostfs_unlink(struct inode *
         return err;
   }
   
- static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
-                         const char *to)
+ static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino,
+                         struct dentry *dentry, const char *to)
   {
         char *file;
         int err;
@@@ -669,7 -667,8 +669,8 @@@
         return err;
   }
   
- static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
+ static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino,
+                       struct dentry *dentry, umode_t mode)
   {
         char *file;
         int err;
@@@ -693,7 -692,8 +694,8 @@@ static int hostfs_rmdir(struct inode *i
         return err;
   }
   
- static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+                       struct dentry *dentry, umode_t mode, dev_t dev)
   {
         struct inode *inode;
         char *name;
@@@ -731,7 -731,8 +733,8 @@@
         return err;
   }
   
- static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ static int hostfs_rename2(struct user_namespace *mnt_userns,
+                         struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry,
                           unsigned int flags)
   {
@@@ -759,7 -760,8 +762,8 @@@
         return err;
   }
   
- static int hostfs_permission(struct inode *ino, int desired)
+ static int hostfs_permission(struct user_namespace *mnt_userns,
+                            struct inode *ino, int desired)
   {
         char *name;
         int r = 0, w = 0, x = 0, err;
@@@ -781,11 -783,12 +785,12 @@@
                 err = access_file(name, r, w, x);
         __putname(name);
         if (!err)
-               err = generic_permission(ino, desired);
+               err = generic_permission(&init_user_ns, ino, desired);
         return err;
   }
   
- static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
+ static int hostfs_setattr(struct user_namespace *mnt_userns,
+                         struct dentry *dentry, struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         struct hostfs_iattr attrs;
@@@ -794,7 -797,7 +799,7 @@@
   
         int fd = HOSTFS_I(inode)->fd;
   
-       err = setattr_prepare(dentry, attr);
+       err = setattr_prepare(&init_user_ns, dentry, attr);
         if (err)
                 return err;
   
@@@ -851,7 -854,7 +856,7 @@@
             attr->ia_size != i_size_read(inode))
                 truncate_setsize(inode, attr->ia_size);
   
-       setattr_copy(inode, attr);
+       setattr_copy(&init_user_ns, inode, attr);
         mark_inode_dirty(inode);
         return 0;
   }
@@@ -988,16 -991,12 +993,16 @@@ MODULE_ALIAS_FS("hostfs")
   
   static int __init init_hostfs(void)
   {
+ +      hostfs_inode_cache = KMEM_CACHE(hostfs_inode_info, 0);
+ +      if (!hostfs_inode_cache)
+ +              return -ENOMEM;
         return register_filesystem(&hostfs_type);
   }
   
   static void __exit exit_hostfs(void)
   {
         unregister_filesystem(&hostfs_type);
+ +      kmem_cache_destroy(hostfs_inode_cache);
   }
   
   module_init(init_hostfs)
diff --combined fs/hugetlbfs/inode.c

index 21c20fd,c5c32eb..b7a72f5
--- 1/fs/hugetlbfs/inode.c
--- 2/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@@ -735,10 -735,9 +735,10 @@@ static long hugetlbfs_fallocate(struct 
   
                 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
   
+ +              set_page_huge_active(page);
                 /*
                  * unlock_page because locked by add_to_page_cache()
- -               * page_put due to reference from alloc_huge_page()
+ +               * put_page() due to reference from alloc_huge_page()
                  */
                 unlock_page(page);
                 put_page(page);
@@@ -752,7 -751,8 +752,8 @@@ out
         return error;
   }
   
- static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+ static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
+                            struct dentry *dentry, struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         struct hstate *h = hstate_inode(inode);
@@@ -762,7 -762,7 +763,7 @@@
   
         BUG_ON(!inode);
   
-       error = setattr_prepare(dentry, attr);
+       error = setattr_prepare(&init_user_ns, dentry, attr);
         if (error)
                 return error;
   
@@@ -781,7 -781,7 +782,7 @@@
                         return error;
         }
   
-       setattr_copy(inode, attr);
+       setattr_copy(&init_user_ns, inode, attr);
         mark_inode_dirty(inode);
         return 0;
   }
@@@ -837,7 -837,7 +838,7 @@@ static struct inode *hugetlbfs_get_inod
                 struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
   
                 inode->i_ino = get_next_ino();
-               inode_init_owner(inode, dir, mode);
+               inode_init_owner(&init_user_ns, inode, dir, mode);
                 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
                                 &hugetlbfs_i_mmap_rwsem_key);
                 inode->i_mapping->a_ops = &hugetlbfs_aops;
@@@ -899,33 -899,39 +900,39 @@@ static int do_hugetlbfs_mknod(struct in
         return error;
   }
   
- static int hugetlbfs_mknod(struct inode *dir,
-                       struct dentry *dentry, umode_t mode, dev_t dev)
+ static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+                          struct dentry *dentry, umode_t mode, dev_t dev)
   {
         return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
   }
   
- static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+                          struct dentry *dentry, umode_t mode)
   {
-       int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+       int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry,
+                                    mode | S_IFDIR, 0);
         if (!retval)
                 inc_nlink(dir);
         return retval;
   }
   
- static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+ static int hugetlbfs_create(struct user_namespace *mnt_userns,
+                           struct inode *dir, struct dentry *dentry,
+                           umode_t mode, bool excl)
   {
-       return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+       return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
   }
   
- static int hugetlbfs_tmpfile(struct inode *dir,
-                       struct dentry *dentry, umode_t mode)
+ static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
+                            struct inode *dir, struct dentry *dentry,
+                            umode_t mode)
   {
         return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
   }
   
- static int hugetlbfs_symlink(struct inode *dir,
-                       struct dentry *dentry, const char *symname)
+ static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
+                            struct inode *dir, struct dentry *dentry,
+                            const char *symname)
   {
         struct inode *inode;
         int error = -ENOSPC;
diff --combined fs/inode.c

index 8742421,0815196..6dba963
--- 1/fs/inode.c
--- 2/fs/inode.c
+++ b/fs/inode.c
@@@ -1493,7 -1493,7 +1493,7 @@@ struct inode *find_inode_rcu(struct sup
   EXPORT_SYMBOL(find_inode_rcu);
   
   /**
- - * find_inode_by_rcu - Find an inode in the inode cache
+ + * find_inode_by_ino_rcu - Find an inode in the inode cache
    * @sb:               Super block of file system to search
    * @ino:      The inode number to match
    *
@@@ -1743,26 -1743,24 +1743,26 @@@ static int relatime_need_update(struct 
   
   int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
   {
- -      int iflags = I_DIRTY_TIME;
- -      bool dirty = false;
- -
- -      if (flags & S_ATIME)
- -              inode->i_atime = *time;
- -      if (flags & S_VERSION)
- -              dirty = inode_maybe_inc_iversion(inode, false);
- -      if (flags & S_CTIME)
- -              inode->i_ctime = *time;
- -      if (flags & S_MTIME)
- -              inode->i_mtime = *time;
- -      if ((flags & (S_ATIME | S_CTIME | S_MTIME)) &&
- -          !(inode->i_sb->s_flags & SB_LAZYTIME))
- -              dirty = true;
- -
- -      if (dirty)
- -              iflags |= I_DIRTY_SYNC;
- -      __mark_inode_dirty(inode, iflags);
+ +      int dirty_flags = 0;
+ +
+ +      if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
+ +              if (flags & S_ATIME)
+ +                      inode->i_atime = *time;
+ +              if (flags & S_CTIME)
+ +                      inode->i_ctime = *time;
+ +              if (flags & S_MTIME)
+ +                      inode->i_mtime = *time;
+ +
+ +              if (inode->i_sb->s_flags & SB_LAZYTIME)
+ +                      dirty_flags |= I_DIRTY_TIME;
+ +              else
+ +                      dirty_flags |= I_DIRTY_SYNC;
+ +      }
+ +
+ +      if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
+ +              dirty_flags |= I_DIRTY_SYNC;
+ +
+ +      __mark_inode_dirty(inode, dirty_flags);
         return 0;
   }
   EXPORT_SYMBOL(generic_update_time);
@@@ -1779,7 -1777,7 +1779,7 @@@ static int update_time(struct inode *in
   }
   
   /**
- - *    touch_atime     -       update the access time
+ + *    atime_needs_update      -       update the access time
    *    @path: the &struct path to update
    *    @inode: inode to update
    *
@@@ -1798,7 -1796,7 +1798,7 @@@ bool atime_needs_update(const struct pa
         /* Atime updates will likely cause i_uid and i_gid to be written
          * back improperly if their true value is unknown to the vfs.
          */
-       if (HAS_UNMAPPED_ID(inode))
+       if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode))
                 return false;
   
         if (IS_NOATIME(inode))
@@@ -1905,7 -1903,8 +1905,8 @@@ int dentry_needs_remove_privs(struct de
         return mask;
   }
   
- static int __remove_privs(struct dentry *dentry, int kill)
+ static int __remove_privs(struct user_namespace *mnt_userns,
+                         struct dentry *dentry, int kill)
   {
         struct iattr newattrs;
   
@@@ -1914,7 -1913,7 +1915,7 @@@
          * Note we call this on write, so notify_change will not
          * encounter any conflicting delegations:
          */
-       return notify_change(dentry, &newattrs, NULL);
+       return notify_change(mnt_userns, dentry, &newattrs, NULL);
   }
   
   /*
@@@ -1941,7 -1940,7 +1942,7 @@@ int file_remove_privs(struct file *file
         if (kill < 0)
                 return kill;
         if (kill)
-               error = __remove_privs(dentry, kill);
+               error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
         if (!error)
                 inode_has_no_xattr(inode);
   
@@@ -2132,14 -2131,21 +2133,21 @@@ EXPORT_SYMBOL(init_special_inode)
   
   /**
    * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
+  * @mnt_userns:       User namespace of the mount the inode was created from
    * @inode: New inode
    * @dir: Directory inode
    * @mode: mode of the new inode
+  *
+  * If the inode has been created through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions
+  * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
+  * checking is to be performed on the raw inode simply pass init_user_ns.
    */
- void inode_init_owner(struct inode *inode, const struct inode *dir,
-                       umode_t mode)
+ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+                     const struct inode *dir, umode_t mode)
   {
-       inode->i_uid = current_fsuid();
+       inode->i_uid = fsuid_into_mnt(mnt_userns);
         if (dir && dir->i_mode & S_ISGID) {
                 inode->i_gid = dir->i_gid;
   
@@@ -2147,31 -2153,41 +2155,41 @@@
                 if (S_ISDIR(mode))
                         mode |= S_ISGID;
                 else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
-                        !in_group_p(inode->i_gid) &&
-                        !capable_wrt_inode_uidgid(dir, CAP_FSETID))
+                        !in_group_p(i_gid_into_mnt(mnt_userns, dir)) &&
+                        !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
                         mode &= ~S_ISGID;
         } else
-               inode->i_gid = current_fsgid();
+               inode->i_gid = fsgid_into_mnt(mnt_userns);
         inode->i_mode = mode;
   }
   EXPORT_SYMBOL(inode_init_owner);
   
   /**
    * inode_owner_or_capable - check current task permissions to inode
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode: inode being checked
    *
    * Return true if current either has CAP_FOWNER in a namespace with the
    * inode owner uid mapped, or owns the file.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
    */
- bool inode_owner_or_capable(const struct inode *inode)
+ bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+                           const struct inode *inode)
   {
+       kuid_t i_uid;
         struct user_namespace *ns;
   
-       if (uid_eq(current_fsuid(), inode->i_uid))
+       i_uid = i_uid_into_mnt(mnt_userns, inode);
+       if (uid_eq(current_fsuid(), i_uid))
                 return true;
   
         ns = current_user_ns();
-       if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
+       if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
                 return true;
         return false;
   }
diff --combined fs/internal.h

index 49bfb37,6c8a4ed..6aeae7e
--- 1/fs/internal.h
--- 2/fs/internal.h
+++ b/fs/internal.h
@@@ -15,7 -15,6 +15,7 @@@ struct mount
   struct shrink_control;
   struct fs_context;
   struct user_namespace;
+ +struct pipe_inode_info;
   
   /*
    * block_dev.c
@@@ -74,7 -73,7 +74,7 @@@ extern int vfs_path_lookup(struct dentr
                            const char *, unsigned int, struct path *);
   long do_rmdir(int dfd, struct filename *name);
   long do_unlinkat(int dfd, struct filename *name);
- int may_linkat(struct path *link);
+ int may_linkat(struct user_namespace *mnt_userns, struct path *link);
   int do_renameat2(int olddfd, struct filename *oldname, int newdfd,
                  struct filename *newname, unsigned int flags);
   
@@@ -133,7 -132,6 +133,7 @@@ extern struct file *do_file_open_root(s
                 const char *, const struct open_flags *);
   extern struct open_how build_open_how(int flags, umode_t mode);
   extern int build_open_flags(const struct open_how *how, struct open_flags *op);
+ +extern int __close_fd_get_file(unsigned int fd, struct file **res);
   
   long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
   int chmod_common(const struct path *path, umode_t mode);
@@@ -195,11 -193,3 +195,11 @@@ int sb_init_dio_done_wq(struct super_bl
    */
   int do_statx(int dfd, const char __user *filename, unsigned flags,
              unsigned int mask, struct statx __user *buffer);
+ +
+ +/*
+ + * fs/splice.c:
+ + */
+ +long splice_file_to_pipe(struct file *in,
+ +                       struct pipe_inode_info *opipe,
+ +                       loff_t *offset,
+ +                       size_t len, unsigned int flags);
diff --combined fs/libfs.c

index 1e55176,967aefd..e2de540
--- 1/fs/libfs.c
--- 2/fs/libfs.c
+++ b/fs/libfs.c
@@@ -27,11 -27,12 +27,12 @@@
   
   #include "internal.h"
   
- int simple_getattr(const struct path *path, struct kstat *stat,
-                  u32 request_mask, unsigned int query_flags)
+ int simple_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                  struct kstat *stat, u32 request_mask,
+                  unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
         return 0;
   }
@@@ -447,9 -448,9 +448,9 @@@ int simple_rmdir(struct inode *dir, str
   }
   EXPORT_SYMBOL(simple_rmdir);
   
- int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
-                 struct inode *new_dir, struct dentry *new_dentry,
-                 unsigned int flags)
+ int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+                 struct dentry *old_dentry, struct inode *new_dir,
+                 struct dentry *new_dentry, unsigned int flags)
   {
         struct inode *inode = d_inode(old_dentry);
         int they_are_dirs = d_is_dir(old_dentry);
@@@ -492,18 -493,19 +493,19 @@@ EXPORT_SYMBOL(simple_rename)
    * on simple regular filesystems.  Anything that needs to change on-disk
    * or wire state on size changes needs its own setattr method.
    */
- int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                  struct iattr *iattr)
   {
         struct inode *inode = d_inode(dentry);
         int error;
   
-       error = setattr_prepare(dentry, iattr);
+       error = setattr_prepare(mnt_userns, dentry, iattr);
         if (error)
                 return error;
   
         if (iattr->ia_valid & ATTR_SIZE)
                 truncate_setsize(inode, iattr->ia_size);
-       setattr_copy(inode, iattr);
+       setattr_copy(mnt_userns, inode, iattr);
         mark_inode_dirty(inode);
         return 0;
   }
@@@ -1117,7 -1119,7 +1119,7 @@@ int generic_file_fsync(struct file *fil
         err = __generic_file_fsync(file, start, end, datasync);
         if (err)
                 return err;
- -      return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ +      return blkdev_issue_flush(inode->i_sb->s_bdev);
   }
   EXPORT_SYMBOL(generic_file_fsync);
   
@@@ -1214,6 -1216,11 +1216,6 @@@ static int anon_set_page_dirty(struct p
         return 0;
   };
   
- -/*
- - * A single inode exists for all anon_inode files. Contrary to pipes,
- - * anon_inode inodes have no associated per-instance data, so we need
- - * only allocate one of them.
- - */
   struct inode *alloc_anon_inode(struct super_block *s)
   {
         static const struct address_space_operations anon_aops = {
@@@ -1295,15 -1302,17 +1297,17 @@@ static struct dentry *empty_dir_lookup(
         return ERR_PTR(-ENOENT);
   }
   
- static int empty_dir_getattr(const struct path *path, struct kstat *stat,
+ static int empty_dir_getattr(struct user_namespace *mnt_userns,
+                            const struct path *path, struct kstat *stat,
                              u32 request_mask, unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         return 0;
   }
   
- static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+ static int empty_dir_setattr(struct user_namespace *mnt_userns,
+                            struct dentry *dentry, struct iattr *attr)
   {
         return -EPERM;
   }
@@@ -1383,8 -1392,8 +1387,8 @@@ static bool needs_casefold(const struc
    *
    * Return: 0 if names match, 1 if mismatch, or -ERRNO
    */
- -int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- -                        const char *str, const struct qstr *name)
+ +static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ +                              const char *str, const struct qstr *name)
   {
         const struct dentry *parent = READ_ONCE(dentry->d_parent);
         const struct inode *dir = READ_ONCE(parent->d_inode);
@@@ -1421,6 -1430,7 +1425,6 @@@ fallback
                 return 1;
         return !!memcmp(str, name->name, len);
   }
- -EXPORT_SYMBOL(generic_ci_d_compare);
   
   /**
    * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@@@ -1429,7 -1439,7 +1433,7 @@@
    *
    * Return: 0 if hash was successful or unchanged, and -EINVAL on error
    */
- -int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+ +static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
   {
         const struct inode *dir = READ_ONCE(dentry->d_inode);
         struct super_block *sb = dentry->d_sb;
@@@ -1444,6 -1454,7 +1448,6 @@@
                 return -EINVAL;
         return 0;
   }
- -EXPORT_SYMBOL(generic_ci_d_hash);
   
   static const struct dentry_operations generic_ci_dentry_ops = {
         .d_hash = generic_ci_d_hash,
diff --combined fs/namei.c

index de74ad2,dbf53b3..216f16e
--- 1/fs/namei.c
--- 2/fs/namei.c
+++ b/fs/namei.c
@@@ -259,7 -259,24 +259,24 @@@ void putname(struct filename *name
                 __putname(name);
   }
   
- static int check_acl(struct inode *inode, int mask)
+ /**
+  * check_acl - perform ACL permission checking
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @inode:    inode to check permissions on
+  * @mask:     right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+  *
+  * This function performs the ACL permission checking. Since this function
+  * retrieves POSIX acls it needs to know whether it is called from a blocking or
+  * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  */
+ static int check_acl(struct user_namespace *mnt_userns,
+                    struct inode *inode, int mask)
   {
   #ifdef CONFIG_FS_POSIX_ACL
         struct posix_acl *acl;
@@@ -271,14 -288,14 +288,14 @@@
                 /* no ->get_acl() calls in RCU mode... */
                 if (is_uncached_acl(acl))
                         return -ECHILD;
-               return posix_acl_permission(inode, acl, mask);
+               return posix_acl_permission(mnt_userns, inode, acl, mask);
         }
   
         acl = get_acl(inode, ACL_TYPE_ACCESS);
         if (IS_ERR(acl))
                 return PTR_ERR(acl);
         if (acl) {
-               int error = posix_acl_permission(inode, acl, mask);
+               int error = posix_acl_permission(mnt_userns, inode, acl, mask);
                 posix_acl_release(acl);
                 return error;
         }
@@@ -287,18 -304,31 +304,31 @@@
         return -EAGAIN;
   }
   
- /*
-  * This does the basic UNIX permission checking.
+ /**
+  * acl_permission_check - perform basic UNIX permission checking
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @inode:    inode to check permissions on
+  * @mask:     right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+  *
+  * This function performs the basic UNIX permission checking. Since this
+  * function may retrieve POSIX acls it needs to know whether it is called from a
+  * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
    *
-  * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit,
-  * for RCU walking.
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
    */
- static int acl_permission_check(struct inode *inode, int mask)
+ static int acl_permission_check(struct user_namespace *mnt_userns,
+                               struct inode *inode, int mask)
   {
         unsigned int mode = inode->i_mode;
+       kuid_t i_uid;
   
         /* Are we the owner? If so, ACL's don't matter */
-       if (likely(uid_eq(current_fsuid(), inode->i_uid))) {
+       i_uid = i_uid_into_mnt(mnt_userns, inode);
+       if (likely(uid_eq(current_fsuid(), i_uid))) {
                 mask &= 7;
                 mode >>= 6;
                 return (mask & ~mode) ? -EACCES : 0;
@@@ -306,7 -336,7 +336,7 @@@
   
         /* Do we have ACL's? */
         if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
-               int error = check_acl(inode, mask);
+               int error = check_acl(mnt_userns, inode, mask);
                 if (error != -EAGAIN)
                         return error;
         }
@@@ -320,7 -350,8 +350,8 @@@
          * about? Need to check group ownership if so.
          */
         if (mask & (mode ^ (mode >> 3))) {
-               if (in_group_p(inode->i_gid))
+               kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
+               if (in_group_p(kgid))
                         mode >>= 3;
         }
   
@@@ -330,6 -361,7 +361,7 @@@
   
   /**
    * generic_permission -  check for access rights on a Posix-like filesystem
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode:    inode to check access rights for
    * @mask:     right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
    *            %MAY_NOT_BLOCK ...)
@@@ -342,25 -374,33 +374,33 @@@
    * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
    * request cannot be satisfied (eg. requires blocking or too much complexity).
    * It would then be called again in ref-walk mode.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
    */
- int generic_permission(struct inode *inode, int mask)
+ int generic_permission(struct user_namespace *mnt_userns, struct inode *inode,
+                      int mask)
   {
         int ret;
   
         /*
          * Do the basic permission checks.
          */
-       ret = acl_permission_check(inode, mask);
+       ret = acl_permission_check(mnt_userns, inode, mask);
         if (ret != -EACCES)
                 return ret;
   
         if (S_ISDIR(inode->i_mode)) {
                 /* DACs are overridable for directories */
                 if (!(mask & MAY_WRITE))
-                       if (capable_wrt_inode_uidgid(inode,
+                       if (capable_wrt_inode_uidgid(mnt_userns, inode,
                                                      CAP_DAC_READ_SEARCH))
                                 return 0;
-               if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+               if (capable_wrt_inode_uidgid(mnt_userns, inode,
+                                            CAP_DAC_OVERRIDE))
                         return 0;
                 return -EACCES;
         }
@@@ -370,7 -410,8 +410,8 @@@
          */
         mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
         if (mask == MAY_READ)
-               if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
+               if (capable_wrt_inode_uidgid(mnt_userns, inode,
+                                            CAP_DAC_READ_SEARCH))
                         return 0;
         /*
          * Read/write DACs are always overridable.
@@@ -378,31 -419,38 +419,38 @@@
          * at least one exec bit set.
          */
         if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
-               if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+               if (capable_wrt_inode_uidgid(mnt_userns, inode,
+                                            CAP_DAC_OVERRIDE))
                         return 0;
   
         return -EACCES;
   }
   EXPORT_SYMBOL(generic_permission);
   
- /*
+ /**
+  * do_inode_permission - UNIX permission checking
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @inode:    inode to check permissions on
+  * @mask:     right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
+  *
    * We _really_ want to just do "generic_permission()" without
    * even looking at the inode->i_op values. So we keep a cache
    * flag in inode->i_opflags, that says "this has no special
    * permission function, use the fast case".
    */
- static inline int do_inode_permission(struct inode *inode, int mask)
+ static inline int do_inode_permission(struct user_namespace *mnt_userns,
+                                     struct inode *inode, int mask)
   {
         if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
                 if (likely(inode->i_op->permission))
-                       return inode->i_op->permission(inode, mask);
+                       return inode->i_op->permission(mnt_userns, inode, mask);
   
                 /* This gets set once for the inode lifetime */
                 spin_lock(&inode->i_lock);
                 inode->i_opflags |= IOP_FASTPERM;
                 spin_unlock(&inode->i_lock);
         }
-       return generic_permission(inode, mask);
+       return generic_permission(mnt_userns, inode, mask);
   }
   
   /**
@@@ -427,8 -475,9 +475,9 @@@ static int sb_permission(struct super_b
   
   /**
    * inode_permission - Check for access rights to a given inode
-  * @inode: Inode to check permission on
-  * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+  * @mnt_userns:       User namespace of the mount the inode was found from
+  * @inode:    Inode to check permission on
+  * @mask:     Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
    *
    * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
    * this, letting us set arbitrary permissions for filesystem access without
@@@ -436,7 -485,8 +485,8 @@@
    *
    * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
    */
- int inode_permission(struct inode *inode, int mask)
+ int inode_permission(struct user_namespace *mnt_userns,
+                    struct inode *inode, int mask)
   {
         int retval;
   
@@@ -456,11 -506,11 +506,11 @@@
                  * written back improperly if their true value is unknown
                  * to the vfs.
                  */
-               if (HAS_UNMAPPED_ID(inode))
+               if (HAS_UNMAPPED_ID(mnt_userns, inode))
                         return -EACCES;
         }
   
-       retval = do_inode_permission(inode, mask);
+       retval = do_inode_permission(mnt_userns, inode, mask);
         if (retval)
                 return retval;
   
@@@ -630,11 -680,6 +680,11 @@@ static inline bool legitimize_path(stru
   static bool legitimize_links(struct nameidata *nd)
   {
         int i;
+ +      if (unlikely(nd->flags & LOOKUP_CACHED)) {
+ +              drop_links(nd);
+ +              nd->depth = 0;
+ +              return false;
+ +      }
         for (i = 0; i < nd->depth; i++) {
                 struct saved *last = nd->stack + i;
                 if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
@@@ -674,17 -719,17 +724,17 @@@ static bool legitimize_root(struct name
    */
   
   /**
- - * unlazy_walk - try to switch to ref-walk mode.
+ + * try_to_unlazy - try to switch to ref-walk mode.
    * @nd: nameidata pathwalk data
- - * Returns: 0 on success, -ECHILD on failure
+ + * Returns: true on success, false on failure
    *
- - * unlazy_walk attempts to legitimize the current nd->path and nd->root
+ + * try_to_unlazy attempts to legitimize the current nd->path and nd->root
    * for ref-walk mode.
    * Must be called from rcu-walk context.
- - * Nothing should touch nameidata between unlazy_walk() failure and
+ + * Nothing should touch nameidata between try_to_unlazy() failure and
    * terminate_walk().
    */
- -static int unlazy_walk(struct nameidata *nd)
+ +static bool try_to_unlazy(struct nameidata *nd)
   {
         struct dentry *parent = nd->path.dentry;
   
@@@ -699,30 -744,30 +749,30 @@@
                 goto out;
         rcu_read_unlock();
         BUG_ON(nd->inode != parent->d_inode);
- -      return 0;
+ +      return true;
   
   out1:
         nd->path.mnt = NULL;
         nd->path.dentry = NULL;
   out:
         rcu_read_unlock();
- -      return -ECHILD;
+ +      return false;
   }
   
   /**
- - * unlazy_child - try to switch to ref-walk mode.
+ + * try_to_unlazy_next - try to switch to ref-walk mode.
    * @nd: nameidata pathwalk data
- - * @dentry: child of nd->path.dentry
- - * @seq: seq number to check dentry against
- - * Returns: 0 on success, -ECHILD on failure
+ + * @dentry: next dentry to step into
+ + * @seq: seq number to check @dentry against
+ + * Returns: true on success, false on failure
    *
- - * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
- - * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
- - * @nd.  Must be called from rcu-walk context.
- - * Nothing should touch nameidata between unlazy_child() failure and
+ + * Similar to try_to_unlazy(), but here we have the next dentry already
+ + * picked by rcu-walk and want to legitimize that in addition to the current
+ + * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
+ + * Nothing should touch nameidata between try_to_unlazy_next() failure and
    * terminate_walk().
    */
- -static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
+ +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq)
   {
         BUG_ON(!(nd->flags & LOOKUP_RCU));
   
@@@ -752,7 -797,7 +802,7 @@@
         if (unlikely(!legitimize_root(nd)))
                 goto out_dput;
         rcu_read_unlock();
- -      return 0;
+ +      return true;
   
   out2:
         nd->path.mnt = NULL;
@@@ -760,11 -805,11 +810,11 @@@ out1
         nd->path.dentry = NULL;
   out:
         rcu_read_unlock();
- -      return -ECHILD;
+ +      return false;
   out_dput:
         rcu_read_unlock();
         dput(dentry);
- -      return -ECHILD;
+ +      return false;
   }
   
   static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
@@@ -797,8 -842,7 +847,8 @@@ static int complete_walk(struct nameida
                  */
                 if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
                         nd->root.mnt = NULL;
- -              if (unlikely(unlazy_walk(nd)))
+ +              nd->flags &= ~LOOKUP_CACHED;
+ +              if (!try_to_unlazy(nd))
                         return -ECHILD;
         }
   
@@@ -960,11 -1004,16 +1010,16 @@@ int sysctl_protected_regular __read_mos
    */
   static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
   {
+       struct user_namespace *mnt_userns;
+       kuid_t i_uid;
+ 
         if (!sysctl_protected_symlinks)
                 return 0;
   
+       mnt_userns = mnt_user_ns(nd->path.mnt);
+       i_uid = i_uid_into_mnt(mnt_userns, inode);
         /* Allowed if owner and follower match. */
-       if (uid_eq(current_cred()->fsuid, inode->i_uid))
+       if (uid_eq(current_cred()->fsuid, i_uid))
                 return 0;
   
         /* Allowed if parent directory not sticky and world-writable. */
@@@ -972,7 -1021,7 +1027,7 @@@
                 return 0;
   
         /* Allowed if parent directory and link owner match. */
-       if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
+       if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
                 return 0;
   
         if (nd->flags & LOOKUP_RCU)
@@@ -985,6 -1034,7 +1040,7 @@@
   
   /**
    * safe_hardlink_source - Check for safe hardlink conditions
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode: the source inode to hardlink from
    *
    * Return false if at least one of the following conditions:
@@@ -995,7 -1045,8 +1051,8 @@@
    *
    * Otherwise returns true.
    */
- static bool safe_hardlink_source(struct inode *inode)
+ static bool safe_hardlink_source(struct user_namespace *mnt_userns,
+                                struct inode *inode)
   {
         umode_t mode = inode->i_mode;
   
@@@ -1012,7 -1063,7 +1069,7 @@@
                 return false;
   
         /* Hardlinking to unreadable or unwritable sources is dangerous. */
-       if (inode_permission(inode, MAY_READ | MAY_WRITE))
+       if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE))
                 return false;
   
         return true;
@@@ -1020,6 -1071,7 +1077,7 @@@
   
   /**
    * may_linkat - Check permissions for creating a hardlink
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @link: the source to hardlink from
    *
    * Block hardlink when all of:
@@@ -1028,14 -1080,21 +1086,21 @@@
    *  - hardlink source is unsafe (see safe_hardlink_source() above)
    *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
    *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  *
    * Returns 0 if successful, -ve on error.
    */
- int may_linkat(struct path *link)
+ int may_linkat(struct user_namespace *mnt_userns, struct path *link)
   {
         struct inode *inode = link->dentry->d_inode;
   
         /* Inode writeback is not safe when the uid or gid are invalid. */
-       if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+       if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+           !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
                 return -EOVERFLOW;
   
         if (!sysctl_protected_hardlinks)
@@@ -1044,7 -1103,8 +1109,8 @@@
         /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
          * otherwise, it must be a safe source.
          */
-       if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
+       if (safe_hardlink_source(mnt_userns, inode) ||
+           inode_owner_or_capable(mnt_userns, inode))
                 return 0;
   
         audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
@@@ -1055,6 -1115,7 +1121,7 @@@
    * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
    *                      should be allowed, or not, on files that already
    *                      exist.
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @dir_mode: mode bits of directory
    * @dir_uid: owner of directory
    * @inode: the inode of the file to open
@@@ -1070,16 -1131,25 +1137,25 @@@
    * the directory doesn't have to be world writable: being group writable will
    * be enough.
    *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  *
    * Returns 0 if the open is allowed, -ve on error.
    */
- static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
-                               struct inode * const inode)
+ static int may_create_in_sticky(struct user_namespace *mnt_userns,
+                               struct nameidata *nd, struct inode *const inode)
   {
+       umode_t dir_mode = nd->dir_mode;
+       kuid_t dir_uid = nd->dir_uid;
+ 
         if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
             (!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
             likely(!(dir_mode & S_ISVTX)) ||
-           uid_eq(inode->i_uid, dir_uid) ||
-           uid_eq(current_fsuid(), inode->i_uid))
+           uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
+           uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
                 return 0;
   
         if (likely(dir_mode & 0002) ||
@@@ -1378,7 -1448,7 +1454,7 @@@ static inline int handle_mounts(struct 
                         return -ENOENT;
                 if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
                         return 0;
- -              if (unlazy_child(nd, dentry, seq))
+ +              if (!try_to_unlazy_next(nd, dentry, seq))
                         return -ECHILD;
                 // *path might've been clobbered by __follow_mount_rcu()
                 path->mnt = nd->path.mnt;
@@@ -1472,7 -1542,7 +1548,7 @@@ static struct dentry *lookup_fast(struc
                 unsigned seq;
                 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
                 if (unlikely(!dentry)) {
- -                      if (unlazy_walk(nd))
+ +                      if (!try_to_unlazy(nd))
                                 return ERR_PTR(-ECHILD);
                         return NULL;
                 }
@@@ -1499,9 -1569,9 +1575,9 @@@
                 status = d_revalidate(dentry, nd->flags);
                 if (likely(status > 0))
                         return dentry;
- -              if (unlazy_child(nd, dentry, seq))
+ +              if (!try_to_unlazy_next(nd, dentry, seq))
                         return ERR_PTR(-ECHILD);
- -              if (unlikely(status == -ECHILD))
+ +              if (status == -ECHILD)
                         /* we'd been told to redo it in non-rcu mode */
                         status = d_revalidate(dentry, nd->flags);
         } else {
@@@ -1569,14 -1639,18 +1645,15 @@@ static struct dentry *lookup_slow(cons
         return res;
   }
   
- static inline int may_lookup(struct nameidata *nd)
+ static inline int may_lookup(struct user_namespace *mnt_userns,
+                            struct nameidata *nd)
   {
         if (nd->flags & LOOKUP_RCU) {
-               int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- -              int err = inode_permission(mnt_userns, nd->inode,
- -                                         MAY_EXEC | MAY_NOT_BLOCK);
- -              if (err != -ECHILD)
++              int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+ +              if (err != -ECHILD || !try_to_unlazy(nd))
                         return err;
- -              if (unlazy_walk(nd))
- -                      return -ECHILD;
         }
-       return inode_permission(nd->inode, MAY_EXEC);
+       return inode_permission(mnt_userns, nd->inode, MAY_EXEC);
   }
   
   static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
@@@ -1596,7 -1670,7 +1673,7 @@@
                 // unlazy even if we fail to grab the link - cleanup needs it
                 bool grabbed_link = legitimize_path(nd, link, seq);
   
- -              if (unlazy_walk(nd) != 0 || !grabbed_link)
+ +              if (!try_to_unlazy(nd) != 0 || !grabbed_link)
                         return -ECHILD;
   
                 if (nd_alloc_stack(nd))
@@@ -1638,7 -1712,7 +1715,7 @@@ static const char *pick_link(struct nam
                 touch_atime(&last->link);
                 cond_resched();
         } else if (atime_needs_update(&last->link, inode)) {
- -              if (unlikely(unlazy_walk(nd)))
+ +              if (!try_to_unlazy(nd))
                         return ERR_PTR(-ECHILD);
                 touch_atime(&last->link);
         }
@@@ -1655,8 -1729,11 +1732,8 @@@
                 get = inode->i_op->get_link;
                 if (nd->flags & LOOKUP_RCU) {
                         res = get(NULL, inode, &last->done);
- -                      if (res == ERR_PTR(-ECHILD)) {
- -                              if (unlikely(unlazy_walk(nd)))
- -                                      return ERR_PTR(-ECHILD);
+ +                      if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
                                 res = get(link->dentry, inode, &last->done);
- -                      }
                 } else {
                         res = get(link->dentry, inode, &last->done);
                 }
@@@ -2122,11 -2199,13 +2199,13 @@@ static int link_path_walk(const char *n
   
         /* At this point we know we have a real path component. */
         for(;;) {
+               struct user_namespace *mnt_userns;
                 const char *link;
                 u64 hash_len;
                 int type;
   
-               err = may_lookup(nd);
+               mnt_userns = mnt_user_ns(nd->path.mnt);
+               err = may_lookup(mnt_userns, nd);
                 if (err)
                         return err;
   
@@@ -2174,7 -2253,7 +2253,7 @@@
   OK:
                         /* pathname or trailing symlink, done */
                         if (!depth) {
-                               nd->dir_uid = nd->inode->i_uid;
+                               nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
                                 nd->dir_mode = nd->inode->i_mode;
                                 nd->flags &= ~LOOKUP_PARENT;
                                 return 0;
@@@ -2196,7 -2275,7 +2275,7 @@@
                 }
                 if (unlikely(!d_can_lookup(nd->path.dentry))) {
                         if (nd->flags & LOOKUP_RCU) {
- -                              if (unlazy_walk(nd))
+ +                              if (!try_to_unlazy(nd))
                                         return -ECHILD;
                         }
                         return -ENOTDIR;
@@@ -2210,10 -2289,6 +2289,10 @@@ static const char *path_init(struct nam
         int error;
         const char *s = nd->name->name;
   
+ +      /* LOOKUP_CACHED requires RCU, ask caller to retry */
+ +      if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
+ +              return ERR_PTR(-EAGAIN);
+ +
         if (!*s)
                 flags &= ~LOOKUP_RCU;
         if (flags & LOOKUP_RCU)
@@@ -2511,7 -2586,7 +2590,7 @@@ static int lookup_one_len_common(const 
                         return err;
         }
   
-       return inode_permission(base->d_inode, MAY_EXEC);
+       return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC);
   }
   
   /**
@@@ -2656,15 -2731,16 +2735,16 @@@ int user_path_at_empty(int dfd, const c
   }
   EXPORT_SYMBOL(user_path_at_empty);
   
- int __check_sticky(struct inode *dir, struct inode *inode)
+ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+                  struct inode *inode)
   {
         kuid_t fsuid = current_fsuid();
   
-       if (uid_eq(inode->i_uid, fsuid))
+       if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
                 return 0;
-       if (uid_eq(dir->i_uid, fsuid))
+       if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
                 return 0;
-       return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
+       return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
   }
   EXPORT_SYMBOL(__check_sticky);
   
@@@ -2688,7 -2764,8 +2768,8 @@@
    * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
    *     nfs_async_unlink().
    */
- static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
+                     struct dentry *victim, bool isdir)
   {
         struct inode *inode = d_backing_inode(victim);
         int error;
@@@ -2700,19 -2777,21 +2781,21 @@@
         BUG_ON(victim->d_parent->d_inode != dir);
   
         /* Inode writeback is not safe when the uid or gid are invalid. */
-       if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+       if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+           !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
                 return -EOVERFLOW;
   
         audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
   
-       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
         if (error)
                 return error;
         if (IS_APPEND(dir))
                 return -EPERM;
   
-       if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-           IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
+       if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) ||
+           IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
+           HAS_UNMAPPED_ID(mnt_userns, inode))
                 return -EPERM;
         if (isdir) {
                 if (!d_is_dir(victim))
@@@ -2737,7 -2816,8 +2820,8 @@@
    *  4. We should have write and exec permissions on dir
    *  5. We can't do it if dir is immutable (done in permission())
    */
- static inline int may_create(struct inode *dir, struct dentry *child)
+ static inline int may_create(struct user_namespace *mnt_userns,
+                            struct inode *dir, struct dentry *child)
   {
         struct user_namespace *s_user_ns;
         audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
@@@ -2746,10 -2826,10 +2830,10 @@@
         if (IS_DEADDIR(dir))
                 return -ENOENT;
         s_user_ns = dir->i_sb->s_user_ns;
-       if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-           !kgid_has_mapping(s_user_ns, current_fsgid()))
+       if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+           !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
                 return -EOVERFLOW;
-       return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
   }
   
   /*
@@@ -2796,10 -2876,26 +2880,26 @@@ void unlock_rename(struct dentry *p1, s
   }
   EXPORT_SYMBOL(unlock_rename);
   
- int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-               bool want_excl)
+ /**
+  * vfs_create - create new file
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dir:      inode of the parent directory
+  * @dentry:   pointer to dentry of the file to create
+  * @mode:     mode of the new file
+  * @want_excl:        whether the file must not yet exist
+  *
+  * Create a new file.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  */
+ int vfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+              struct dentry *dentry, umode_t mode, bool want_excl)
   {
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt_userns, dir, dentry);
         if (error)
                 return error;
   
@@@ -2810,7 -2906,7 +2910,7 @@@
         error = security_inode_create(dir, dentry, mode);
         if (error)
                 return error;
-       error = dir->i_op->create(dir, dentry, mode, want_excl);
+       error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl);
         if (!error)
                 fsnotify_create(dir, dentry);
         return error;
@@@ -2822,7 -2918,7 +2922,7 @@@ int vfs_mkobj(struct dentry *dentry, um
                 void *arg)
   {
         struct inode *dir = dentry->d_parent->d_inode;
-       int error = may_create(dir, dentry);
+       int error = may_create(&init_user_ns, dir, dentry);
         if (error)
                 return error;
   
@@@ -2844,7 -2940,8 +2944,8 @@@ bool may_open_dev(const struct path *pa
                 !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
   }
   
- static int may_open(const struct path *path, int acc_mode, int flag)
+ static int may_open(struct user_namespace *mnt_userns, const struct path *path,
+                   int acc_mode, int flag)
   {
         struct dentry *dentry = path->dentry;
         struct inode *inode = dentry->d_inode;
@@@ -2879,7 -2976,7 +2980,7 @@@
                 break;
         }
   
-       error = inode_permission(inode, MAY_OPEN | acc_mode);
+       error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode);
         if (error)
                 return error;
   
@@@ -2894,13 -2991,13 +2995,13 @@@
         }
   
         /* O_NOATIME can only be set by the owner or superuser */
-       if (flag & O_NOATIME && !inode_owner_or_capable(inode))
+       if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode))
                 return -EPERM;
   
         return 0;
   }
   
- static int handle_truncate(struct file *filp)
+ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
   {
         const struct path *path = &filp->f_path;
         struct inode *inode = path->dentry->d_inode;
@@@ -2914,7 -3011,7 +3015,7 @@@
         if (!error)
                 error = security_path_truncate(path);
         if (!error) {
-               error = do_truncate(path->dentry, 0,
+               error = do_truncate(mnt_userns, path->dentry, 0,
                                     ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
                                     filp);
         }
@@@ -2929,7 -3026,9 +3030,9 @@@ static inline int open_to_namei_flags(i
         return flag;
   }
   
- static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
+ static int may_o_create(struct user_namespace *mnt_userns,
+                       const struct path *dir, struct dentry *dentry,
+                       umode_t mode)
   {
         struct user_namespace *s_user_ns;
         int error = security_path_mknod(dir, dentry, mode, 0);
@@@ -2937,11 -3036,12 +3040,12 @@@
                 return error;
   
         s_user_ns = dir->dentry->d_sb->s_user_ns;
-       if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
-           !kgid_has_mapping(s_user_ns, current_fsgid()))
+       if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) ||
+           !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns)))
                 return -EOVERFLOW;
   
-       error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+       error = inode_permission(mnt_userns, dir->dentry->d_inode,
+                                MAY_WRITE | MAY_EXEC);
         if (error)
                 return error;
   
@@@ -3020,6 -3120,7 +3124,7 @@@ static struct dentry *lookup_open(struc
                                   const struct open_flags *op,
                                   bool got_write)
   {
+       struct user_namespace *mnt_userns;
         struct dentry *dir = nd->path.dentry;
         struct inode *dir_inode = dir->d_inode;
         int open_flag = op->open_flag;
@@@ -3067,13 -3168,15 +3172,15 @@@
          */
         if (unlikely(!got_write))
                 open_flag &= ~O_TRUNC;
+       mnt_userns = mnt_user_ns(nd->path.mnt);
         if (open_flag & O_CREAT) {
                 if (open_flag & O_EXCL)
                         open_flag &= ~O_TRUNC;
                 if (!IS_POSIXACL(dir->d_inode))
                         mode &= ~current_umask();
                 if (likely(got_write))
-                       create_error = may_o_create(&nd->path, dentry, mode);
+                       create_error = may_o_create(mnt_userns, &nd->path,
+                                                   dentry, mode);
                 else
                         create_error = -EROFS;
         }
@@@ -3108,8 -3211,9 +3215,9 @@@
                         error = -EACCES;
                         goto out_dput;
                 }
-               error = dir_inode->i_op->create(dir_inode, dentry, mode,
-                                               open_flag & O_EXCL);
+ 
+               error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
+                                               mode, open_flag & O_EXCL);
                 if (error)
                         goto out_dput;
         }
@@@ -3134,6 -3238,7 +3242,6 @@@ static const char *open_last_lookups(st
         struct inode *inode;
         struct dentry *dentry;
         const char *res;
- -      int error;
   
         nd->flags |= op->intent;
   
@@@ -3157,8 -3262,9 +3265,8 @@@
         } else {
                 /* create side of things */
                 if (nd->flags & LOOKUP_RCU) {
- -                      error = unlazy_walk(nd);
- -                      if (unlikely(error))
- -                              return ERR_PTR(error);
+ +                      if (!try_to_unlazy(nd))
+ +                              return ERR_PTR(-ECHILD);
                 }
                 audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
                 /* trailing slashes? */
@@@ -3167,7 -3273,9 +3275,7 @@@
         }
   
         if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
- -              error = mnt_want_write(nd->path.mnt);
- -              if (!error)
- -                      got_write = true;
+ +              got_write = !mnt_want_write(nd->path.mnt);
                 /*
                  * do _not_ fail yet - we might not need that or fail with
                  * a different error; let lookup_open() decide; we'll be
@@@ -3213,6 -3321,7 +3321,7 @@@ finish_lookup
   static int do_open(struct nameidata *nd,
                    struct file *file, const struct open_flags *op)
   {
+       struct user_namespace *mnt_userns;
         int open_flag = op->open_flag;
         bool do_truncate;
         int acc_mode;
@@@ -3225,12 -3334,13 +3334,13 @@@
         }
         if (!(file->f_mode & FMODE_CREATED))
                 audit_inode(nd->name, nd->path.dentry, 0);
+       mnt_userns = mnt_user_ns(nd->path.mnt);
         if (open_flag & O_CREAT) {
                 if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
                         return -EEXIST;
                 if (d_is_dir(nd->path.dentry))
                         return -EISDIR;
-               error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
+               error = may_create_in_sticky(mnt_userns, nd,
                                              d_backing_inode(nd->path.dentry));
                 if (unlikely(error))
                         return error;
@@@ -3250,13 -3360,13 +3360,13 @@@
                         return error;
                 do_truncate = true;
         }
-       error = may_open(&nd->path, acc_mode, open_flag);
+       error = may_open(mnt_userns, &nd->path, acc_mode, open_flag);
         if (!error && !(file->f_mode & FMODE_OPENED))
                 error = vfs_open(&nd->path, file);
         if (!error)
                 error = ima_file_check(file, op->acc_mode);
         if (!error && do_truncate)
-               error = handle_truncate(file);
+               error = handle_truncate(mnt_userns, file);
         if (unlikely(error > 0)) {
                 WARN_ON(1);
                 error = -EINVAL;
@@@ -3266,7 -3376,23 +3376,23 @@@
         return error;
   }
   
- struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
+ /**
+  * vfs_tmpfile - create tmpfile
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dentry:   pointer to dentry of the base directory
+  * @mode:     mode of the new tmpfile
+  * @open_flag:        flags
+  *
+  * Create a temporary file.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  */
+ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+                          struct dentry *dentry, umode_t mode, int open_flag)
   {
         struct dentry *child = NULL;
         struct inode *dir = dentry->d_inode;
@@@ -3274,7 -3400,7 +3400,7 @@@
         int error;
   
         /* we want directory to be writable */
-       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
         if (error)
                 goto out_err;
         error = -EOPNOTSUPP;
@@@ -3284,7 -3410,7 +3410,7 @@@
         child = d_alloc(dentry, &slash_name);
         if (unlikely(!child))
                 goto out_err;
-       error = dir->i_op->tmpfile(dir, child, mode);
+       error = dir->i_op->tmpfile(mnt_userns, dir, child, mode);
         if (error)
                 goto out_err;
         error = -ENOENT;
@@@ -3296,7 -3422,7 +3422,7 @@@
                 inode->i_state |= I_LINKABLE;
                 spin_unlock(&inode->i_lock);
         }
-       ima_post_create_tmpfile(inode);
+       ima_post_create_tmpfile(mnt_userns, inode);
         return child;
   
   out_err:
@@@ -3309,6 -3435,7 +3435,7 @@@ static int do_tmpfile(struct nameidata 
                 const struct open_flags *op,
                 struct file *file)
   {
+       struct user_namespace *mnt_userns;
         struct dentry *child;
         struct path path;
         int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
@@@ -3317,7 -3444,8 +3444,8 @@@
         error = mnt_want_write(path.mnt);
         if (unlikely(error))
                 goto out;
-       child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
+       mnt_userns = mnt_user_ns(path.mnt);
+       child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag);
         error = PTR_ERR(child);
         if (IS_ERR(child))
                 goto out2;
@@@ -3325,9 -3453,11 +3453,9 @@@
         path.dentry = child;
         audit_inode(nd->name, child, 0);
         /* Don't check for other permissions, the inode was just created */
-       error = may_open(&path, 0, op->open_flag);
+       error = may_open(mnt_userns, &path, 0, op->open_flag);
- -      if (error)
- -              goto out2;
- -      file->f_path.mnt = path.mnt;
- -      error = finish_open(file, child, NULL);
+ +      if (!error)
+ +              error = vfs_open(&path, file);
   out2:
         mnt_drop_write(path.mnt);
   out:
@@@ -3527,10 -3657,27 +3655,27 @@@ inline struct dentry *user_path_create(
   }
   EXPORT_SYMBOL(user_path_create);
   
- int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ /**
+  * vfs_mknod - create device node or file
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dir:      inode of the parent directory
+  * @dentry:   pointer to dentry of the device node or file to create
+  * @mode:     mode of the new device node or file
+  * @dev:      device number of device to create
+  *
+  * Create a device node or file.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  */
+ int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+             struct dentry *dentry, umode_t mode, dev_t dev)
   {
         bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt_userns, dir, dentry);
   
         if (error)
                 return error;
@@@ -3550,7 -3697,7 +3695,7 @@@
         if (error)
                 return error;
   
-       error = dir->i_op->mknod(dir, dentry, mode, dev);
+       error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev);
         if (!error)
                 fsnotify_create(dir, dentry);
         return error;
@@@ -3577,6 -3724,7 +3722,7 @@@ static int may_mknod(umode_t mode
   static long do_mknodat(int dfd, const char __user *filename, umode_t mode,
                 unsigned int dev)
   {
+       struct user_namespace *mnt_userns;
         struct dentry *dentry;
         struct path path;
         int error;
@@@ -3595,18 -3743,22 +3741,22 @@@ retry
         error = security_path_mknod(&path, dentry, mode, dev);
         if (error)
                 goto out;
+ 
+       mnt_userns = mnt_user_ns(path.mnt);
         switch (mode & S_IFMT) {
                 case 0: case S_IFREG:
-                       error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+                       error = vfs_create(mnt_userns, path.dentry->d_inode,
+                                          dentry, mode, true);
                         if (!error)
-                               ima_post_path_mknod(dentry);
+                               ima_post_path_mknod(mnt_userns, dentry);
                         break;
                 case S_IFCHR: case S_IFBLK:
-                       error = vfs_mknod(path.dentry->d_inode,dentry,mode,
-                                       new_decode_dev(dev));
+                       error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+                                         dentry, mode, new_decode_dev(dev));
                         break;
                 case S_IFIFO: case S_IFSOCK:
-                       error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
+                       error = vfs_mknod(mnt_userns, path.dentry->d_inode,
+                                         dentry, mode, 0);
                         break;
         }
   out:
@@@ -3629,9 -3781,25 +3779,25 @@@ SYSCALL_DEFINE3(mknod, const char __use
         return do_mknodat(AT_FDCWD, filename, mode, dev);
   }
   
- int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ /**
+  * vfs_mkdir - create directory
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dir:      inode of the parent directory
+  * @dentry:   pointer to dentry of the directory to create
+  * @mode:     mode of the new directory
+  *
+  * Create a directory.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  */
+ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+             struct dentry *dentry, umode_t mode)
   {
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt_userns, dir, dentry);
         unsigned max_links = dir->i_sb->s_max_links;
   
         if (error)
@@@ -3648,7 -3816,7 +3814,7 @@@
         if (max_links && dir->i_nlink >= max_links)
                 return -EMLINK;
   
-       error = dir->i_op->mkdir(dir, dentry, mode);
+       error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode);
         if (!error)
                 fsnotify_mkdir(dir, dentry);
         return error;
@@@ -3670,8 -3838,12 +3836,12 @@@ retry
         if (!IS_POSIXACL(path.dentry->d_inode))
                 mode &= ~current_umask();
         error = security_path_mkdir(&path, dentry, mode);
-       if (!error)
-               error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+       if (!error) {
+               struct user_namespace *mnt_userns;
+               mnt_userns = mnt_user_ns(path.mnt);
+               error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry,
+                                 mode);
+       }
         done_path_create(&path, dentry);
         if (retry_estale(error, lookup_flags)) {
                 lookup_flags |= LOOKUP_REVAL;
@@@ -3690,9 -3862,24 +3860,24 @@@ SYSCALL_DEFINE2(mkdir, const char __use
         return do_mkdirat(AT_FDCWD, pathname, mode);
   }
   
- int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+ /**
+  * vfs_rmdir - remove directory
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dir:      inode of the parent directory
+  * @dentry:   pointer to dentry of the directory to remove
+  *
+  * Remove a directory.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  */
+ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
+                    struct dentry *dentry)
   {
-       int error = may_delete(dir, dentry, 1);
+       int error = may_delete(mnt_userns, dir, dentry, 1);
   
         if (error)
                 return error;
@@@ -3732,6 -3919,7 +3917,7 @@@ EXPORT_SYMBOL(vfs_rmdir)
   
   long do_rmdir(int dfd, struct filename *name)
   {
+       struct user_namespace *mnt_userns;
         int error = 0;
         struct dentry *dentry;
         struct path path;
@@@ -3772,7 -3960,8 +3958,8 @@@ retry
         error = security_path_rmdir(&path, dentry);
         if (error)
                 goto exit3;
-       error = vfs_rmdir(path.dentry->d_inode, dentry);
+       mnt_userns = mnt_user_ns(path.mnt);
+       error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry);
   exit3:
         dput(dentry);
   exit2:
@@@ -3795,6 -3984,7 +3982,7 @@@ SYSCALL_DEFINE1(rmdir, const char __use
   
   /**
    * vfs_unlink - unlink a filesystem object
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @dir:      parent directory
    * @dentry:   victim
    * @delegated_inode: returns victim inode, if the inode is delegated.
@@@ -3810,11 -4000,18 +3998,18 @@@
    * Alternatively, a caller may pass NULL for delegated_inode.  This may
    * be appropriate for callers that expect the underlying filesystem not
    * to be NFS exported.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
    */
- int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
+              struct dentry *dentry, struct inode **delegated_inode)
   {
         struct inode *target = dentry->d_inode;
-       int error = may_delete(dir, dentry, 0);
+       int error = may_delete(mnt_userns, dir, dentry, 0);
   
         if (error)
                 return error;
@@@ -3885,6 -4082,8 +4080,8 @@@ retry_deleg
         dentry = __lookup_hash(&last, path.dentry, lookup_flags);
         error = PTR_ERR(dentry);
         if (!IS_ERR(dentry)) {
+               struct user_namespace *mnt_userns;
+ 
                 /* Why not before? Because we want correct error value */
                 if (last.name[last.len])
                         goto slashes;
@@@ -3895,7 -4094,9 +4092,9 @@@
                 error = security_path_unlink(&path, dentry);
                 if (error)
                         goto exit2;
-               error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+               mnt_userns = mnt_user_ns(path.mnt);
+               error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry,
+                                  &delegated_inode);
   exit2:
                 dput(dentry);
         }
@@@ -3944,9 -4145,25 +4143,25 @@@ SYSCALL_DEFINE1(unlink, const char __us
         return do_unlinkat(AT_FDCWD, getname(pathname));
   }
   
- int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+ /**
+  * vfs_symlink - create symlink
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dir:      inode of the parent directory
+  * @dentry:   dentry of the child symlink file
+  * @oldname:  name of the file to link to
+  *
+  * Create a symlink.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
+  */
+ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+               struct dentry *dentry, const char *oldname)
   {
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt_userns, dir, dentry);
   
         if (error)
                 return error;
@@@ -3958,7 -4175,7 +4173,7 @@@
         if (error)
                 return error;
   
-       error = dir->i_op->symlink(dir, dentry, oldname);
+       error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname);
         if (!error)
                 fsnotify_create(dir, dentry);
         return error;
@@@ -3984,8 -4201,13 +4199,13 @@@ retry
                 goto out_putname;
   
         error = security_path_symlink(&path, dentry, from->name);
-       if (!error)
-               error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+       if (!error) {
+               struct user_namespace *mnt_userns;
+ 
+               mnt_userns = mnt_user_ns(path.mnt);
+               error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry,
+                                   from->name);
+       }
         done_path_create(&path, dentry);
         if (retry_estale(error, lookup_flags)) {
                 lookup_flags |= LOOKUP_REVAL;
@@@ -4010,6 -4232,7 +4230,7 @@@ SYSCALL_DEFINE2(symlink, const char __u
   /**
    * vfs_link - create a new link
    * @old_dentry:       object to be linked
+  * @mnt_userns:       the user namespace of the mount
    * @dir:      new parent
    * @new_dentry:       where to create the new link
    * @delegated_inode: returns inode needing a delegation break
@@@ -4025,8 -4248,16 +4246,16 @@@
    * Alternatively, a caller may pass NULL for delegated_inode.  This may
    * be appropriate for callers that expect the underlying filesystem not
    * to be NFS exported.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then take
+  * care to map the inode according to @mnt_userns before checking permissions.
+  * On non-idmapped mounts or if permission checking is to be performed on the
+  * raw inode simply pass init_user_ns.
    */
- int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+ int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns,
+            struct inode *dir, struct dentry *new_dentry,
+            struct inode **delegated_inode)
   {
         struct inode *inode = old_dentry->d_inode;
         unsigned max_links = dir->i_sb->s_max_links;
@@@ -4035,7 -4266,7 +4264,7 @@@
         if (!inode)
                 return -ENOENT;
   
-       error = may_create(dir, new_dentry);
+       error = may_create(mnt_userns, dir, new_dentry);
         if (error)
                 return error;
   
@@@ -4052,7 -4283,7 +4281,7 @@@
          * be written back improperly if their true value is unknown to
          * the vfs.
          */
-       if (HAS_UNMAPPED_ID(inode))
+       if (HAS_UNMAPPED_ID(mnt_userns, inode))
                 return -EPERM;
         if (!dir->i_op->link)
                 return -EPERM;
@@@ -4099,6 -4330,7 +4328,7 @@@ EXPORT_SYMBOL(vfs_link)
   static int do_linkat(int olddfd, const char __user *oldname, int newdfd,
               const char __user *newname, int flags)
   {
+       struct user_namespace *mnt_userns;
         struct dentry *new_dentry;
         struct path old_path, new_path;
         struct inode *delegated_inode = NULL;
@@@ -4134,13 -4366,15 +4364,15 @@@ retry
         error = -EXDEV;
         if (old_path.mnt != new_path.mnt)
                 goto out_dput;
-       error = may_linkat(&old_path);
+       mnt_userns = mnt_user_ns(new_path.mnt);
+       error = may_linkat(mnt_userns, &old_path);
         if (unlikely(error))
                 goto out_dput;
         error = security_path_link(old_path.dentry, &new_path, new_dentry);
         if (error)
                 goto out_dput;
-       error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+       error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode,
+                        new_dentry, &delegated_inode);
   out_dput:
         done_path_create(&new_path, new_dentry);
         if (delegated_inode) {
@@@ -4174,12 -4408,14 +4406,14 @@@ SYSCALL_DEFINE2(link, const char __use
   
   /**
    * vfs_rename - rename a filesystem object
-  * @old_dir:  parent of source
-  * @old_dentry:       source
-  * @new_dir:  parent of destination
-  * @new_dentry:       destination
-  * @delegated_inode: returns an inode needing a delegation break
-  * @flags:    rename flags
+  * @old_mnt_userns:   old user namespace of the mount the inode was found from
+  * @old_dir:          parent of source
+  * @old_dentry:               source
+  * @new_mnt_userns:   new user namespace of the mount the inode was found from
+  * @new_dir:          parent of destination
+  * @new_dentry:               destination
+  * @delegated_inode:  returns an inode needing a delegation break
+  * @flags:            rename flags
    *
    * The caller must hold multiple mutexes--see lock_rename().
    *
@@@ -4222,11 -4458,14 +4456,14 @@@
    *       ->i_mutex on parents, which works but leads to some truly excessive
    *       locking].
    */
- int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-              struct inode *new_dir, struct dentry *new_dentry,
-              struct inode **delegated_inode, unsigned int flags)
+ int vfs_rename(struct renamedata *rd)
   {
         int error;
+       struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+       struct dentry *old_dentry = rd->old_dentry;
+       struct dentry *new_dentry = rd->new_dentry;
+       struct inode **delegated_inode = rd->delegated_inode;
+       unsigned int flags = rd->flags;
         bool is_dir = d_is_dir(old_dentry);
         struct inode *source = old_dentry->d_inode;
         struct inode *target = new_dentry->d_inode;
@@@ -4237,19 -4476,21 +4474,21 @@@
         if (source == target)
                 return 0;
   
-       error = may_delete(old_dir, old_dentry, is_dir);
+       error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir);
         if (error)
                 return error;
   
         if (!target) {
-               error = may_create(new_dir, new_dentry);
+               error = may_create(rd->new_mnt_userns, new_dir, new_dentry);
         } else {
                 new_is_dir = d_is_dir(new_dentry);
   
                 if (!(flags & RENAME_EXCHANGE))
-                       error = may_delete(new_dir, new_dentry, is_dir);
+                       error = may_delete(rd->new_mnt_userns, new_dir,
+                                          new_dentry, is_dir);
                 else
-                       error = may_delete(new_dir, new_dentry, new_is_dir);
+                       error = may_delete(rd->new_mnt_userns, new_dir,
+                                          new_dentry, new_is_dir);
         }
         if (error)
                 return error;
@@@ -4263,12 -4504,14 +4502,14 @@@
          */
         if (new_dir != old_dir) {
                 if (is_dir) {
-                       error = inode_permission(source, MAY_WRITE);
+                       error = inode_permission(rd->old_mnt_userns, source,
+                                                MAY_WRITE);
                         if (error)
                                 return error;
                 }
                 if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-                       error = inode_permission(target, MAY_WRITE);
+                       error = inode_permission(rd->new_mnt_userns, target,
+                                                MAY_WRITE);
                         if (error)
                                 return error;
                 }
@@@ -4308,8 -4551,8 +4549,8 @@@
                 if (error)
                         goto out;
         }
-       error = old_dir->i_op->rename(old_dir, old_dentry,
-                                      new_dir, new_dentry, flags);
+       error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry,
+                                     new_dir, new_dentry, flags);
         if (error)
                 goto out;
   
@@@ -4350,6 -4593,7 +4591,7 @@@ EXPORT_SYMBOL(vfs_rename)
   int do_renameat2(int olddfd, struct filename *from, int newdfd,
                  struct filename *to, unsigned int flags)
   {
+       struct renamedata rd;
         struct dentry *old_dentry, *new_dentry;
         struct dentry *trap;
         struct path old_path, new_path;
@@@ -4453,9 -4697,16 +4695,16 @@@ retry_deleg
                                      &new_path, new_dentry, flags);
         if (error)
                 goto exit5;
-       error = vfs_rename(old_path.dentry->d_inode, old_dentry,
-                          new_path.dentry->d_inode, new_dentry,
-                          &delegated_inode, flags);
+ 
+       rd.old_dir         = old_path.dentry->d_inode;
+       rd.old_dentry      = old_dentry;
+       rd.old_mnt_userns  = mnt_user_ns(old_path.mnt);
+       rd.new_dir         = new_path.dentry->d_inode;
+       rd.new_dentry      = new_dentry;
+       rd.new_mnt_userns  = mnt_user_ns(new_path.mnt);
+       rd.delegated_inode = &delegated_inode;
+       rd.flags           = flags;
+       error = vfs_rename(&rd);
   exit5:
         dput(new_dentry);
   exit4:
diff --combined fs/nfsd/export.c

index 7c863f2,e456421..9421dae
--- 1/fs/nfsd/export.c
--- 2/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@@ -331,29 -331,12 +331,29 @@@ static void nfsd4_fslocs_free(struct nf
         fsloc->locations = NULL;
   }
   
+ +static int export_stats_init(struct export_stats *stats)
+ +{
+ +      stats->start_time = ktime_get_seconds();
+ +      return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM);
+ +}
+ +
+ +static void export_stats_reset(struct export_stats *stats)
+ +{
+ +      nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
+ +}
+ +
+ +static void export_stats_destroy(struct export_stats *stats)
+ +{
+ +      nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
+ +}
+ +
   static void svc_export_put(struct kref *ref)
   {
         struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
         path_put(&exp->ex_path);
         auth_domain_put(exp->ex_client);
         nfsd4_fslocs_free(&exp->ex_fslocs);
+ +      export_stats_destroy(&exp->ex_stats);
         kfree(exp->ex_uuid);
         kfree_rcu(exp, ex_rcu);
   }
@@@ -386,8 -369,9 +386,9 @@@ static struct svc_export *svc_export_up
                                             struct svc_export *old);
   static struct svc_export *svc_export_lookup(struct svc_export *);
   
- static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+ static int check_export(struct path *path, int *flags, unsigned char *uuid)
   {
+       struct inode *inode = d_inode(path->dentry);
   
         /*
          * We currently export only dirs, regular files, and (for v4
@@@ -411,6 -395,7 +412,7 @@@
          *       or an FSID number (so NFSEXP_FSID or ->uuid is needed).
          * 2:  We must be able to find an inode from a filehandle.
          *       This means that s_export_op must be set.
+        * 3: We must not currently be on an idmapped mount.
          */
         if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
             !(*flags & NFSEXP_FSID) &&
@@@ -425,6 -410,11 +427,11 @@@
                 return -EINVAL;
         }
   
+       if (mnt_user_ns(path->mnt) != &init_user_ns) {
+               dprintk("exp_export: export of idmapped mounts not yet supported.\n");
+               return -EINVAL;
+       }
+ 
         if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK &&
             !(*flags & NFSEXP_NOSUBTREECHECK)) {
                 dprintk("%s: %s does not support subtree checking!\n",
@@@ -653,8 -643,7 +660,7 @@@ static int svc_export_parse(struct cach
                                 goto out4;
                 }
   
-               err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags,
-                                  exp.ex_uuid);
+               err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid);
                 if (err)
                         goto out4;
                 /*
@@@ -709,47 -698,22 +715,47 @@@ static void exp_flags(struct seq_file *
                 kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
   static void show_secinfo(struct seq_file *m, struct svc_export *exp);
   
+ +static int is_export_stats_file(struct seq_file *m)
+ +{
+ +      /*
+ +       * The export_stats file uses the same ops as the exports file.
+ +       * We use the file's name to determine the reported info per export.
+ +       * There is no rename in nfsdfs, so d_name.name is stable.
+ +       */
+ +      return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats");
+ +}
+ +
   static int svc_export_show(struct seq_file *m,
                            struct cache_detail *cd,
                            struct cache_head *h)
   {
- -      struct svc_export *exp ;
+ +      struct svc_export *exp;
+ +      bool export_stats = is_export_stats_file(m);
   
- -      if (h ==NULL) {
- -              seq_puts(m, "#path domain(flags)\n");
+ +      if (h == NULL) {
+ +              if (export_stats)
+ +                      seq_puts(m, "#path domain start-time\n#\tstats\n");
+ +              else
+ +                      seq_puts(m, "#path domain(flags)\n");
                 return 0;
         }
         exp = container_of(h, struct svc_export, h);
         seq_path(m, &exp->ex_path, " \t\n\\");
         seq_putc(m, '\t');
         seq_escape(m, exp->ex_client->name, " \t\n\\");
+ +      if (export_stats) {
+ +              seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
+ +              seq_printf(m, "\tfh_stale: %lld\n",
+ +                         percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
+ +              seq_printf(m, "\tio_read: %lld\n",
+ +                         percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
+ +              seq_printf(m, "\tio_write: %lld\n",
+ +                         percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
+ +              seq_putc(m, '\n');
+ +              return 0;
+ +      }
         seq_putc(m, '(');
- -      if (test_bit(CACHE_VALID, &h->flags) && 
+ +      if (test_bit(CACHE_VALID, &h->flags) &&
             !test_bit(CACHE_NEGATIVE, &h->flags)) {
                 exp_flags(m, exp->ex_flags, exp->ex_fsid,
                           exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
@@@ -790,7 -754,6 +796,7 @@@ static void svc_export_init(struct cach
         new->ex_layout_types = 0;
         new->ex_uuid = NULL;
         new->cd = item->cd;
+ +      export_stats_reset(&new->ex_stats);
   }
   
   static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@@ -823,15 -786,10 +829,15 @@@
   static struct cache_head *svc_export_alloc(void)
   {
         struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL);
- -      if (i)
- -              return &i->h;
- -      else
+ +      if (!i)
+ +              return NULL;
+ +
+ +      if (export_stats_init(&i->ex_stats)) {
+ +              kfree(i);
                 return NULL;
+ +      }
+ +
+ +      return &i->h;
   }
   
   static const struct cache_detail svc_export_cache_template = {
@@@ -1293,14 -1251,10 +1299,14 @@@ static int e_show(struct seq_file *m, v
         struct cache_head *cp = p;
         struct svc_export *exp = container_of(cp, struct svc_export, h);
         struct cache_detail *cd = m->private;
+ +      bool export_stats = is_export_stats_file(m);
   
         if (p == SEQ_START_TOKEN) {
                 seq_puts(m, "# Version 1.1\n");
- -              seq_puts(m, "# Path Client(Flags) # IPs\n");
+ +              if (export_stats)
+ +                      seq_puts(m, "# Path Client Start-time\n#\tStats\n");
+ +              else
+ +                      seq_puts(m, "# Path Client(Flags) # IPs\n");
                 return 0;
         }
   
diff --combined fs/nfsd/nfs2acl.c

index 7eeac5b,b83f222..855e177
--- 1/fs/nfsd/nfs2acl.c
--- 2/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@@ -113,10 -113,12 +113,12 @@@ static __be32 nfsacld_proc_setacl(struc
   
         fh_lock(fh);
   
-       error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+       error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+                             argp->acl_access);
         if (error)
                 goto out_drop_lock;
-       error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+       error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+                             argp->acl_default);
         if (error)
                 goto out_drop_lock;
   
@@@ -188,49 -190,63 +190,49 @@@ out
   
   static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
   {
+ +      struct xdr_stream *xdr = &rqstp->rq_arg_stream;
         struct nfsd3_getaclargs *argp = rqstp->rq_argp;
   
- -      p = nfs2svc_decode_fh(p, &argp->fh);
- -      if (!p)
+ +      if (!svcxdr_decode_fhandle(xdr, &argp->fh))
+ +              return 0;
+ +      if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
                 return 0;
- -      argp->mask = ntohl(*p); p++;
   
- -      return xdr_argsize_check(rqstp, p);
+ +      return 1;
   }
   
- -
   static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
   {
+ +      struct xdr_stream *xdr = &rqstp->rq_arg_stream;
         struct nfsd3_setaclargs *argp = rqstp->rq_argp;
- -      struct kvec *head = rqstp->rq_arg.head;
- -      unsigned int base;
- -      int n;
   
- -      p = nfs2svc_decode_fh(p, &argp->fh);
- -      if (!p)
+ +      if (!svcxdr_decode_fhandle(xdr, &argp->fh))
                 return 0;
- -      argp->mask = ntohl(*p++);
- -      if (argp->mask & ~NFS_ACL_MASK ||
- -          !xdr_argsize_check(rqstp, p))
+ +      if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
                 return 0;
- -
- -      base = (char *)p - (char *)head->iov_base;
- -      n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
- -                        (argp->mask & NFS_ACL) ?
- -                        &argp->acl_access : NULL);
- -      if (n > 0)
- -              n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
- -                                (argp->mask & NFS_DFACL) ?
- -                                &argp->acl_default : NULL);
- -      return (n > 0);
- -}
- -
- -static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
- -{
- -      struct nfsd_fhandle *argp = rqstp->rq_argp;
- -
- -      p = nfs2svc_decode_fh(p, &argp->fh);
- -      if (!p)
+ +      if (argp->mask & ~NFS_ACL_MASK)
+ +              return 0;
+ +      if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
+ +                                 &argp->acl_access : NULL))
                 return 0;
- -      return xdr_argsize_check(rqstp, p);
+ +      if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
+ +                                 &argp->acl_default : NULL))
+ +              return 0;
+ +
+ +      return 1;
   }
   
   static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
   {
- -      struct nfsd3_accessargs *argp = rqstp->rq_argp;
+ +      struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ +      struct nfsd3_accessargs *args = rqstp->rq_argp;
   
- -      p = nfs2svc_decode_fh(p, &argp->fh);
- -      if (!p)
+ +      if (!svcxdr_decode_fhandle(xdr, &args->fh))
+ +              return 0;
+ +      if (xdr_stream_decode_u32(xdr, &args->access) < 0)
                 return 0;
- -      argp->access = ntohl(*p++);
   
- -      return xdr_argsize_check(rqstp, p);
+ +      return 1;
   }
   
   /*
@@@ -357,7 -373,6 +359,7 @@@ static const struct svc_procedure nfsd_
                 .pc_ressize = sizeof(struct nfsd_voidres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST,
+ +              .pc_name = "NULL",
         },
         [ACLPROC2_GETACL] = {
                 .pc_func = nfsacld_proc_getacl,
@@@ -368,7 -383,6 +370,7 @@@
                 .pc_ressize = sizeof(struct nfsd3_getaclres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+1+2*(1+ACL),
+ +              .pc_name = "GETACL",
         },
         [ACLPROC2_SETACL] = {
                 .pc_func = nfsacld_proc_setacl,
@@@ -379,18 -393,16 +381,18 @@@
                 .pc_ressize = sizeof(struct nfsd_attrstat),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+AT,
+ +              .pc_name = "SETACL",
         },
         [ACLPROC2_GETATTR] = {
                 .pc_func = nfsacld_proc_getattr,
- -              .pc_decode = nfsaclsvc_decode_fhandleargs,
+ +              .pc_decode = nfssvc_decode_fhandleargs,
                 .pc_encode = nfsaclsvc_encode_attrstatres,
                 .pc_release = nfsaclsvc_release_attrstat,
                 .pc_argsize = sizeof(struct nfsd_fhandle),
                 .pc_ressize = sizeof(struct nfsd_attrstat),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+AT,
+ +              .pc_name = "GETATTR",
         },
         [ACLPROC2_ACCESS] = {
                 .pc_func = nfsacld_proc_access,
@@@ -401,7 -413,6 +403,7 @@@
                 .pc_ressize = sizeof(struct nfsd3_accessres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+AT+1,
+ +              .pc_name = "SETATTR",
         },
   };
   
diff --combined fs/nfsd/nfs3acl.c

index a568b84,f18ec7e..9a6f18d
--- 1/fs/nfsd/nfs3acl.c
--- 2/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@@ -103,10 -103,12 +103,12 @@@ static __be32 nfsd3_proc_setacl(struct 
   
         fh_lock(fh);
   
-       error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+       error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+                             argp->acl_access);
         if (error)
                 goto out_drop_lock;
-       error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+       error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+                             argp->acl_default);
   
   out_drop_lock:
         fh_unlock(fh);
@@@ -124,39 -126,43 +126,39 @@@ out
   /*
    * XDR decode functions
    */
+ +
   static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
   {
+ +      struct xdr_stream *xdr = &rqstp->rq_arg_stream;
         struct nfsd3_getaclargs *args = rqstp->rq_argp;
   
- -      p = nfs3svc_decode_fh(p, &args->fh);
- -      if (!p)
+ +      if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
+ +              return 0;
+ +      if (xdr_stream_decode_u32(xdr, &args->mask) < 0)
                 return 0;
- -      args->mask = ntohl(*p); p++;
   
- -      return xdr_argsize_check(rqstp, p);
+ +      return 1;
   }
   
- -
   static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
   {
- -      struct nfsd3_setaclargs *args = rqstp->rq_argp;
- -      struct kvec *head = rqstp->rq_arg.head;
- -      unsigned int base;
- -      int n;
+ +      struct xdr_stream *xdr = &rqstp->rq_arg_stream;
+ +      struct nfsd3_setaclargs *argp = rqstp->rq_argp;
   
- -      p = nfs3svc_decode_fh(p, &args->fh);
- -      if (!p)
+ +      if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh))
                 return 0;
- -      args->mask = ntohl(*p++);
- -      if (args->mask & ~NFS_ACL_MASK ||
- -          !xdr_argsize_check(rqstp, p))
+ +      if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
+ +              return 0;
+ +      if (argp->mask & ~NFS_ACL_MASK)
+ +              return 0;
+ +      if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
+ +                                 &argp->acl_access : NULL))
+ +              return 0;
+ +      if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
+ +                                 &argp->acl_default : NULL))
                 return 0;
   
- -      base = (char *)p - (char *)head->iov_base;
- -      n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
- -                        (args->mask & NFS_ACL) ?
- -                        &args->acl_access : NULL);
- -      if (n > 0)
- -              n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
- -                                (args->mask & NFS_DFACL) ?
- -                                &args->acl_default : NULL);
- -      return (n > 0);
+ +      return 1;
   }
   
   /*
@@@ -247,7 -253,6 +249,7 @@@ static const struct svc_procedure nfsd_
                 .pc_ressize = sizeof(struct nfsd_voidres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST,
+ +              .pc_name = "NULL",
         },
         [ACLPROC3_GETACL] = {
                 .pc_func = nfsd3_proc_getacl,
@@@ -258,7 -263,6 +260,7 @@@
                 .pc_ressize = sizeof(struct nfsd3_getaclres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+1+2*(1+ACL),
+ +              .pc_name = "GETACL",
         },
         [ACLPROC3_SETACL] = {
                 .pc_func = nfsd3_proc_setacl,
@@@ -269,7 -273,6 +271,7 @@@
                 .pc_ressize = sizeof(struct nfsd3_attrstat),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+pAT,
+ +              .pc_name = "SETACL",
         },
   };
   
diff --combined fs/nfsd/nfsfh.c

index 4744a27,8d90796..10b4442
--- 1/fs/nfsd/nfsfh.c
--- 2/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@@ -40,7 -40,8 +40,8 @@@ static int nfsd_acceptable(void *expv, 
                 /* make sure parents give x permission to user */
                 int err;
                 parent = dget_parent(tdentry);
-               err = inode_permission(d_inode(parent), MAY_EXEC);
+               err = inode_permission(&init_user_ns,
+                                      d_inode(parent), MAY_EXEC);
                 if (err < 0) {
                         dput(parent);
                         break;
@@@ -349,7 -350,7 +350,7 @@@ out
   __be32
   fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
   {
- -      struct svc_export *exp;
+ +      struct svc_export *exp = NULL;
         struct dentry   *dentry;
         __be32          error;
   
@@@ -422,7 -423,7 +423,7 @@@ skip_pseudoflavor_check
         }
   out:
         if (error == nfserr_stale)
- -              nfsdstats.fh_stale++;
+ +              nfsd_stats_fh_stale_inc(exp);
         return error;
   }
   
diff --combined fs/nfsd/nfsproc.c

index b2f8035,0ea0554..a8d5449
--- 1/fs/nfsd/nfsproc.c
--- 2/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@@ -90,7 -90,7 +90,7 @@@ nfsd_proc_setattr(struct svc_rqst *rqst
                 if (delta < 0)
                         delta = -delta;
                 if (delta < MAX_TOUCH_TIME_ERROR &&
-                   setattr_prepare(fhp->fh_dentry, iap) != 0) {
+                   setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) {
                         /*
                          * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
                          * This will cause notify_change to set these times
@@@ -149,15 -149,14 +149,15 @@@ out
   static __be32
   nfsd_proc_readlink(struct svc_rqst *rqstp)
   {
- -      struct nfsd_readlinkargs *argp = rqstp->rq_argp;
+ +      struct nfsd_fhandle *argp = rqstp->rq_argp;
         struct nfsd_readlinkres *resp = rqstp->rq_resp;
+ +      char *buffer = page_address(*(rqstp->rq_next_page++));
   
         dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
   
         /* Read the symlink. */
         resp->len = NFS_MAXPATHLEN;
- -      resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
+ +      resp->status = nfsd_readlink(rqstp, &argp->fh, buffer, &resp->len);
   
         fh_put(&argp->fh);
         return rpc_success;
@@@ -172,36 -171,32 +172,36 @@@ nfsd_proc_read(struct svc_rqst *rqstp
   {
         struct nfsd_readargs *argp = rqstp->rq_argp;
         struct nfsd_readres *resp = rqstp->rq_resp;
+ +      unsigned int len;
         u32 eof;
+ +      int v;
   
         dprintk("nfsd: READ    %s %d bytes at %d\n",
                 SVCFH_fmt(&argp->fh),
                 argp->count, argp->offset);
   
+ +      argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+ +
+ +      v = 0;
+ +      len = argp->count;
+ +      while (len > 0) {
+ +              struct page *page = *(rqstp->rq_next_page++);
+ +
+ +              rqstp->rq_vec[v].iov_base = page_address(page);
+ +              rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+ +              len -= rqstp->rq_vec[v].iov_len;
+ +              v++;
+ +      }
+ +
         /* Obtain buffer pointer for payload. 19 is 1 word for
          * status, 17 words for fattr, and 1 word for the byte count.
          */
- -
- -      if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
- -              char buf[RPC_MAX_ADDRBUFLEN];
- -              printk(KERN_NOTICE
- -                      "oversized read request from %s (%d bytes)\n",
- -                              svc_print_addr(rqstp, buf, sizeof(buf)),
- -                              argp->count);
- -              argp->count = NFSSVC_MAXBLKSIZE_V2;
- -      }
         svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
   
         resp->count = argp->count;
- -      resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
- -                               argp->offset,
- -                               rqstp->rq_vec, argp->vlen,
- -                               &resp->count,
- -                               &eof);
+ +      fh_copy(&resp->fh, &argp->fh);
+ +      resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
+ +                               rqstp->rq_vec, v, &resp->count, &eof);
         if (resp->status == nfs_ok)
                 resp->status = fh_getattr(&resp->fh, &resp->stat);
         else if (resp->status == nfserr_jukebox)
@@@ -553,20 -548,6 +553,20 @@@ nfsd_proc_rmdir(struct svc_rqst *rqstp
         return rpc_success;
   }
   
+ +static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
+ +                                  struct nfsd_readdirres *resp,
+ +                                  int count)
+ +{
+ +      count = min_t(u32, count, PAGE_SIZE);
+ +
+ +      /* Convert byte count to number of words (i.e. >> 2),
+ +       * and reserve room for the NULL ptr & eof flag (-2 words) */
+ +      resp->buflen = (count >> 2) - 2;
+ +
+ +      resp->buffer = page_address(*rqstp->rq_next_page);
+ +      rqstp->rq_next_page++;
+ +}
+ +
   /*
    * Read a portion of a directory.
    */
@@@ -575,24 -556,31 +575,24 @@@ nfsd_proc_readdir(struct svc_rqst *rqst
   {
         struct nfsd_readdirargs *argp = rqstp->rq_argp;
         struct nfsd_readdirres *resp = rqstp->rq_resp;
- -      int             count;
         loff_t          offset;
+ +      __be32          *buffer;
   
         dprintk("nfsd: READDIR  %s %d bytes at %d\n",
                 SVCFH_fmt(&argp->fh),           
                 argp->count, argp->cookie);
   
- -      /* Shrink to the client read size */
- -      count = (argp->count >> 2) - 2;
- -
- -      /* Make sure we've room for the NULL ptr & eof flag */
- -      count -= 2;
- -      if (count < 0)
- -              count = 0;
+ +      nfsd_init_dirlist_pages(rqstp, resp, argp->count);
+ +      buffer = resp->buffer;
   
- -      resp->buffer = argp->buffer;
         resp->offset = NULL;
- -      resp->buflen = count;
         resp->common.err = nfs_ok;
         /* Read directory and encode entries on the fly */
         offset = argp->cookie;
         resp->status = nfsd_readdir(rqstp, &argp->fh, &offset,
                                     &resp->common, nfssvc_encode_entry);
   
- -      resp->count = resp->buffer - argp->buffer;
+ +      resp->count = resp->buffer - buffer;
         if (resp->offset)
                 *resp->offset = htonl(offset);
   
@@@ -635,18 -623,16 +635,18 @@@ static const struct svc_procedure nfsd_
                 .pc_ressize = sizeof(struct nfsd_voidres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = 0,
+ +              .pc_name = "NULL",
         },
         [NFSPROC_GETATTR] = {
                 .pc_func = nfsd_proc_getattr,
- -              .pc_decode = nfssvc_decode_fhandle,
+ +              .pc_decode = nfssvc_decode_fhandleargs,
                 .pc_encode = nfssvc_encode_attrstat,
                 .pc_release = nfssvc_release_attrstat,
                 .pc_argsize = sizeof(struct nfsd_fhandle),
                 .pc_ressize = sizeof(struct nfsd_attrstat),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+AT,
+ +              .pc_name = "GETATTR",
         },
         [NFSPROC_SETATTR] = {
                 .pc_func = nfsd_proc_setattr,
@@@ -657,7 -643,6 +657,7 @@@
                 .pc_ressize = sizeof(struct nfsd_attrstat),
                 .pc_cachetype = RC_REPLBUFF,
                 .pc_xdrressize = ST+AT,
+ +              .pc_name = "SETATTR",
         },
         [NFSPROC_ROOT] = {
                 .pc_func = nfsd_proc_root,
@@@ -667,7 -652,6 +667,7 @@@
                 .pc_ressize = sizeof(struct nfsd_voidres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = 0,
+ +              .pc_name = "ROOT",
         },
         [NFSPROC_LOOKUP] = {
                 .pc_func = nfsd_proc_lookup,
@@@ -678,17 -662,15 +678,17 @@@
                 .pc_ressize = sizeof(struct nfsd_diropres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+FH+AT,
+ +              .pc_name = "LOOKUP",
         },
         [NFSPROC_READLINK] = {
                 .pc_func = nfsd_proc_readlink,
- -              .pc_decode = nfssvc_decode_readlinkargs,
+ +              .pc_decode = nfssvc_decode_fhandleargs,
                 .pc_encode = nfssvc_encode_readlinkres,
- -              .pc_argsize = sizeof(struct nfsd_readlinkargs),
+ +              .pc_argsize = sizeof(struct nfsd_fhandle),
                 .pc_ressize = sizeof(struct nfsd_readlinkres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
+ +              .pc_name = "READLINK",
         },
         [NFSPROC_READ] = {
                 .pc_func = nfsd_proc_read,
@@@ -699,7 -681,6 +699,7 @@@
                 .pc_ressize = sizeof(struct nfsd_readres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+ +              .pc_name = "READ",
         },
         [NFSPROC_WRITECACHE] = {
                 .pc_func = nfsd_proc_writecache,
@@@ -709,7 -690,6 +709,7 @@@
                 .pc_ressize = sizeof(struct nfsd_voidres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = 0,
+ +              .pc_name = "WRITECACHE",
         },
         [NFSPROC_WRITE] = {
                 .pc_func = nfsd_proc_write,
@@@ -720,7 -700,6 +720,7 @@@
                 .pc_ressize = sizeof(struct nfsd_attrstat),
                 .pc_cachetype = RC_REPLBUFF,
                 .pc_xdrressize = ST+AT,
+ +              .pc_name = "WRITE",
         },
         [NFSPROC_CREATE] = {
                 .pc_func = nfsd_proc_create,
@@@ -731,7 -710,6 +731,7 @@@
                 .pc_ressize = sizeof(struct nfsd_diropres),
                 .pc_cachetype = RC_REPLBUFF,
                 .pc_xdrressize = ST+FH+AT,
+ +              .pc_name = "CREATE",
         },
         [NFSPROC_REMOVE] = {
                 .pc_func = nfsd_proc_remove,
@@@ -741,7 -719,6 +741,7 @@@
                 .pc_ressize = sizeof(struct nfsd_stat),
                 .pc_cachetype = RC_REPLSTAT,
                 .pc_xdrressize = ST,
+ +              .pc_name = "REMOVE",
         },
         [NFSPROC_RENAME] = {
                 .pc_func = nfsd_proc_rename,
@@@ -751,7 -728,6 +751,7 @@@
                 .pc_ressize = sizeof(struct nfsd_stat),
                 .pc_cachetype = RC_REPLSTAT,
                 .pc_xdrressize = ST,
+ +              .pc_name = "RENAME",
         },
         [NFSPROC_LINK] = {
                 .pc_func = nfsd_proc_link,
@@@ -761,7 -737,6 +761,7 @@@
                 .pc_ressize = sizeof(struct nfsd_stat),
                 .pc_cachetype = RC_REPLSTAT,
                 .pc_xdrressize = ST,
+ +              .pc_name = "LINK",
         },
         [NFSPROC_SYMLINK] = {
                 .pc_func = nfsd_proc_symlink,
@@@ -771,7 -746,6 +771,7 @@@
                 .pc_ressize = sizeof(struct nfsd_stat),
                 .pc_cachetype = RC_REPLSTAT,
                 .pc_xdrressize = ST,
+ +              .pc_name = "SYMLINK",
         },
         [NFSPROC_MKDIR] = {
                 .pc_func = nfsd_proc_mkdir,
@@@ -782,7 -756,6 +782,7 @@@
                 .pc_ressize = sizeof(struct nfsd_diropres),
                 .pc_cachetype = RC_REPLBUFF,
                 .pc_xdrressize = ST+FH+AT,
+ +              .pc_name = "MKDIR",
         },
         [NFSPROC_RMDIR] = {
                 .pc_func = nfsd_proc_rmdir,
@@@ -792,7 -765,6 +792,7 @@@
                 .pc_ressize = sizeof(struct nfsd_stat),
                 .pc_cachetype = RC_REPLSTAT,
                 .pc_xdrressize = ST,
+ +              .pc_name = "RMDIR",
         },
         [NFSPROC_READDIR] = {
                 .pc_func = nfsd_proc_readdir,
@@@ -801,17 -773,15 +801,17 @@@
                 .pc_argsize = sizeof(struct nfsd_readdirargs),
                 .pc_ressize = sizeof(struct nfsd_readdirres),
                 .pc_cachetype = RC_NOCACHE,
+ +              .pc_name = "READDIR",
         },
         [NFSPROC_STATFS] = {
                 .pc_func = nfsd_proc_statfs,
- -              .pc_decode = nfssvc_decode_fhandle,
+ +              .pc_decode = nfssvc_decode_fhandleargs,
                 .pc_encode = nfssvc_encode_statfsres,
                 .pc_argsize = sizeof(struct nfsd_fhandle),
                 .pc_ressize = sizeof(struct nfsd_statfsres),
                 .pc_cachetype = RC_NOCACHE,
                 .pc_xdrressize = ST+5,
+ +              .pc_name = "STATFS",
         },
   };
   
diff --combined fs/nfsd/vfs.c

index d316e11,fab8731..fd6be35
--- 1/fs/nfsd/vfs.c
--- 2/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@@ -448,7 -448,7 +448,7 @@@ nfsd_setattr(struct svc_rqst *rqstp, st
                         .ia_size        = iap->ia_size,
                 };
   
-               host_err = notify_change(dentry, &size_attr, NULL);
+               host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
                 if (host_err)
                         goto out_unlock;
                 iap->ia_valid &= ~ATTR_SIZE;
@@@ -463,7 -463,7 +463,7 @@@
         }
   
         iap->ia_valid |= ATTR_CTIME;
-       host_err = notify_change(dentry, iap, NULL);
+       host_err = notify_change(&init_user_ns, dentry, iap, NULL);
   
   out_unlock:
         fh_unlock(fhp);
@@@ -499,7 -499,8 +499,8 @@@ int nfsd4_is_junction(struct dentry *de
                 return 0;
         if (!(inode->i_mode & S_ISVTX))
                 return 0;
-       if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
+       if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME,
+                        NULL, 0) <= 0)
                 return 0;
         return 1;
   }
@@@ -889,7 -890,7 +890,7 @@@ static __be32 nfsd_finish_read(struct s
                                unsigned long *count, u32 *eof, ssize_t host_err)
   {
         if (host_err >= 0) {
- -              nfsdstats.io_read += host_err;
+ +              nfsd_stats_io_read_add(fhp->fh_export, host_err);
                 *eof = nfsd_eof_on_read(file, offset, host_err, *count);
                 *count = host_err;
                 fsnotify_access(file);
@@@ -1040,7 -1041,7 +1041,7 @@@ nfsd_vfs_write(struct svc_rqst *rqstp, 
                 goto out_nfserr;
         }
         *cnt = host_err;
- -      nfsdstats.io_write += *cnt;
+ +      nfsd_stats_io_write_add(exp, *cnt);
         fsnotify_modify(file);
   
         if (stable && use_wgather) {
@@@ -1254,12 -1255,12 +1255,12 @@@ nfsd_create_locked(struct svc_rqst *rqs
         host_err = 0;
         switch (type) {
         case S_IFREG:
-               host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+               host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
                 if (!host_err)
                         nfsd_check_ignore_resizing(iap);
                 break;
         case S_IFDIR:
-               host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+               host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode);
                 if (!host_err && unlikely(d_unhashed(dchild))) {
                         struct dentry *d;
                         d = lookup_one_len(dchild->d_name.name,
@@@ -1287,7 -1288,8 +1288,8 @@@
         case S_IFBLK:
         case S_IFIFO:
         case S_IFSOCK:
-               host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+               host_err = vfs_mknod(&init_user_ns, dirp, dchild,
+                                    iap->ia_mode, rdev);
                 break;
         default:
                 printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
@@@ -1485,7 -1487,7 +1487,7 @@@ do_nfsd_create(struct svc_rqst *rqstp, 
         if (!IS_POSIXACL(dirp))
                 iap->ia_mode &= ~current_umask();
   
-       host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+       host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
         if (host_err < 0) {
                 fh_drop_write(fhp);
                 goto out_nfserr;
@@@ -1609,7 -1611,7 +1611,7 @@@ nfsd_symlink(struct svc_rqst *rqstp, st
         if (IS_ERR(dnew))
                 goto out_nfserr;
   
-       host_err = vfs_symlink(d_inode(dentry), dnew, path);
+       host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path);
         err = nfserrno(host_err);
         if (!err)
                 err = nfserrno(commit_metadata(fhp));
@@@ -1677,7 -1679,7 +1679,7 @@@ nfsd_link(struct svc_rqst *rqstp, struc
         err = nfserr_noent;
         if (d_really_is_negative(dold))
                 goto out_dput;
-       host_err = vfs_link(dold, dirp, dnew, NULL);
+       host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL);
         if (!host_err) {
                 err = nfserrno(commit_metadata(ffhp));
                 if (!err)
@@@ -1797,7 -1799,15 +1799,15 @@@ retry
                 close_cached = true;
                 goto out_dput_old;
         } else {
-               host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
+               struct renamedata rd = {
+                       .old_mnt_userns = &init_user_ns,
+                       .old_dir        = fdir,
+                       .old_dentry     = odentry,
+                       .new_mnt_userns = &init_user_ns,
+                       .new_dir        = tdir,
+                       .new_dentry     = ndentry,
+               };
+               host_err = vfs_rename(&rd);
                 if (!host_err) {
                         host_err = commit_metadata(tfhp);
                         if (!host_err)
@@@ -1884,9 -1894,9 +1894,9 @@@ nfsd_unlink(struct svc_rqst *rqstp, str
         if (type != S_IFDIR) {
                 if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
                         nfsd_close_cached_files(rdentry);
-               host_err = vfs_unlink(dirp, rdentry, NULL);
+               host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
         } else {
-               host_err = vfs_rmdir(dirp, rdentry);
+               host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
         }
   
         if (!host_err)
@@@ -2149,7 -2159,7 +2159,7 @@@ nfsd_getxattr(struct svc_rqst *rqstp, s
   
         inode_lock_shared(inode);
   
-       len = vfs_getxattr(dentry, name, NULL, 0);
+       len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
   
         /*
          * Zero-length attribute, just return.
@@@ -2176,7 -2186,7 +2186,7 @@@
                 goto out;
         }
   
-       len = vfs_getxattr(dentry, name, buf, len);
+       len = vfs_getxattr(&init_user_ns, dentry, name, buf, len);
         if (len <= 0) {
                 kvfree(buf);
                 buf = NULL;
@@@ -2283,7 -2293,8 +2293,8 @@@ nfsd_removexattr(struct svc_rqst *rqstp
   
         fh_lock(fhp);
   
-       ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL);
+       ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry,
+                                      name, NULL);
   
         fh_unlock(fhp);
         fh_drop_write(fhp);
@@@ -2307,8 -2318,8 +2318,8 @@@ nfsd_setxattr(struct svc_rqst *rqstp, s
                 return nfserrno(ret);
         fh_lock(fhp);
   
-       ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags,
-                                   NULL);
+       ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf,
+                                   len, flags, NULL);
   
         fh_unlock(fhp);
         fh_drop_write(fhp);
@@@ -2391,13 -2402,14 +2402,14 @@@ nfsd_permission(struct svc_rqst *rqstp
                 return 0;
   
         /* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
-       err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
+       err = inode_permission(&init_user_ns, inode,
+                              acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
   
         /* Allow read access to binaries even when mode 111 */
         if (err == -EACCES && S_ISREG(inode->i_mode) &&
              (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) ||
               acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
-               err = inode_permission(inode, MAY_EXEC);
+               err = inode_permission(&init_user_ns, inode, MAY_EXEC);
   
         return err? nfserrno(err) : 0;
   }
diff --combined fs/notify/fanotify/fanotify_user.c

index b78dd1f,64cfc1a..9e0c1af
--- 1/fs/notify/fanotify/fanotify_user.c
--- 2/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@@ -702,7 -702,7 +702,7 @@@ static int fanotify_find_path(int dfd, 
         }
   
         /* you can only watch an inode if you have read permissions on it */
-       ret = inode_permission(path->dentry->d_inode, MAY_READ);
+       ret = path_permission(path, MAY_READ);
         if (ret) {
                 path_put(path);
                 goto out;
@@@ -976,7 -976,7 +976,7 @@@ SYSCALL_DEFINE2(fanotify_init, unsigne
                 f_flags |= O_NONBLOCK;
   
         /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
- -      group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
+ +      group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
         if (IS_ERR(group)) {
                 free_uid(user);
                 return PTR_ERR(group);
diff --combined fs/notify/inotify/inotify_user.c

index 266d17e,e1155d3..c71be4f
--- 1/fs/notify/inotify/inotify_user.c
--- 2/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@@ -352,7 -352,7 +352,7 @@@ static int inotify_find_inode(const cha
         if (error)
                 return error;
         /* you can only watch an inode if you have read permissions on it */
-       error = inode_permission(path->dentry->d_inode, MAY_READ);
+       error = path_permission(path, MAY_READ);
         if (error) {
                 path_put(path);
                 return error;
@@@ -632,11 -632,11 +632,11 @@@ static struct fsnotify_group *inotify_n
         struct fsnotify_group *group;
         struct inotify_event_info *oevent;
   
- -      group = fsnotify_alloc_group(&inotify_fsnotify_ops);
+ +      group = fsnotify_alloc_user_group(&inotify_fsnotify_ops);
         if (IS_ERR(group))
                 return group;
   
- -      oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+ +      oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
         if (unlikely(!oevent)) {
                 fsnotify_destroy_group(group);
                 return ERR_PTR(-ENOMEM);
diff --combined fs/ocfs2/file.c

index df6d709,e3039d9..6611c64
--- 1/fs/ocfs2/file.c
--- 2/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@@ -194,7 -194,7 +194,7 @@@ static int ocfs2_sync_file(struct file 
                 needs_barrier = true;
         err = jbd2_complete_transaction(journal, commit_tid);
         if (needs_barrier) {
- -              ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ +              ret = blkdev_issue_flush(inode->i_sb->s_bdev);
                 if (!err)
                         err = ret;
         }
@@@ -1112,7 -1112,8 +1112,8 @@@ out
         return ret;
   }
   
- int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
+ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+                 struct iattr *attr)
   {
         int status = 0, size_change;
         int inode_locked = 0;
@@@ -1142,7 -1143,7 +1143,7 @@@
         if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
                 return 0;
   
-       status = setattr_prepare(dentry, attr);
+       status = setattr_prepare(&init_user_ns, dentry, attr);
         if (status)
                 return status;
   
@@@ -1263,7 -1264,7 +1264,7 @@@
                 }
         }
   
-       setattr_copy(inode, attr);
+       setattr_copy(&init_user_ns, inode, attr);
         mark_inode_dirty(inode);
   
         status = ocfs2_mark_inode_dirty(handle, inode, bh);
@@@ -1298,8 -1299,8 +1299,8 @@@ bail
         return status;
   }
   
- int ocfs2_getattr(const struct path *path, struct kstat *stat,
-                 u32 request_mask, unsigned int flags)
+ int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
+                 struct kstat *stat, u32 request_mask, unsigned int flags)
   {
         struct inode *inode = d_inode(path->dentry);
         struct super_block *sb = path->dentry->d_sb;
@@@ -1313,7 -1314,7 +1314,7 @@@
                 goto bail;
         }
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         /*
          * If there is inline data in the inode, the inode will normally not
          * have data blocks allocated (it may have an external xattr block).
@@@ -1330,7 -1331,8 +1331,8 @@@ bail
         return err;
   }
   
- int ocfs2_permission(struct inode *inode, int mask)
+ int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
+                    int mask)
   {
         int ret, had_lock;
         struct ocfs2_lock_holder oh;
@@@ -1355,7 -1357,7 +1357,7 @@@
                 dump_stack();
         }
   
-       ret = generic_permission(inode, mask);
+       ret = generic_permission(&init_user_ns, inode, mask);
   
         ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
   out:
diff --combined fs/open.c

index ca54447,4ec3979..e53af13
--- 1/fs/open.c
--- 2/fs/open.c
+++ b/fs/open.c
@@@ -35,8 -35,8 +35,8 @@@
   
   #include "internal.h"
   
- int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
-       struct file *filp)
+ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
+               loff_t length, unsigned int time_attrs, struct file *filp)
   {
         int ret;
         struct iattr newattrs;
@@@ -61,13 -61,14 +61,14 @@@
   
         inode_lock(dentry->d_inode);
         /* Note any delegations or leases have already been broken: */
-       ret = notify_change(dentry, &newattrs, NULL);
+       ret = notify_change(mnt_userns, dentry, &newattrs, NULL);
         inode_unlock(dentry->d_inode);
         return ret;
   }
   
   long vfs_truncate(const struct path *path, loff_t length)
   {
+       struct user_namespace *mnt_userns;
         struct inode *inode;
         long error;
   
@@@ -83,7 -84,8 +84,8 @@@
         if (error)
                 goto out;
   
-       error = inode_permission(inode, MAY_WRITE);
+       mnt_userns = mnt_user_ns(path->mnt);
+       error = inode_permission(mnt_userns, inode, MAY_WRITE);
         if (error)
                 goto mnt_drop_write_and_out;
   
@@@ -107,7 -109,7 +109,7 @@@
         if (!error)
                 error = security_path_truncate(path);
         if (!error)
-               error = do_truncate(path->dentry, length, 0, NULL);
+               error = do_truncate(mnt_userns, path->dentry, length, 0, NULL);
   
   put_write_and_out:
         put_write_access(inode);
@@@ -186,13 -188,13 +188,13 @@@ long do_sys_ftruncate(unsigned int fd, 
         /* Check IS_APPEND on real upper inode */
         if (IS_APPEND(file_inode(f.file)))
                 goto out_putf;
- 
         sb_start_write(inode->i_sb);
         error = locks_verify_truncate(inode, f.file, length);
         if (!error)
                 error = security_path_truncate(&f.file->f_path);
         if (!error)
-               error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+               error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
+                                   ATTR_MTIME | ATTR_CTIME, f.file);
         sb_end_write(inode->i_sb);
   out_putf:
         fdput(f);
@@@ -436,7 -438,7 +438,7 @@@ retry
                         goto out_path_release;
         }
   
-       res = inode_permission(inode, mode | MAY_ACCESS);
+       res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS);
         /* SuS v2 requires we report a read only fs too */
         if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
                 goto out_path_release;
@@@ -492,7 -494,7 +494,7 @@@ retry
         if (error)
                 goto out;
   
-       error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+       error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
         if (error)
                 goto dput_and_out;
   
@@@ -521,7 -523,7 +523,7 @@@ SYSCALL_DEFINE1(fchdir, unsigned int, f
         if (!d_can_lookup(f.file->f_path.dentry))
                 goto out_putf;
   
-       error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
+       error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
         if (!error)
                 set_fs_pwd(current->fs, &f.file->f_path);
   out_putf:
@@@ -540,7 -542,7 +542,7 @@@ retry
         if (error)
                 goto out;
   
-       error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+       error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
         if (error)
                 goto dput_and_out;
   
@@@ -580,7 -582,8 +582,8 @@@ retry_deleg
                 goto out_unlock;
         newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
         newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-       error = notify_change(path->dentry, &newattrs, &delegated_inode);
+       error = notify_change(mnt_user_ns(path->mnt), path->dentry,
+                             &newattrs, &delegated_inode);
   out_unlock:
         inode_unlock(inode);
         if (delegated_inode) {
@@@ -641,6 -644,7 +644,7 @@@ SYSCALL_DEFINE2(chmod, const char __use
   
   int chown_common(const struct path *path, uid_t user, gid_t group)
   {
+       struct user_namespace *mnt_userns;
         struct inode *inode = path->dentry->d_inode;
         struct inode *delegated_inode = NULL;
         int error;
@@@ -651,6 -655,10 +655,10 @@@
         uid = make_kuid(current_user_ns(), user);
         gid = make_kgid(current_user_ns(), group);
   
+       mnt_userns = mnt_user_ns(path->mnt);
+       uid = kuid_from_mnt(mnt_userns, uid);
+       gid = kgid_from_mnt(mnt_userns, gid);
+ 
   retry_deleg:
         newattrs.ia_valid =  ATTR_CTIME;
         if (user != (uid_t) -1) {
@@@ -671,7 -679,8 +679,8 @@@
         inode_lock(inode);
         error = security_path_chown(path, uid, gid);
         if (!error)
-               error = notify_change(path->dentry, &newattrs, &delegated_inode);
+               error = notify_change(mnt_userns, path->dentry, &newattrs,
+                                     &delegated_inode);
         inode_unlock(inode);
         if (delegated_inode) {
                 error = break_deleg_wait(&delegated_inode);
@@@ -1091,12 -1100,6 +1100,12 @@@ inline int build_open_flags(const struc
                 lookup_flags |= LOOKUP_BENEATH;
         if (how->resolve & RESOLVE_IN_ROOT)
                 lookup_flags |= LOOKUP_IN_ROOT;
+ +      if (how->resolve & RESOLVE_CACHED) {
+ +              /* Don't bother even trying for create/truncate/tmpfile open */
+ +              if (flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+ +                      return -EAGAIN;
+ +              lookup_flags |= LOOKUP_CACHED;
+ +      }
   
         op->lookup_flags = lookup_flags;
         return 0;
diff --combined fs/overlayfs/copy_up.c

index 0fed532,f81b836..0b2891c
--- 1/fs/overlayfs/copy_up.c
--- 2/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@@ -84,18 -84,10 +84,18 @@@ int ovl_copy_xattr(struct super_block *
   
                 if (ovl_is_private_xattr(sb, name))
                         continue;
+ +
+ +              error = security_inode_copy_up_xattr(name);
+ +              if (error < 0 && error != -EOPNOTSUPP)
+ +                      break;
+ +              if (error == 1) {
+ +                      error = 0;
+ +                      continue; /* Discard */
+ +              }
   retry:
-               size = vfs_getxattr(old, name, value, value_size);
+               size = vfs_getxattr(&init_user_ns, old, name, value, value_size);
                 if (size == -ERANGE)
-                       size = vfs_getxattr(old, name, NULL, 0);
+                       size = vfs_getxattr(&init_user_ns, old, name, NULL, 0);
   
                 if (size < 0) {
                         error = size;
@@@ -115,7 -107,14 +115,7 @@@
                         goto retry;
                 }
   
-               error = vfs_setxattr(new, name, value, size, 0);
- -              error = security_inode_copy_up_xattr(name);
- -              if (error < 0 && error != -EOPNOTSUPP)
- -                      break;
- -              if (error == 1) {
- -                      error = 0;
- -                      continue; /* Discard */
- -              }
+               error = vfs_setxattr(&init_user_ns, new, name, value, size, 0);
                 if (error) {
                         if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name))
                                 break;
@@@ -236,7 -235,7 +236,7 @@@ static int ovl_set_size(struct dentry *
                 .ia_size = stat->size,
         };
   
-       return notify_change(upperdentry, &attr, NULL);
+       return notify_change(&init_user_ns, upperdentry, &attr, NULL);
   }
   
   static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
@@@ -248,7 -247,7 +248,7 @@@
                 .ia_mtime = stat->mtime,
         };
   
-       return notify_change(upperdentry, &attr, NULL);
+       return notify_change(&init_user_ns, upperdentry, &attr, NULL);
   }
   
   int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
@@@ -260,7 -259,7 +260,7 @@@
                         .ia_valid = ATTR_MODE,
                         .ia_mode = stat->mode,
                 };
-               err = notify_change(upperdentry, &attr, NULL);
+               err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
         }
         if (!err) {
                 struct iattr attr = {
@@@ -268,7 -267,7 +268,7 @@@
                         .ia_uid = stat->uid,
                         .ia_gid = stat->gid,
                 };
-               err = notify_change(upperdentry, &attr, NULL);
+               err = notify_change(&init_user_ns, upperdentry, &attr, NULL);
         }
         if (!err)
                 ovl_set_timestamps(upperdentry, stat);
@@@ -796,7 -795,7 +796,7 @@@ static ssize_t ovl_getxattr(struct dent
         ssize_t res;
         char *buf;
   
-       res = vfs_getxattr(dentry, name, NULL, 0);
+       res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0);
         if (res == -ENODATA || res == -EOPNOTSUPP)
                 res = 0;
   
@@@ -805,7 -804,7 +805,7 @@@
                 if (!buf)
                         return -ENOMEM;
   
-               res = vfs_getxattr(dentry, name, buf, res);
+               res = vfs_getxattr(&init_user_ns, dentry, name, buf, res);
                 if (res < 0)
                         kfree(buf);
                 else
@@@ -847,8 -846,8 +847,8 @@@ static int ovl_copy_up_meta_inode_data(
          * don't want that to happen for normal copy-up operation.
          */
         if (capability) {
-               err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
-                                  capability, cap_size, 0);
+               err = vfs_setxattr(&init_user_ns, upperpath.dentry,
+                                  XATTR_NAME_CAPS, capability, cap_size, 0);
                 if (err)
                         goto out_free;
         }
diff --combined fs/overlayfs/dir.c

index d1efa3a,8b3be73..836f14b
--- 1/fs/overlayfs/dir.c
--- 2/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@@ -449,7 -449,7 +449,7 @@@ static int ovl_set_upper_acl(struct den
         if (err < 0)
                 goto out_free;
   
-       err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE);
+       err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE);
   out_free:
         kfree(buffer);
         return err;
@@@ -508,7 -508,7 +508,7 @@@ static int ovl_create_over_whiteout(str
                         .ia_mode = cattr->mode,
                 };
                 inode_lock(newdentry->d_inode);
-               err = notify_change(newdentry, &attr, NULL);
+               err = notify_change(&init_user_ns, newdentry, &attr, NULL);
                 inode_unlock(newdentry->d_inode);
                 if (err)
                         goto out_cleanup;
@@@ -636,7 -636,7 +636,7 @@@ static int ovl_create_object(struct den
         inode->i_state |= I_CREATING;
         spin_unlock(&inode->i_lock);
   
-       inode_init_owner(inode, dentry->d_parent->d_inode, mode);
+       inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode);
         attr.mode = inode->i_mode;
   
         err = ovl_create_or_link(dentry, inode, &attr, false);
@@@ -650,19 -650,20 +650,20 @@@ out
         return err;
   }
   
- static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-                     bool excl)
+ static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir,
+                     struct dentry *dentry, umode_t mode, bool excl)
   {
         return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
   }
   
- static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+                    struct dentry *dentry, umode_t mode)
   {
         return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
   }
   
- static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-                    dev_t rdev)
+ static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+                    struct dentry *dentry, umode_t mode, dev_t rdev)
   {
         /* Don't allow creation of "whiteout" on overlay */
         if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
@@@ -671,8 -672,8 +672,8 @@@
         return ovl_create_object(dentry, mode, rdev, NULL);
   }
   
- static int ovl_symlink(struct inode *dir, struct dentry *dentry,
-                      const char *link)
+ static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+                      struct dentry *dentry, const char *link)
   {
         return ovl_create_object(dentry, S_IFLNK, 0, link);
   }
@@@ -821,9 -822,9 +822,9 @@@ static int ovl_remove_upper(struct dent
                 goto out_dput_upper;
   
         if (is_dir)
-               err = vfs_rmdir(dir, upper);
+               err = vfs_rmdir(&init_user_ns, dir, upper);
         else
-               err = vfs_unlink(dir, upper, NULL);
+               err = vfs_unlink(&init_user_ns, dir, upper, NULL);
         ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry));
   
         /*
@@@ -992,8 -993,8 +993,8 @@@ static char *ovl_get_redirect(struct de
   
                 buflen -= thislen;
                 memcpy(&buf[buflen], name, thislen);
- -              tmp = dget_dlock(d->d_parent);
                 spin_unlock(&d->d_lock);
+ +              tmp = dget_parent(d);
   
                 dput(d);
                 d = tmp;
@@@ -1069,9 -1070,9 +1070,9 @@@ static int ovl_set_redirect(struct dent
         return err;
   }
   
- static int ovl_rename(struct inode *olddir, struct dentry *old,
-                     struct inode *newdir, struct dentry *new,
-                     unsigned int flags)
+ static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir,
+                     struct dentry *old, struct inode *newdir,
+                     struct dentry *new, unsigned int flags)
   {
         int err;
         struct dentry *old_upperdir;
diff --combined fs/overlayfs/file.c

index 077d3ad,7d8b84c..dbfb35f
--- 1/fs/overlayfs/file.c
--- 2/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@@ -50,11 -50,11 +50,11 @@@ static struct file *ovl_open_realfile(c
                 acc_mode |= MAY_APPEND;
   
         old_cred = ovl_override_creds(inode->i_sb);
-       err = inode_permission(realinode, MAY_OPEN | acc_mode);
+       err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode);
         if (err) {
                 realfile = ERR_PTR(err);
         } else {
-               if (!inode_owner_or_capable(realinode))
+               if (!inode_owner_or_capable(&init_user_ns, realinode))
                         flags &= ~O_NOATIME;
   
                 realfile = open_with_fake_path(&file->f_path, flags, realinode,
@@@ -398,9 -398,8 +398,9 @@@ static int ovl_fsync(struct file *file
         const struct cred *old_cred;
         int ret;
   
- -      if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb)))
- -              return 0;
+ +      ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
+ +      if (ret <= 0)
+ +              return ret;
   
         ret = ovl_real_fdget_meta(file, &real, !datasync);
         if (ret)
@@@ -521,7 -520,7 +521,7 @@@ static long ovl_ioctl_set_flags(struct 
         long ret;
         struct inode *inode = file_inode(file);
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 return -EACCES;
   
         ret = mnt_want_write_file(file);
diff --combined fs/overlayfs/inode.c

index cf41bcb,e78d45d..003cf83
--- 1/fs/overlayfs/inode.c
--- 2/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@@ -14,14 -14,15 +14,15 @@@
   #include "overlayfs.h"
   
   
- int ovl_setattr(struct dentry *dentry, struct iattr *attr)
+ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+               struct iattr *attr)
   {
         int err;
         bool full_copy_up = false;
         struct dentry *upperdentry;
         const struct cred *old_cred;
   
-       err = setattr_prepare(dentry, attr);
+       err = setattr_prepare(&init_user_ns, dentry, attr);
         if (err)
                 return err;
   
@@@ -79,7 -80,7 +80,7 @@@
   
                 inode_lock(upperdentry->d_inode);
                 old_cred = ovl_override_creds(dentry->d_sb);
-               err = notify_change(upperdentry, attr, NULL);
+               err = notify_change(&init_user_ns, upperdentry, attr, NULL);
                 revert_creds(old_cred);
                 if (!err)
                         ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
@@@ -154,8 -155,8 +155,8 @@@ static int ovl_map_dev_ino(struct dentr
         return 0;
   }
   
- int ovl_getattr(const struct path *path, struct kstat *stat,
-               u32 request_mask, unsigned int flags)
+ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+               struct kstat *stat, u32 request_mask, unsigned int flags)
   {
         struct dentry *dentry = path->dentry;
         enum ovl_path_type type;
@@@ -277,7 -278,8 +278,8 @@@ out
         return err;
   }
   
- int ovl_permission(struct inode *inode, int mask)
+ int ovl_permission(struct user_namespace *mnt_userns,
+                  struct inode *inode, int mask)
   {
         struct inode *upperinode = ovl_inode_upper(inode);
         struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
@@@ -294,7 -296,7 +296,7 @@@
          * Check overlay inode with the creds of task and underlying inode
          * with creds of mounter
          */
-       err = generic_permission(inode, mask);
+       err = generic_permission(&init_user_ns, inode, mask);
         if (err)
                 return err;
   
@@@ -305,7 -307,7 +307,7 @@@
                 /* Make sure mounter can read file for copy up later */
                 mask |= MAY_READ;
         }
-       err = inode_permission(realinode, mask);
+       err = inode_permission(&init_user_ns, realinode, mask);
         revert_creds(old_cred);
   
         return err;
@@@ -352,9 -354,7 +354,9 @@@ int ovl_xattr_set(struct dentry *dentry
                 goto out;
   
         if (!value && !upperdentry) {
-               err = vfs_getxattr(realdentry, name, NULL, 0);
+ +              old_cred = ovl_override_creds(dentry->d_sb);
+               err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0);
+ +              revert_creds(old_cred);
                 if (err < 0)
                         goto out_drop_write;
         }
@@@ -369,10 -369,11 +371,11 @@@
   
         old_cred = ovl_override_creds(dentry->d_sb);
         if (value)
-               err = vfs_setxattr(realdentry, name, value, size, flags);
+               err = vfs_setxattr(&init_user_ns, realdentry, name, value, size,
+                                  flags);
         else {
                 WARN_ON(flags != XATTR_REPLACE);
-               err = vfs_removexattr(realdentry, name);
+               err = vfs_removexattr(&init_user_ns, realdentry, name);
         }
         revert_creds(old_cred);
   
@@@ -394,7 -395,7 +397,7 @@@ int ovl_xattr_get(struct dentry *dentry
                 ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
   
         old_cred = ovl_override_creds(dentry->d_sb);
-       res = vfs_getxattr(realdentry, name, value, size);
+       res = vfs_getxattr(&init_user_ns, realdentry, name, value, size);
         revert_creds(old_cred);
         return res;
   }
diff --combined fs/overlayfs/overlayfs.h

index cb4e2d6,78b9d93..95cff83
--- 1/fs/overlayfs/overlayfs.h
--- 2/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@@ -123,7 -123,7 +123,7 @@@ static inline const char *ovl_xattr(str
   
   static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
   {
-       int err = vfs_rmdir(dir, dentry);
+       int err = vfs_rmdir(&init_user_ns, dir, dentry);
   
         pr_debug("rmdir(%pd2) = %i\n", dentry, err);
         return err;
@@@ -131,7 -131,7 +131,7 @@@
   
   static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
   {
-       int err = vfs_unlink(dir, dentry, NULL);
+       int err = vfs_unlink(&init_user_ns, dir, dentry, NULL);
   
         pr_debug("unlink(%pd2) = %i\n", dentry, err);
         return err;
@@@ -140,7 -140,7 +140,7 @@@
   static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
                               struct dentry *new_dentry)
   {
-       int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+       int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL);
   
         pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
         return err;
@@@ -149,7 -149,7 +149,7 @@@
   static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
                                 umode_t mode)
   {
-       int err = vfs_create(dir, dentry, mode, true);
+       int err = vfs_create(&init_user_ns, dir, dentry, mode, true);
   
         pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
         return err;
@@@ -158,7 -158,7 +158,7 @@@
   static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
                                umode_t mode)
   {
-       int err = vfs_mkdir(dir, dentry, mode);
+       int err = vfs_mkdir(&init_user_ns, dir, dentry, mode);
         pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
         return err;
   }
@@@ -166,7 -166,7 +166,7 @@@
   static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
                                umode_t mode, dev_t dev)
   {
-       int err = vfs_mknod(dir, dentry, mode, dev);
+       int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev);
   
         pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
         return err;
@@@ -175,7 -175,7 +175,7 @@@
   static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
                                  const char *oldname)
   {
-       int err = vfs_symlink(dir, dentry, oldname);
+       int err = vfs_symlink(&init_user_ns, dir, dentry, oldname);
   
         pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
         return err;
@@@ -186,7 -186,7 +186,7 @@@ static inline ssize_t ovl_do_getxattr(s
                                       size_t size)
   {
         const char *name = ovl_xattr(ofs, ox);
-       return vfs_getxattr(dentry, name, value, size);
+       return vfs_getxattr(&init_user_ns, dentry, name, value, size);
   }
   
   static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
@@@ -194,7 -194,7 +194,7 @@@
                                   size_t size)
   {
         const char *name = ovl_xattr(ofs, ox);
-       int err = vfs_setxattr(dentry, name, value, size, 0);
+       int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0);
         pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
                  dentry, name, min((int)size, 48), value, size, err);
         return err;
@@@ -204,7 -204,7 +204,7 @@@ static inline int ovl_do_removexattr(st
                                      enum ovl_xattr ox)
   {
         const char *name = ovl_xattr(ofs, ox);
-       int err = vfs_removexattr(dentry, name);
+       int err = vfs_removexattr(&init_user_ns, dentry, name);
         pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
         return err;
   }
@@@ -214,9 -214,18 +214,18 @@@ static inline int ovl_do_rename(struct 
                                 unsigned int flags)
   {
         int err;
+       struct renamedata rd = {
+               .old_mnt_userns = &init_user_ns,
+               .old_dir        = olddir,
+               .old_dentry     = olddentry,
+               .new_mnt_userns = &init_user_ns,
+               .new_dir        = newdir,
+               .new_dentry     = newdentry,
+               .flags          = flags,
+       };
   
         pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
-       err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
+       err = vfs_rename(&rd);
         if (err) {
                 pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
                          olddentry, newdentry, err);
@@@ -226,14 -235,14 +235,14 @@@
   
   static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
   {
-       int err = vfs_whiteout(dir, dentry);
+       int err = vfs_whiteout(&init_user_ns, dir, dentry);
         pr_debug("whiteout(%pd2) = %i\n", dentry, err);
         return err;
   }
   
   static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
   {
-       struct dentry *ret = vfs_tmpfile(dentry, mode, 0);
+       struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0);
         int err = PTR_ERR_OR_ZERO(ret);
   
         pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
@@@ -324,7 -333,6 +333,7 @@@ int ovl_check_metacopy_xattr(struct ovl
   bool ovl_is_metacopy_dentry(struct dentry *dentry);
   char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
                              int padding);
+ +int ovl_sync_status(struct ovl_fs *ofs);
   
   static inline bool ovl_is_impuredir(struct super_block *sb,
                                     struct dentry *dentry)
@@@ -436,10 -444,12 +445,12 @@@ int ovl_set_nlink_lower(struct dentry *
   unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
                            struct dentry *upperdentry,
                            unsigned int fallback);
- int ovl_setattr(struct dentry *dentry, struct iattr *attr);
- int ovl_getattr(const struct path *path, struct kstat *stat,
-               u32 request_mask, unsigned int flags);
- int ovl_permission(struct inode *inode, int mask);
+ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+               struct iattr *attr);
+ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
+               struct kstat *stat, u32 request_mask, unsigned int flags);
+ int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode,
+                  int mask);
   int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
                   const void *value, size_t size, int flags);
   int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
diff --combined fs/overlayfs/super.c

index d58b8f2,b702c57..fdd72f1
--- 1/fs/overlayfs/super.c
--- 2/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@@ -264,20 -264,11 +264,20 @@@ static int ovl_sync_fs(struct super_blo
         struct super_block *upper_sb;
         int ret;
   
- -      if (!ovl_upper_mnt(ofs))
- -              return 0;
+ +      ret = ovl_sync_status(ofs);
+ +      /*
+ +       * We have to always set the err, because the return value isn't
+ +       * checked in syncfs, and instead indirectly return an error via
+ +       * the sb's writeback errseq, which VFS inspects after this call.
+ +       */
+ +      if (ret < 0) {
+ +              errseq_set(&sb->s_wb_err, -EIO);
+ +              return -EIO;
+ +      }
+ +
+ +      if (!ret)
+ +              return ret;
   
- -      if (!ovl_should_sync(ofs))
- -              return 0;
         /*
          * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
          * All the super blocks will be iterated, including upper_sb.
@@@ -803,17 -794,19 +803,19 @@@ retry
                  * allowed as upper are limited to "normal" ones, where checking
                  * for the above two errors is sufficient.
                  */
-               err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
+               err = vfs_removexattr(&init_user_ns, work,
+                                     XATTR_NAME_POSIX_ACL_DEFAULT);
                 if (err && err != -ENODATA && err != -EOPNOTSUPP)
                         goto out_dput;
   
-               err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
+               err = vfs_removexattr(&init_user_ns, work,
+                                     XATTR_NAME_POSIX_ACL_ACCESS);
                 if (err && err != -ENODATA && err != -EOPNOTSUPP)
                         goto out_dput;
   
                 /* Clear any inherited mode bits */
                 inode_lock(work->d_inode);
-               err = notify_change(work, &attr, NULL);
+               err = notify_change(&init_user_ns, work, &attr, NULL);
                 inode_unlock(work->d_inode);
                 if (err)
                         goto out_dput;
@@@ -865,6 -858,10 +867,10 @@@ static int ovl_mount_dir_noesc(const ch
                 pr_err("filesystem on '%s' not supported\n", name);
                 goto out_put;
         }
+       if (mnt_user_ns(path->mnt) != &init_user_ns) {
+               pr_err("idmapped layers are currently not supported\n");
+               goto out_put;
+       }
         if (!d_is_dir(path->dentry)) {
                 pr_err("'%s' not a directory\n", name);
                 goto out_put;
@@@ -989,6 -986,7 +995,7 @@@ ovl_posix_acl_xattr_get(const struct xa
   
   static int __maybe_unused
   ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+                       struct user_namespace *mnt_userns,
                         struct dentry *dentry, struct inode *inode,
                         const char *name, const void *value,
                         size_t size, int flags)
@@@ -1014,7 -1012,7 +1021,7 @@@
                 goto out_acl_release;
         }
         err = -EPERM;
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(&init_user_ns, inode))
                 goto out_acl_release;
   
         posix_acl_release(acl);
@@@ -1026,10 -1024,10 +1033,10 @@@
         if (unlikely(inode->i_mode & S_ISGID) &&
             handler->flags == ACL_TYPE_ACCESS &&
             !in_group_p(inode->i_gid) &&
-           !capable_wrt_inode_uidgid(inode, CAP_FSETID)) {
+           !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
                 struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
   
-               err = ovl_setattr(dentry, &iattr);
+               err = ovl_setattr(&init_user_ns, dentry, &iattr);
                 if (err)
                         return err;
         }
@@@ -1053,6 -1051,7 +1060,7 @@@ static int ovl_own_xattr_get(const stru
   }
   
   static int ovl_own_xattr_set(const struct xattr_handler *handler,
+                            struct user_namespace *mnt_userns,
                              struct dentry *dentry, struct inode *inode,
                              const char *name, const void *value,
                              size_t size, int flags)
@@@ -1068,6 -1067,7 +1076,7 @@@ static int ovl_other_xattr_get(const st
   }
   
   static int ovl_other_xattr_set(const struct xattr_handler *handler,
+                              struct user_namespace *mnt_userns,
                                struct dentry *dentry, struct inode *inode,
                                const char *name, const void *value,
                                size_t size, int flags)
@@@ -1932,10 -1932,6 +1941,10 @@@ static int ovl_fill_super(struct super_
         unsigned int numlower;
         int err;
   
+ +      err = -EIO;
+ +      if (WARN_ON(sb->s_user_ns != current_user_ns()))
+ +              goto out;
+ +
         sb->s_d_op = &ovl_dentry_operations;
   
         err = -ENOMEM;
@@@ -2002,8 -1998,6 +2011,8 @@@
         sb->s_op = &ovl_super_operations;
   
         if (ofs->config.upperdir) {
+ +              struct super_block *upper_sb;
+ +
                 if (!ofs->config.workdir) {
                         pr_err("missing 'workdir'\n");
                         goto out_err;
@@@ -2013,16 -2007,6 +2022,16 @@@
                 if (err)
                         goto out_err;
   
+ +              upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
+ +              if (!ovl_should_sync(ofs)) {
+ +                      ofs->errseq = errseq_sample(&upper_sb->s_wb_err);
+ +                      if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
+ +                              err = -EIO;
+ +                              pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
+ +                              goto out_err;
+ +                      }
+ +              }
+ +
                 err = ovl_get_workdir(sb, ofs, &upperpath);
                 if (err)
                         goto out_err;
@@@ -2030,8 -2014,9 +2039,8 @@@
                 if (!ofs->workdir)
                         sb->s_flags |= SB_RDONLY;
   
- -              sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth;
- -              sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran;
- -
+ +              sb->s_stack_depth = upper_sb->s_stack_depth;
+ +              sb->s_time_gran = upper_sb->s_time_gran;
         }
         oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers);
         err = PTR_ERR(oe);
diff --combined fs/overlayfs/util.c

index 9826b00,06013b7..7f5a01a
--- 1/fs/overlayfs/util.c
--- 2/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@@ -479,12 -479,12 +479,12 @@@ struct file *ovl_path_open(struct path 
                 BUG();
         }
   
-       err = inode_permission(inode, acc_mode | MAY_OPEN);
+       err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN);
         if (err)
                 return ERR_PTR(err);
   
         /* O_NOATIME is an optimization, don't fail if not permitted */
-       if (inode_owner_or_capable(inode))
+       if (inode_owner_or_capable(&init_user_ns, inode))
                 flags |= O_NOATIME;
   
         return dentry_open(path, flags, current_cred());
@@@ -962,30 -962,3 +962,30 @@@ err_free
         kfree(buf);
         return ERR_PTR(res);
   }
+ +
+ +/*
+ + * ovl_sync_status() - Check fs sync status for volatile mounts
+ + *
+ + * Returns 1 if this is not a volatile mount and a real sync is required.
+ + *
+ + * Returns 0 if syncing can be skipped because mount is volatile, and no errors
+ + * have occurred on the upperdir since the mount.
+ + *
+ + * Returns -errno if it is a volatile mount and an error has occurred on the
+ + * upperdir since the mount. If the error code changes, it'll return the latest
+ + * error code.
+ + */
+ +
+ +int ovl_sync_status(struct ovl_fs *ofs)
+ +{
+ +      struct vfsmount *mnt;
+ +
+ +      if (ovl_should_sync(ofs))
+ +              return 1;
+ +
+ +      mnt = ovl_upper_mnt(ofs);
+ +      if (!mnt)
+ +              return 0;
+ +
+ +      return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq);
+ +}
diff --combined fs/proc/proc_sysctl.c

index d2018f7,2daac06..656ba24
--- 1/fs/proc/proc_sysctl.c
--- 2/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@@ -785,7 -785,8 +785,8 @@@ out
         return 0;
   }
   
- static int proc_sys_permission(struct inode *inode, int mask)
+ static int proc_sys_permission(struct user_namespace *mnt_userns,
+                              struct inode *inode, int mask)
   {
         /*
          * sysctl entries that are not writeable,
@@@ -813,7 -814,8 +814,8 @@@
         return error;
   }
   
- static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
+ static int proc_sys_setattr(struct user_namespace *mnt_userns,
+                           struct dentry *dentry, struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         int error;
@@@ -821,16 -823,17 +823,17 @@@
         if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
                 return -EPERM;
   
-       error = setattr_prepare(dentry, attr);
+       error = setattr_prepare(&init_user_ns, dentry, attr);
         if (error)
                 return error;
   
-       setattr_copy(inode, attr);
+       setattr_copy(&init_user_ns, inode, attr);
         mark_inode_dirty(inode);
         return 0;
   }
   
- static int proc_sys_getattr(const struct path *path, struct kstat *stat,
+ static int proc_sys_getattr(struct user_namespace *mnt_userns,
+                           const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
   {
         struct inode *inode = d_inode(path->dentry);
@@@ -840,7 -843,7 +843,7 @@@
         if (IS_ERR(head))
                 return PTR_ERR(head);
   
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
         if (table)
                 stat->mode = (stat->mode & S_IFMT) | table->mode;
   
@@@ -1770,12 -1773,6 +1773,12 @@@ static int process_sysctl_arg(char *par
                         return 0;
         }
   
+ +      if (!val)
+ +              return -EINVAL;
+ +      len = strlen(val);
+ +      if (len == 0)
+ +              return -EINVAL;
+ +
         /*
          * To set sysctl options, we use a temporary mount of proc, look up the
          * respective sys/ file and write to it. To avoid mounting it when no
@@@ -1817,6 -1814,7 +1820,6 @@@
                                 file, param, val);
                 goto out;
         }
- -      len = strlen(val);
         wret = kernel_write(file, val, len, &pos);
         if (wret < 0) {
                 err = wret;
diff --combined fs/ubifs/xattr.c

index 842d5f1,8f4135c..6b1e983
--- 1/fs/ubifs/xattr.c
--- 2/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@@ -498,7 -498,7 +498,7 @@@ int ubifs_purge_xattrs(struct inode *ho
         struct fscrypt_name nm = {0};
         int err;
   
- -      if (ubifs_inode(host)->xattr_cnt < ubifs_xattr_max_cnt(c))
+ +      if (ubifs_inode(host)->xattr_cnt <= ubifs_xattr_max_cnt(c))
                 return 0;
   
         ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion",
@@@ -681,6 -681,7 +681,7 @@@ static int xattr_get(const struct xattr
   }
   
   static int xattr_set(const struct xattr_handler *handler,
+                          struct user_namespace *mnt_userns,
                            struct dentry *dentry, struct inode *inode,
                            const char *name, const void *value,
                            size_t size, int flags)
diff --combined fs/xfs/xfs_file.c

index 68ca1b4,1bdc356..a007ca0
--- 1/fs/xfs/xfs_file.c
--- 2/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@@ -29,6 -29,7 +29,7 @@@
   #include <linux/backing-dev.h>
   #include <linux/mman.h>
   #include <linux/fadvise.h>
+ #include <linux/mount.h>
   
   static const struct vm_operations_struct xfs_file_vm_ops;
   
@@@ -118,54 -119,6 +119,54 @@@ xfs_dir_fsync
         return xfs_log_force_inode(ip);
   }
   
+ +static xfs_lsn_t
+ +xfs_fsync_lsn(
+ +      struct xfs_inode        *ip,
+ +      bool                    datasync)
+ +{
+ +      if (!xfs_ipincount(ip))
+ +              return 0;
+ +      if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ +              return 0;
+ +      return ip->i_itemp->ili_last_lsn;
+ +}
+ +
+ +/*
+ + * All metadata updates are logged, which means that we just have to flush the
+ + * log up to the latest LSN that touched the inode.
+ + *
+ + * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ + * the log force before we clear the ili_fsync_fields field. This ensures that
+ + * we don't get a racing sync operation that does not wait for the metadata to
+ + * hit the journal before returning.  If we race with clearing ili_fsync_fields,
+ + * then all that will happen is the log force will do nothing as the lsn will
+ + * already be on disk.  We can't race with setting ili_fsync_fields because that
+ + * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ + * shared until after the ili_fsync_fields is cleared.
+ + */
+ +static  int
+ +xfs_fsync_flush_log(
+ +      struct xfs_inode        *ip,
+ +      bool                    datasync,
+ +      int                     *log_flushed)
+ +{
+ +      int                     error = 0;
+ +      xfs_lsn_t               lsn;
+ +
+ +      xfs_ilock(ip, XFS_ILOCK_SHARED);
+ +      lsn = xfs_fsync_lsn(ip, datasync);
+ +      if (lsn) {
+ +              error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC,
+ +                                        log_flushed);
+ +
+ +              spin_lock(&ip->i_itemp->ili_lock);
+ +              ip->i_itemp->ili_fsync_fields = 0;
+ +              spin_unlock(&ip->i_itemp->ili_lock);
+ +      }
+ +      xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ +      return error;
+ +}
+ +
   STATIC int
   xfs_file_fsync(
         struct file             *file,
@@@ -173,10 -126,13 +174,10 @@@
         loff_t                  end,
         int                     datasync)
   {
- -      struct inode            *inode = file->f_mapping->host;
- -      struct xfs_inode        *ip = XFS_I(inode);
- -      struct xfs_inode_log_item *iip = ip->i_itemp;
+ +      struct xfs_inode        *ip = XFS_I(file->f_mapping->host);
         struct xfs_mount        *mp = ip->i_mount;
         int                     error = 0;
         int                     log_flushed = 0;
- -      xfs_lsn_t               lsn = 0;
   
         trace_xfs_file_fsync(ip);
   
@@@ -201,13 -157,32 +202,13 @@@
                 xfs_blkdev_issue_flush(mp->m_ddev_targp);
   
         /*
- -       * All metadata updates are logged, which means that we just have to
- -       * flush the log up to the latest LSN that touched the inode. If we have
- -       * concurrent fsync/fdatasync() calls, we need them to all block on the
- -       * log force before we clear the ili_fsync_fields field. This ensures
- -       * that we don't get a racing sync operation that does not wait for the
- -       * metadata to hit the journal before returning. If we race with
- -       * clearing the ili_fsync_fields, then all that will happen is the log
- -       * force will do nothing as the lsn will already be on disk. We can't
- -       * race with setting ili_fsync_fields because that is done under
- -       * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
- -       * until after the ili_fsync_fields is cleared.
+ +       * Any inode that has dirty modifications in the log is pinned.  The
+ +       * racy check here for a pinned inode will not catch modifications
+ +       * that happen concurrently to the fsync call, but fsync semantics
+ +       * only require to sync previously completed I/O.
          */
- -      xfs_ilock(ip, XFS_ILOCK_SHARED);
- -      if (xfs_ipincount(ip)) {
- -              if (!datasync ||
- -                  (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- -                      lsn = iip->ili_last_lsn;
- -      }
- -
- -      if (lsn) {
- -              error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- -              spin_lock(&iip->ili_lock);
- -              iip->ili_fsync_fields = 0;
- -              spin_unlock(&iip->ili_lock);
- -      }
- -      xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ +      if (xfs_ipincount(ip))
+ +              error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
   
         /*
          * If we only have a single device, and the log force about was
@@@ -223,42 -198,30 +224,42 @@@
         return error;
   }
   
+ +static int
+ +xfs_ilock_iocb(
+ +      struct kiocb            *iocb,
+ +      unsigned int            lock_mode)
+ +{
+ +      struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+ +
+ +      if (iocb->ki_flags & IOCB_NOWAIT) {
+ +              if (!xfs_ilock_nowait(ip, lock_mode))
+ +                      return -EAGAIN;
+ +      } else {
+ +              xfs_ilock(ip, lock_mode);
+ +      }
+ +
+ +      return 0;
+ +}
+ +
   STATIC ssize_t
- -xfs_file_dio_aio_read(
+ +xfs_file_dio_read(
         struct kiocb            *iocb,
         struct iov_iter         *to)
   {
         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
- -      size_t                  count = iov_iter_count(to);
         ssize_t                 ret;
   
- -      trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
+ +      trace_xfs_file_direct_read(iocb, to);
   
- -      if (!count)
+ +      if (!iov_iter_count(to))
                 return 0; /* skip atime */
   
         file_accessed(iocb->ki_filp);
   
- -      if (iocb->ki_flags & IOCB_NOWAIT) {
- -              if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- -                      return -EAGAIN;
- -      } else {
- -              xfs_ilock(ip, XFS_IOLOCK_SHARED);
- -      }
- -      ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
- -                      is_sync_kiocb(iocb));
+ +      ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ +      if (ret)
+ +              return ret;
+ +      ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
   
         return ret;
@@@ -270,16 -233,21 +271,16 @@@ xfs_file_dax_read
         struct iov_iter         *to)
   {
         struct xfs_inode        *ip = XFS_I(iocb->ki_filp->f_mapping->host);
- -      size_t                  count = iov_iter_count(to);
         ssize_t                 ret = 0;
   
- -      trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+ +      trace_xfs_file_dax_read(iocb, to);
   
- -      if (!count)
+ +      if (!iov_iter_count(to))
                 return 0; /* skip atime */
   
- -      if (iocb->ki_flags & IOCB_NOWAIT) {
- -              if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- -                      return -EAGAIN;
- -      } else {
- -              xfs_ilock(ip, XFS_IOLOCK_SHARED);
- -      }
- -
+ +      ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ +      if (ret)
+ +              return ret;
         ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
   
@@@ -288,18 -256,21 +289,18 @@@
   }
   
   STATIC ssize_t
- -xfs_file_buffered_aio_read(
+ +xfs_file_buffered_read(
         struct kiocb            *iocb,
         struct iov_iter         *to)
   {
         struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
         ssize_t                 ret;
   
- -      trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+ +      trace_xfs_file_buffered_read(iocb, to);
   
- -      if (iocb->ki_flags & IOCB_NOWAIT) {
- -              if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
- -                      return -EAGAIN;
- -      } else {
- -              xfs_ilock(ip, XFS_IOLOCK_SHARED);
- -      }
+ +      ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+ +      if (ret)
+ +              return ret;
         ret = generic_file_read_iter(iocb, to);
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
   
@@@ -323,9 -294,9 +324,9 @@@ xfs_file_read_iter
         if (IS_DAX(inode))
                 ret = xfs_file_dax_read(iocb, to);
         else if (iocb->ki_flags & IOCB_DIRECT)
- -              ret = xfs_file_dio_aio_read(iocb, to);
+ +              ret = xfs_file_dio_read(iocb, to);
         else
- -              ret = xfs_file_buffered_aio_read(iocb, to);
+ +              ret = xfs_file_buffered_read(iocb, to);
   
         if (ret > 0)
                 XFS_STATS_ADD(mp, xs_read_bytes, ret);
@@@ -340,7 -311,7 +341,7 @@@
    * if called for a direct write beyond i_size.
    */
   STATIC ssize_t
- -xfs_file_aio_write_checks(
+ +xfs_file_write_checks(
         struct kiocb            *iocb,
         struct iov_iter         *from,
         int                     *iolock)
@@@ -358,14 -329,7 +359,14 @@@ restart
         if (error <= 0)
                 return error;
   
- -      error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ +      if (iocb->ki_flags & IOCB_NOWAIT) {
+ +              error = break_layout(inode, false);
+ +              if (error == -EWOULDBLOCK)
+ +                      error = -EAGAIN;
+ +      } else {
+ +              error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+ +      }
+ +
         if (error)
                 return error;
   
@@@ -376,11 -340,7 +377,11 @@@
         if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
                 xfs_iunlock(ip, *iolock);
                 *iolock = XFS_IOLOCK_EXCL;
- -              xfs_ilock(ip, *iolock);
+ +              error = xfs_ilock_iocb(iocb, *iolock);
+ +              if (error) {
+ +                      *iolock = 0;
+ +                      return error;
+ +              }
                 goto restart;
         }
         /*
@@@ -402,10 -362,6 +403,10 @@@
         isize = i_size_read(inode);
         if (iocb->ki_pos > isize) {
                 spin_unlock(&ip->i_flags_lock);
+ +
+ +              if (iocb->ki_flags & IOCB_NOWAIT)
+ +                      return -EAGAIN;
+ +
                 if (!drained_dio) {
                         if (*iolock == XFS_IOLOCK_SHARED) {
                                 xfs_iunlock(ip, *iolock);
@@@ -434,6 -390,12 +435,6 @@@
         } else
                 spin_unlock(&ip->i_flags_lock);
   
- -      /*
- -       * Updating the timestamps will grab the ilock again from
- -       * xfs_fs_dirty_inode, so we have to call it after dropping the
- -       * lock above.  Eventually we should look into a way to avoid
- -       * the pointless lock roundtrip.
- -       */
         return file_modified(file);
   }
   
@@@ -519,149 -481,122 +520,149 @@@ static const struct iomap_dio_ops xfs_d
   };
   
   /*
- - * xfs_file_dio_aio_write - handle direct IO writes
- - *
- - * Lock the inode appropriately to prepare for and issue a direct IO write.
- - * By separating it from the buffered write path we remove all the tricky to
- - * follow locking changes and looping.
- - *
- - * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- - * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- - * pages are flushed out.
- - *
- - * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- - * allowing them to be done in parallel with reads and other direct IO writes.
- - * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- - * needs to do sub-block zeroing and that requires serialisation against other
- - * direct IOs to the same block. In this case we need to serialise the
- - * submission of the unaligned IOs so that we don't get racing block zeroing in
- - * the dio layer.  To avoid the problem with aio, we also need to wait for
- - * outstanding IOs to complete so that unwritten extent conversion is completed
- - * before we try to map the overlapping block. This is currently implemented by
- - * hitting it with a big hammer (i.e. inode_dio_wait()).
- - *
- - * Returns with locks held indicated by @iolock and errors indicated by
- - * negative return values.
+ + * Handle block aligned direct I/O writes
    */
- -STATIC ssize_t
- -xfs_file_dio_aio_write(
+ +static noinline ssize_t
+ +xfs_file_dio_write_aligned(
+ +      struct xfs_inode        *ip,
         struct kiocb            *iocb,
         struct iov_iter         *from)
   {
- -      struct file             *file = iocb->ki_filp;
- -      struct address_space    *mapping = file->f_mapping;
- -      struct inode            *inode = mapping->host;
- -      struct xfs_inode        *ip = XFS_I(inode);
- -      struct xfs_mount        *mp = ip->i_mount;
- -      ssize_t                 ret = 0;
- -      int                     unaligned_io = 0;
- -      int                     iolock;
- -      size_t                  count = iov_iter_count(from);
- -      struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+ +      int                     iolock = XFS_IOLOCK_SHARED;
+ +      ssize_t                 ret;
   
- -      /* DIO must be aligned to device logical sector size */
- -      if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
- -              return -EINVAL;
+ +      ret = xfs_ilock_iocb(iocb, iolock);
+ +      if (ret)
+ +              return ret;
+ +      ret = xfs_file_write_checks(iocb, from, &iolock);
+ +      if (ret)
+ +              goto out_unlock;
   
         /*
- -       * Don't take the exclusive iolock here unless the I/O is unaligned to
- -       * the file system block size.  We don't need to consider the EOF
- -       * extension case here because xfs_file_aio_write_checks() will relock
- -       * the inode as necessary for EOF zeroing cases and fill out the new
- -       * inode size as appropriate.
+ +       * We don't need to hold the IOLOCK exclusively across the IO, so demote
+ +       * the iolock back to shared if we had to take the exclusive lock in
+ +       * xfs_file_write_checks() for other reasons.
          */
- -      if ((iocb->ki_pos & mp->m_blockmask) ||
- -          ((iocb->ki_pos + count) & mp->m_blockmask)) {
- -              unaligned_io = 1;
- -
- -              /*
- -               * We can't properly handle unaligned direct I/O to reflink
- -               * files yet, as we can't unshare a partial block.
- -               */
- -              if (xfs_is_cow_inode(ip)) {
- -                      trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
- -                      return -ENOTBLK;
- -              }
- -              iolock = XFS_IOLOCK_EXCL;
- -      } else {
+ +      if (iolock == XFS_IOLOCK_EXCL) {
+ +              xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                 iolock = XFS_IOLOCK_SHARED;
         }
+ +      trace_xfs_file_direct_write(iocb, from);
+ +      ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ +                         &xfs_dio_write_ops, 0);
+ +out_unlock:
+ +      if (iolock)
+ +              xfs_iunlock(ip, iolock);
+ +      return ret;
+ +}
   
- -      if (iocb->ki_flags & IOCB_NOWAIT) {
- -              /* unaligned dio always waits, bail */
- -              if (unaligned_io)
- -                      return -EAGAIN;
- -              if (!xfs_ilock_nowait(ip, iolock))
+ +/*
+ + * Handle block unaligned direct I/O writes
+ + *
+ + * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
+ + * them to be done in parallel with reads and other direct I/O writes.  However,
+ + * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
+ + * to do sub-block zeroing and that requires serialisation against other direct
+ + * I/O to the same block.  In this case we need to serialise the submission of
+ + * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
+ + * In the case where sub-block zeroing is not required, we can do concurrent
+ + * sub-block dios to the same block successfully.
+ + *
+ + * Optimistically submit the I/O using the shared lock first, but use the
+ + * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
+ + * if block allocation or partial block zeroing would be required.  In that case
+ + * we try again with the exclusive lock.
+ + */
+ +static noinline ssize_t
+ +xfs_file_dio_write_unaligned(
+ +      struct xfs_inode        *ip,
+ +      struct kiocb            *iocb,
+ +      struct iov_iter         *from)
+ +{
+ +      size_t                  isize = i_size_read(VFS_I(ip));
+ +      size_t                  count = iov_iter_count(from);
+ +      int                     iolock = XFS_IOLOCK_SHARED;
+ +      unsigned int            flags = IOMAP_DIO_OVERWRITE_ONLY;
+ +      ssize_t                 ret;
+ +
+ +      /*
+ +       * Extending writes need exclusivity because of the sub-block zeroing
+ +       * that the DIO code always does for partial tail blocks beyond EOF, so
+ +       * don't even bother trying the fast path in this case.
+ +       */
+ +      if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
+ +retry_exclusive:
+ +              if (iocb->ki_flags & IOCB_NOWAIT)
                         return -EAGAIN;
- -      } else {
- -              xfs_ilock(ip, iolock);
+ +              iolock = XFS_IOLOCK_EXCL;
+ +              flags = IOMAP_DIO_FORCE_WAIT;
         }
   
- -      ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ +      ret = xfs_ilock_iocb(iocb, iolock);
         if (ret)
- -              goto out;
- -      count = iov_iter_count(from);
+ +              return ret;
   
         /*
- -       * If we are doing unaligned IO, we can't allow any other overlapping IO
- -       * in-flight at the same time or we risk data corruption. Wait for all
- -       * other IO to drain before we submit. If the IO is aligned, demote the
- -       * iolock if we had to take the exclusive lock in
- -       * xfs_file_aio_write_checks() for other reasons.
+ +       * We can't properly handle unaligned direct I/O to reflink files yet,
+ +       * as we can't unshare a partial block.
          */
- -      if (unaligned_io) {
- -              inode_dio_wait(inode);
- -      } else if (iolock == XFS_IOLOCK_EXCL) {
- -              xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
- -              iolock = XFS_IOLOCK_SHARED;
+ +      if (xfs_is_cow_inode(ip)) {
+ +              trace_xfs_reflink_bounce_dio_write(iocb, from);
+ +              ret = -ENOTBLK;
+ +              goto out_unlock;
         }
   
- -      trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ +      ret = xfs_file_write_checks(iocb, from, &iolock);
+ +      if (ret)
+ +              goto out_unlock;
+ +
         /*
- -       * If unaligned, this is the only IO in-flight. Wait on it before we
- -       * release the iolock to prevent subsequent overlapping IO.
+ +       * If we are doing exclusive unaligned I/O, this must be the only I/O
+ +       * in-flight.  Otherwise we risk data corruption due to unwritten extent
+ +       * conversions from the AIO end_io handler.  Wait for all other I/O to
+ +       * drain first.
          */
+ +      if (flags & IOMAP_DIO_FORCE_WAIT)
+ +              inode_dio_wait(VFS_I(ip));
+ +
+ +      trace_xfs_file_direct_write(iocb, from);
         ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- -                         &xfs_dio_write_ops,
- -                         is_sync_kiocb(iocb) || unaligned_io);
- -out:
- -      xfs_iunlock(ip, iolock);
+ +                         &xfs_dio_write_ops, flags);
   
         /*
- -       * No fallback to buffered IO after short writes for XFS, direct I/O
- -       * will either complete fully or return an error.
+ +       * Retry unaligned I/O with exclusive blocking semantics if the DIO
+ +       * layer rejected it for mapping or locking reasons. If we are doing
+ +       * nonblocking user I/O, propagate the error.
          */
- -      ASSERT(ret < 0 || ret == count);
+ +      if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
+ +              ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
+ +              xfs_iunlock(ip, iolock);
+ +              goto retry_exclusive;
+ +      }
+ +
+ +out_unlock:
+ +      if (iolock)
+ +              xfs_iunlock(ip, iolock);
         return ret;
   }
   
+ +static ssize_t
+ +xfs_file_dio_write(
+ +      struct kiocb            *iocb,
+ +      struct iov_iter         *from)
+ +{
+ +      struct xfs_inode        *ip = XFS_I(file_inode(iocb->ki_filp));
+ +      struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+ +      size_t                  count = iov_iter_count(from);
+ +
+ +      /* direct I/O must be aligned to device logical sector size */
+ +      if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
+ +              return -EINVAL;
+ +      if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+ +              return xfs_file_dio_write_unaligned(ip, iocb, from);
+ +      return xfs_file_dio_write_aligned(ip, iocb, from);
+ +}
+ +
   static noinline ssize_t
   xfs_file_dax_write(
         struct kiocb            *iocb,
@@@ -671,26 -606,31 +672,26 @@@
         struct xfs_inode        *ip = XFS_I(inode);
         int                     iolock = XFS_IOLOCK_EXCL;
         ssize_t                 ret, error = 0;
- -      size_t                  count;
         loff_t                  pos;
   
- -      if (iocb->ki_flags & IOCB_NOWAIT) {
- -              if (!xfs_ilock_nowait(ip, iolock))
- -                      return -EAGAIN;
- -      } else {
- -              xfs_ilock(ip, iolock);
- -      }
- -
- -      ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ +      ret = xfs_ilock_iocb(iocb, iolock);
+ +      if (ret)
+ +              return ret;
+ +      ret = xfs_file_write_checks(iocb, from, &iolock);
         if (ret)
                 goto out;
   
         pos = iocb->ki_pos;
- -      count = iov_iter_count(from);
   
- -      trace_xfs_file_dax_write(ip, count, pos);
+ +      trace_xfs_file_dax_write(iocb, from);
         ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
         if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
                 i_size_write(inode, iocb->ki_pos);
                 error = xfs_setfilesize(ip, pos, ret);
         }
   out:
- -      xfs_iunlock(ip, iolock);
+ +      if (iolock)
+ +              xfs_iunlock(ip, iolock);
         if (error)
                 return error;
   
@@@ -704,7 -644,7 +705,7 @@@
   }
   
   STATIC ssize_t
- -xfs_file_buffered_aio_write(
+ +xfs_file_buffered_write(
         struct kiocb            *iocb,
         struct iov_iter         *from)
   {
@@@ -713,7 -653,7 +714,7 @@@
         struct inode            *inode = mapping->host;
         struct xfs_inode        *ip = XFS_I(inode);
         ssize_t                 ret;
- -      int                     enospc = 0;
+ +      bool                    cleared_space = false;
         int                     iolock;
   
         if (iocb->ki_flags & IOCB_NOWAIT)
@@@ -723,14 -663,14 +724,14 @@@ write_retry
         iolock = XFS_IOLOCK_EXCL;
         xfs_ilock(ip, iolock);
   
- -      ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+ +      ret = xfs_file_write_checks(iocb, from, &iolock);
         if (ret)
                 goto out;
   
         /* We can write back this queue in page reclaim */
         current->backing_dev_info = inode_to_bdi(inode);
   
- -      trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ +      trace_xfs_file_buffered_write(iocb, from);
         ret = iomap_file_buffered_write(iocb, from,
                         &xfs_buffered_write_iomap_ops);
         if (likely(ret >= 0))
@@@ -743,23 -683,27 +744,23 @@@
          * metadata space. This reduces the chances that the eofblocks scan
          * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
          * also behaves as a filter to prevent too many eofblocks scans from
- -       * running at the same time.
+ +       * running at the same time.  Use a synchronous scan to increase the
+ +       * effectiveness of the scan.
          */
- -      if (ret == -EDQUOT && !enospc) {
+ +      if (ret == -EDQUOT && !cleared_space) {
                 xfs_iunlock(ip, iolock);
- -              enospc = xfs_inode_free_quota_eofblocks(ip);
- -              if (enospc)
- -                      goto write_retry;
- -              enospc = xfs_inode_free_quota_cowblocks(ip);
- -              if (enospc)
- -                      goto write_retry;
- -              iolock = 0;
- -      } else if (ret == -ENOSPC && !enospc) {
+ +              xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC);
+ +              cleared_space = true;
+ +              goto write_retry;
+ +      } else if (ret == -ENOSPC && !cleared_space) {
                 struct xfs_eofblocks eofb = {0};
   
- -              enospc = 1;
+ +              cleared_space = true;
                 xfs_flush_inodes(ip->i_mount);
   
                 xfs_iunlock(ip, iolock);
                 eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
- -              xfs_icache_free_eofblocks(ip->i_mount, &eofb);
- -              xfs_icache_free_cowblocks(ip->i_mount, &eofb);
+ +              xfs_blockgc_free_space(ip->i_mount, &eofb);
                 goto write_retry;
         }
   
@@@ -806,12 -750,12 +807,12 @@@ xfs_file_write_iter
                  * CoW.  In all other directio scenarios we do not
                  * allow an operation to fall back to buffered mode.
                  */
- -              ret = xfs_file_dio_aio_write(iocb, from);
+ +              ret = xfs_file_dio_write(iocb, from);
                 if (ret != -ENOTBLK)
                         return ret;
         }
   
- -      return xfs_file_buffered_aio_write(iocb, from);
+ +      return xfs_file_buffered_write(iocb, from);
   }
   
   static void
@@@ -1051,7 -995,8 +1052,8 @@@ xfs_file_fallocate
   
                 iattr.ia_valid = ATTR_SIZE;
                 iattr.ia_size = new_size;
-               error = xfs_vn_setattr_size(file_dentry(file), &iattr);
+               error = xfs_vn_setattr_size(file_mnt_user_ns(file),
+                                           file_dentry(file), &iattr);
                 if (error)
                         goto out_unlock;
         }
@@@ -1376,19 -1321,17 +1378,19 @@@ xfs_filemap_pfn_mkwrite
         return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
   }
   
- -static void
+ +static vm_fault_t
   xfs_filemap_map_pages(
         struct vm_fault         *vmf,
         pgoff_t                 start_pgoff,
         pgoff_t                 end_pgoff)
   {
         struct inode            *inode = file_inode(vmf->vma->vm_file);
+ +      vm_fault_t ret;
   
         xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- -      filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ +      ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
         xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ +      return ret;
   }
   
   static const struct vm_operations_struct xfs_file_vm_ops = {
diff --combined fs/xfs/xfs_inode.c

index 636ac13,95b7f2b..46a861d
--- 1/fs/xfs/xfs_inode.c
--- 2/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@@ -766,6 -766,7 +766,7 @@@ xfs_inode_inherit_flags2
    */
   static int
   xfs_init_new_inode(
+       struct user_namespace   *mnt_userns,
         struct xfs_trans        *tp,
         struct xfs_inode        *pip,
         xfs_ino_t               ino,
@@@ -775,7 -776,6 +776,7 @@@
         prid_t                  prid,
         struct xfs_inode        **ipp)
   {
+ +      struct inode            *dir = pip ? VFS_I(pip) : NULL;
         struct xfs_mount        *mp = tp->t_mountp;
         struct xfs_inode        *ip;
         unsigned int            flags;
@@@ -805,17 -805,18 +806,17 @@@
   
         ASSERT(ip != NULL);
         inode = VFS_I(ip);
- -      inode->i_mode = mode;
         set_nlink(inode, nlink);
- -      inode->i_uid = fsuid_into_mnt(mnt_userns);
         inode->i_rdev = rdev;
         ip->i_d.di_projid = prid;
   
- -      if (pip && XFS_INHERIT_GID(pip)) {
- -              inode->i_gid = VFS_I(pip)->i_gid;
- -              if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
- -                      inode->i_mode |= S_ISGID;
+ +      if (dir && !(dir->i_mode & S_ISGID) &&
+ +          (mp->m_flags & XFS_MOUNT_GRPID)) {
-               inode->i_uid = current_fsuid();
++              inode->i_uid = fsuid_into_mnt(mnt_userns);
+ +              inode->i_gid = dir->i_gid;
+ +              inode->i_mode = mode;
         } else {
-               inode_init_owner(inode, dir, mode);
- -              inode->i_gid = fsgid_into_mnt(mnt_userns);
++              inode_init_owner(mnt_userns, inode, dir, mode);
         }
   
         /*
@@@ -824,7 -825,8 +825,8 @@@
          * (and only if the irix_sgid_inherit compatibility variable is set).
          */
         if (irix_sgid_inherit &&
-           (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
+           (inode->i_mode & S_ISGID) &&
+           !in_group_p(i_gid_into_mnt(mnt_userns, inode)))
                 inode->i_mode &= ~S_ISGID;
   
         ip->i_d.di_size = 0;
@@@ -901,6 -903,7 +903,7 @@@
    */
   int
   xfs_dir_ialloc(
+       struct user_namespace   *mnt_userns,
         struct xfs_trans        **tpp,
         struct xfs_inode        *dp,
         umode_t                 mode,
@@@ -933,7 -936,8 +936,8 @@@
                 return error;
         ASSERT(ino != NULLFSINO);
   
-       return xfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, prid, ipp);
+       return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev,
+                                 prid, ipp);
   }
   
   /*
@@@ -973,6 -977,7 +977,7 @@@ xfs_bumplink
   
   int
   xfs_create(
+       struct user_namespace   *mnt_userns,
         xfs_inode_t             *dp,
         struct xfs_name         *name,
         umode_t                 mode,
@@@ -1022,22 -1027,23 +1027,22 @@@
          * the case we'll drop the one we have and get a more
          * appropriate transaction later.
          */
- -      error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ +      error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ +                      &tp);
         if (error == -ENOSPC) {
                 /* flush outstanding delalloc blocks and retry */
                 xfs_flush_inodes(mp);
- -              error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
+ +              error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
+ +                              resblks, &tp);
         }
         if (error)
- -              goto out_release_inode;
+ +              goto out_release_dquots;
   
         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
         unlock_dp_on_error = true;
   
- -      /*
- -       * Reserve disk quota and the inode.
- -       */
- -      error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- -                                              pdqp, resblks, 1, 0);
+ +      error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ +                      XFS_IEXT_DIR_MANIP_CNT(mp));
         if (error)
                 goto out_trans_cancel;
   
@@@ -1046,7 -1052,8 +1051,8 @@@
          * entry pointing to them, but a directory also the "." entry
          * pointing to itself.
          */
-       error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip);
+       error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev,
+                              prid, &ip);
         if (error)
                 goto out_trans_cancel;
   
@@@ -1115,7 -1122,7 +1121,7 @@@
                 xfs_finish_inode_setup(ip);
                 xfs_irele(ip);
         }
- -
+ + out_release_dquots:
         xfs_qm_dqrele(udqp);
         xfs_qm_dqrele(gdqp);
         xfs_qm_dqrele(pdqp);
@@@ -1127,6 -1134,7 +1133,7 @@@
   
   int
   xfs_create_tmpfile(
+       struct user_namespace   *mnt_userns,
         struct xfs_inode        *dp,
         umode_t                 mode,
         struct xfs_inode        **ipp)
@@@ -1159,12 -1167,16 +1166,12 @@@
         resblks = XFS_IALLOC_SPACE_RES(mp);
         tres = &M_RES(mp)->tr_create_tmpfile;
   
- -      error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp);
- -      if (error)
- -              goto out_release_inode;
- -
- -      error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- -                                              pdqp, resblks, 1, 0);
+ +      error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
+ +                      &tp);
         if (error)
- -              goto out_trans_cancel;
+ +              goto out_release_dquots;
   
-       error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
+       error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, &ip);
         if (error)
                 goto out_trans_cancel;
   
@@@ -1205,7 -1217,7 +1212,7 @@@
                 xfs_finish_inode_setup(ip);
                 xfs_irele(ip);
         }
- -
+ + out_release_dquots:
         xfs_qm_dqrele(udqp);
         xfs_qm_dqrele(gdqp);
         xfs_qm_dqrele(pdqp);
@@@ -1253,11 -1265,6 +1260,11 @@@ xfs_link
         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
   
+ +      error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
+ +                      XFS_IEXT_DIR_MANIP_CNT(mp));
+ +      if (error)
+ +              goto error_return;
+ +
         /*
          * If we are using project inheritance, we only allow hard link
          * creation in our tree when the project IDs are the same; else
@@@ -2977,13 -2984,15 +2984,15 @@@ out_trans_abort
    */
   static int
   xfs_rename_alloc_whiteout(
+       struct user_namespace   *mnt_userns,
         struct xfs_inode        *dp,
         struct xfs_inode        **wip)
   {
         struct xfs_inode        *tmpfile;
         int                     error;
   
-       error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+       error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE,
+                                  &tmpfile);
         if (error)
                 return error;
   
@@@ -3005,6 -3014,7 +3014,7 @@@
    */
   int
   xfs_rename(
+       struct user_namespace   *mnt_userns,
         struct xfs_inode        *src_dp,
         struct xfs_name         *src_name,
         struct xfs_inode        *src_ip,
@@@ -3017,7 -3027,7 +3027,7 @@@
         struct xfs_trans        *tp;
         struct xfs_inode        *wip = NULL;            /* whiteout inode */
         struct xfs_inode        *inodes[__XFS_SORT_INODES];
- -      struct xfs_buf          *agibp;
+ +      int                     i;
         int                     num_inodes = __XFS_SORT_INODES;
         bool                    new_parent = (src_dp != target_dp);
         bool                    src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
@@@ -3036,7 -3046,7 +3046,7 @@@
          */
         if (flags & RENAME_WHITEOUT) {
                 ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
-               error = xfs_rename_alloc_whiteout(target_dp, &wip);
+               error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip);
                 if (error)
                         return error;
   
@@@ -3106,35 -3116,6 +3116,35 @@@
         /*
          * Check for expected errors before we dirty the transaction
          * so we can return an error without a transaction abort.
+ +       *
+ +       * Extent count overflow check:
+ +       *
+ +       * From the perspective of src_dp, a rename operation is essentially a
+ +       * directory entry remove operation. Hence the only place where we check
+ +       * for extent count overflow for src_dp is in
+ +       * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
+ +       * -ENOSPC when it detects a possible extent count overflow and in
+ +       * response, the higher layers of directory handling code do the
+ +       * following:
+ +       * 1. Data/Free blocks: XFS lets these blocks linger until a
+ +       *    future remove operation removes them.
+ +       * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
+ +       *    Leaf space and unmaps the last block.
+ +       *
+ +       * For target_dp, there are two cases depending on whether the
+ +       * destination directory entry exists or not.
+ +       *
+ +       * When destination directory entry does not exist (i.e. target_ip ==
+ +       * NULL), extent count overflow check is performed only when transaction
+ +       * has a non-zero sized space reservation associated with it.  With a
+ +       * zero-sized space reservation, XFS allows a rename operation to
+ +       * continue only when the directory has sufficient free space in its
+ +       * data/leaf/free space blocks to hold the new entry.
+ +       *
+ +       * When destination directory entry exists (i.e. target_ip != NULL), all
+ +       * we need to do is change the inode number associated with the already
+ +       * existing entry. Hence there is no need to perform an extent count
+ +       * overflow check.
          */
         if (target_ip == NULL) {
                 /*
@@@ -3145,12 -3126,6 +3155,12 @@@
                         error = xfs_dir_canenter(tp, target_dp, target_name);
                         if (error)
                                 goto out_trans_cancel;
+ +              } else {
+ +                      error = xfs_iext_count_may_overflow(target_dp,
+ +                                      XFS_DATA_FORK,
+ +                                      XFS_IEXT_DIR_MANIP_CNT(mp));
+ +                      if (error)
+ +                              goto out_trans_cancel;
                 }
         } else {
                 /*
@@@ -3166,30 -3141,6 +3176,30 @@@
         }
   
         /*
+ +       * Lock the AGI buffers we need to handle bumping the nlink of the
+ +       * whiteout inode off the unlinked list and to handle dropping the
+ +       * nlink of the target inode.  Per locking order rules, do this in
+ +       * increasing AG order and before directory block allocation tries to
+ +       * grab AGFs because we grab AGIs before AGFs.
+ +       *
+ +       * The (vfs) caller must ensure that if src is a directory then
+ +       * target_ip is either null or an empty directory.
+ +       */
+ +      for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ +              if (inodes[i] == wip ||
+ +                  (inodes[i] == target_ip &&
+ +                   (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ +                      struct xfs_buf  *bp;
+ +                      xfs_agnumber_t  agno;
+ +
+ +                      agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
+ +                      error = xfs_read_agi(mp, tp, agno, &bp);
+ +                      if (error)
+ +                              goto out_trans_cancel;
+ +              }
+ +      }
+ +
+ +      /*
          * Directory entry creation below may acquire the AGF. Remove
          * the whiteout from the unlinked list first to preserve correct
          * AGI/AGF locking order. This dirties the transaction so failures
@@@ -3241,6 -3192,22 +3251,6 @@@
                  * In case there is already an entry with the same
                  * name at the destination directory, remove it first.
                  */
- -
- -              /*
- -               * Check whether the replace operation will need to allocate
- -               * blocks.  This happens when the shortform directory lacks
- -               * space and we have to convert it to a block format directory.
- -               * When more blocks are necessary, we must lock the AGI first
- -               * to preserve locking order (AGI -> AGF).
- -               */
- -              if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
- -                      error = xfs_read_agi(mp, tp,
- -                                      XFS_INO_TO_AGNO(mp, target_ip->i_ino),
- -                                      &agibp);
- -                      if (error)
- -                              goto out_trans_cancel;
- -              }
- -
                 error = xfs_dir_replace(tp, target_dp, target_name,
                                         src_ip->i_ino, spaceres);
                 if (error)
@@@ -3316,16 -3283,9 +3326,16 @@@
         if (wip) {
                 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
                                         spaceres);
- -      } else
+ +      } else {
+ +              /*
+ +               * NOTE: We don't need to check for extent count overflow here
+ +               * because the dir remove name code will leave the dir block in
+ +               * place if the extent count would overflow.
+ +               */
                 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
                                            spaceres);
+ +      }
+ +
         if (error)
                 goto out_trans_cancel;
   
diff --combined fs/xfs/xfs_ioctl.c

index 248083e,3d4c7ca..99dfe89
--- 1/fs/xfs/xfs_ioctl.c
--- 2/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@@ -693,7 -693,8 +693,8 @@@ xfs_ioc_space
   
         iattr.ia_valid = ATTR_SIZE;
         iattr.ia_size = bf->l_start;
-       error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
+       error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp),
+                                   &iattr);
         if (error)
                 goto out_unlock;
   
@@@ -734,13 -735,15 +735,15 @@@ xfs_fsinumbers_fmt
   
   STATIC int
   xfs_ioc_fsbulkstat(
-       xfs_mount_t             *mp,
+       struct file             *file,
         unsigned int            cmd,
         void                    __user *arg)
   {
+       struct xfs_mount        *mp = XFS_I(file_inode(file))->i_mount;
         struct xfs_fsop_bulkreq bulkreq;
         struct xfs_ibulk        breq = {
                 .mp             = mp,
+               .mnt_userns     = file_mnt_user_ns(file),
                 .ocount         = 0,
         };
         xfs_ino_t               lastino;
@@@ -908,13 -911,15 +911,15 @@@ xfs_bulk_ireq_teardown
   /* Handle the v5 bulkstat ioctl. */
   STATIC int
   xfs_ioc_bulkstat(
-       struct xfs_mount                *mp,
+       struct file                     *file,
         unsigned int                    cmd,
         struct xfs_bulkstat_req __user  *arg)
   {
+       struct xfs_mount                *mp = XFS_I(file_inode(file))->i_mount;
         struct xfs_bulk_ireq            hdr;
         struct xfs_ibulk                breq = {
                 .mp                     = mp,
+               .mnt_userns             = file_mnt_user_ns(file),
         };
         int                             error;
   
@@@ -1275,23 -1280,25 +1280,24 @@@ xfs_ioctl_setattr_prepare_dax
    */
   static struct xfs_trans *
   xfs_ioctl_setattr_get_trans(
-       struct xfs_inode        *ip,
- -      struct file             *file)
++      struct file             *file,
+ +      struct xfs_dquot        *pdqp)
   {
+       struct xfs_inode        *ip = XFS_I(file_inode(file));
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_trans        *tp;
         int                     error = -EROFS;
   
         if (mp->m_flags & XFS_MOUNT_RDONLY)
- -              goto out_unlock;
+ +              goto out_error;
         error = -EIO;
         if (XFS_FORCED_SHUTDOWN(mp))
- -              goto out_unlock;
+ +              goto out_error;
   
- -      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ +      error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
+ +                      capable(CAP_FOWNER), &tp);
         if (error)
- -              goto out_unlock;
- -
- -      xfs_ilock(ip, XFS_ILOCK_EXCL);
- -      xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ +              goto out_error;
   
         /*
          * CAP_FOWNER overrides the following restrictions:
@@@ -1299,7 -1306,7 +1305,7 @@@
          * The user ID of the calling process must be equal to the file owner
          * ID, except in cases where the CAP_FSETID capability is applicable.
          */
-       if (!inode_owner_or_capable(VFS_I(ip))) {
+       if (!inode_owner_or_capable(file_mnt_user_ns(file), VFS_I(ip))) {
                 error = -EPERM;
                 goto out_cancel;
         }
@@@ -1311,7 -1318,7 +1317,7 @@@
   
   out_cancel:
         xfs_trans_cancel(tp);
- -out_unlock:
+ +out_error:
         return ERR_PTR(error);
   }
   
@@@ -1427,21 -1434,23 +1433,23 @@@ xfs_ioctl_setattr_check_projid
   
   STATIC int
   xfs_ioctl_setattr(
-       xfs_inode_t             *ip,
+       struct file             *file,
         struct fsxattr          *fa)
   {
+       struct user_namespace   *mnt_userns = file_mnt_user_ns(file);
+       struct xfs_inode        *ip = XFS_I(file_inode(file));
         struct fsxattr          old_fa;
         struct xfs_mount        *mp = ip->i_mount;
         struct xfs_trans        *tp;
         struct xfs_dquot        *pdqp = NULL;
         struct xfs_dquot        *olddquot = NULL;
- -      int                     code;
+ +      int                     error;
   
         trace_xfs_ioctl_setattr(ip);
   
- -      code = xfs_ioctl_setattr_check_projid(ip, fa);
- -      if (code)
- -              return code;
+ +      error = xfs_ioctl_setattr_check_projid(ip, fa);
+ +      if (error)
+ +              return error;
   
         /*
          * If disk quotas is on, we make sure that the dquots do exist on disk,
@@@ -1452,36 -1461,44 +1460,36 @@@
          * because the i_*dquot fields will get updated anyway.
          */
         if (XFS_IS_QUOTA_ON(mp)) {
- -              code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ +              error = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
                                 VFS_I(ip)->i_gid, fa->fsx_projid,
                                 XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
- -              if (code)
- -                      return code;
+ +              if (error)
+ +                      return error;
         }
   
         xfs_ioctl_setattr_prepare_dax(ip, fa);
   
-       tp = xfs_ioctl_setattr_get_trans(ip, pdqp);
- -      tp = xfs_ioctl_setattr_get_trans(file);
++      tp = xfs_ioctl_setattr_get_trans(file, pdqp);
         if (IS_ERR(tp)) {
- -              code = PTR_ERR(tp);
+ +              error = PTR_ERR(tp);
                 goto error_free_dquots;
         }
   
- -      if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- -          ip->i_d.di_projid != fa->fsx_projid) {
- -              code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp,
- -                              capable(CAP_FOWNER) ?  XFS_QMOPT_FORCE_RES : 0);
- -              if (code)       /* out of quota */
- -                      goto error_trans_cancel;
- -      }
- -
         xfs_fill_fsxattr(ip, false, &old_fa);
- -      code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
- -      if (code)
+ +      error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+ +      if (error)
                 goto error_trans_cancel;
   
- -      code = xfs_ioctl_setattr_check_extsize(ip, fa);
- -      if (code)
+ +      error = xfs_ioctl_setattr_check_extsize(ip, fa);
+ +      if (error)
                 goto error_trans_cancel;
   
- -      code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
- -      if (code)
+ +      error = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+ +      if (error)
                 goto error_trans_cancel;
   
- -      code = xfs_ioctl_setattr_xflags(tp, ip, fa);
- -      if (code)
+ +      error = xfs_ioctl_setattr_xflags(tp, ip, fa);
+ +      if (error)
                 goto error_trans_cancel;
   
         /*
@@@ -1493,7 -1510,7 +1501,7 @@@
          */
   
         if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
-           !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+           !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID))
                 VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
   
         /* Change the ownerships and register project quota modifications */
@@@ -1521,7 -1538,7 +1529,7 @@@
         else
                 ip->i_d.di_cowextsize = 0;
   
- -      code = xfs_trans_commit(tp);
+ +      error = xfs_trans_commit(tp);
   
         /*
          * Release any dquot(s) the inode had kept before chown.
@@@ -1529,18 -1546,17 +1537,17 @@@
         xfs_qm_dqrele(olddquot);
         xfs_qm_dqrele(pdqp);
   
- -      return code;
+ +      return error;
   
   error_trans_cancel:
         xfs_trans_cancel(tp);
   error_free_dquots:
         xfs_qm_dqrele(pdqp);
- -      return code;
+ +      return error;
   }
   
   STATIC int
   xfs_ioc_fssetxattr(
-       xfs_inode_t             *ip,
         struct file             *filp,
         void                    __user *arg)
   {
@@@ -1553,7 -1569,7 +1560,7 @@@
         error = mnt_want_write_file(filp);
         if (error)
                 return error;
-       error = xfs_ioctl_setattr(ip, &fa);
+       error = xfs_ioctl_setattr(filp, &fa);
         mnt_drop_write_file(filp);
         return error;
   }
@@@ -1599,7 -1615,7 +1606,7 @@@ xfs_ioc_setxflags
   
         xfs_ioctl_setattr_prepare_dax(ip, &fa);
   
-       tp = xfs_ioctl_setattr_get_trans(ip, NULL);
- -      tp = xfs_ioctl_setattr_get_trans(filp);
++      tp = xfs_ioctl_setattr_get_trans(filp, NULL);
         if (IS_ERR(tp)) {
                 error = PTR_ERR(tp);
                 goto out_drop_write;
@@@ -2110,10 -2126,10 +2117,10 @@@ xfs_file_ioctl
         case XFS_IOC_FSBULKSTAT_SINGLE:
         case XFS_IOC_FSBULKSTAT:
         case XFS_IOC_FSINUMBERS:
-               return xfs_ioc_fsbulkstat(mp, cmd, arg);
+               return xfs_ioc_fsbulkstat(filp, cmd, arg);
   
         case XFS_IOC_BULKSTAT:
-               return xfs_ioc_bulkstat(mp, cmd, arg);
+               return xfs_ioc_bulkstat(filp, cmd, arg);
         case XFS_IOC_INUMBERS:
                 return xfs_ioc_inumbers(mp, cmd, arg);
   
@@@ -2135,7 -2151,7 +2142,7 @@@
         case XFS_IOC_FSGETXATTRA:
                 return xfs_ioc_fsgetxattr(ip, 1, arg);
         case XFS_IOC_FSSETXATTR:
-               return xfs_ioc_fssetxattr(ip, filp, arg);
+               return xfs_ioc_fssetxattr(filp, arg);
         case XFS_IOC_GETXFLAGS:
                 return xfs_ioc_getxflags(ip, arg);
         case XFS_IOC_SETXFLAGS:
@@@ -2251,7 -2267,7 +2258,7 @@@
         }
   
         case XFS_IOC_FSGROWFSDATA: {
- -              xfs_growfs_data_t in;
+ +              struct xfs_growfs_data in;
   
                 if (copy_from_user(&in, arg, sizeof(in)))
                         return -EFAULT;
@@@ -2265,7 -2281,7 +2272,7 @@@
         }
   
         case XFS_IOC_FSGROWFSLOG: {
- -              xfs_growfs_log_t in;
+ +              struct xfs_growfs_log in;
   
                 if (copy_from_user(&in, arg, sizeof(in)))
                         return -EFAULT;
@@@ -2339,10 -2355,8 +2346,10 @@@
                 if (error)
                         return error;
   
+ +              trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_);
+ +
                 sb_start_write(mp->m_super);
- -              error = xfs_icache_free_eofblocks(mp, &keofb);
+ +              error = xfs_blockgc_free_space(mp, &keofb);
                 sb_end_write(mp->m_super);
                 return error;
         }
diff --combined fs/xfs/xfs_iops.c

index 0036950,816a0f7..66ebccb
--- 1/fs/xfs/xfs_iops.c
--- 2/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@@ -128,6 -128,7 +128,7 @@@ xfs_cleanup_inode
   
   STATIC int
   xfs_generic_create(
+       struct user_namespace   *mnt_userns,
         struct inode    *dir,
         struct dentry   *dentry,
         umode_t         mode,
@@@ -161,9 -162,10 +162,10 @@@
                 goto out_free_acl;
   
         if (!tmpfile) {
-               error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+               error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev,
+                                  &ip);
         } else {
-               error = xfs_create_tmpfile(XFS_I(dir), mode, &ip);
+               error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip);
         }
         if (unlikely(error))
                 goto out_free_acl;
@@@ -220,31 -222,35 +222,35 @@@
   
   STATIC int
   xfs_vn_mknod(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       umode_t         mode,
-       dev_t           rdev)
+       struct user_namespace   *mnt_userns,
+       struct inode            *dir,
+       struct dentry           *dentry,
+       umode_t                 mode,
+       dev_t                   rdev)
   {
-       return xfs_generic_create(dir, dentry, mode, rdev, false);
+       return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false);
   }
   
   STATIC int
   xfs_vn_create(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       umode_t         mode,
-       bool            flags)
+       struct user_namespace   *mnt_userns,
+       struct inode            *dir,
+       struct dentry           *dentry,
+       umode_t                 mode,
+       bool                    flags)
   {
-       return xfs_generic_create(dir, dentry, mode, 0, false);
+       return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false);
   }
   
   STATIC int
   xfs_vn_mkdir(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       umode_t         mode)
+       struct user_namespace   *mnt_userns,
+       struct inode            *dir,
+       struct dentry           *dentry,
+       umode_t                 mode)
   {
-       return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
+       return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0,
+                                 false);
   }
   
   STATIC struct dentry *
@@@ -361,9 -367,10 +367,10 @@@ xfs_vn_unlink
   
   STATIC int
   xfs_vn_symlink(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       const char      *symname)
+       struct user_namespace   *mnt_userns,
+       struct inode            *dir,
+       struct dentry           *dentry,
+       const char              *symname)
   {
         struct inode    *inode;
         struct xfs_inode *cip = NULL;
@@@ -377,7 -384,7 +384,7 @@@
         if (unlikely(error))
                 goto out;
   
-       error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
+       error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip);
         if (unlikely(error))
                 goto out;
   
@@@ -403,11 -410,12 +410,12 @@@
   
   STATIC int
   xfs_vn_rename(
-       struct inode    *odir,
-       struct dentry   *odentry,
-       struct inode    *ndir,
-       struct dentry   *ndentry,
-       unsigned int    flags)
+       struct user_namespace   *mnt_userns,
+       struct inode            *odir,
+       struct dentry           *odentry,
+       struct inode            *ndir,
+       struct dentry           *ndentry,
+       unsigned int            flags)
   {
         struct inode    *new_inode = d_inode(ndentry);
         int             omode = 0;
@@@ -431,8 -439,8 +439,8 @@@
         if (unlikely(error))
                 return error;
   
-       return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
-                         XFS_I(ndir), &nname,
+       return xfs_rename(mnt_userns, XFS_I(odir), &oname,
+                         XFS_I(d_inode(odentry)), XFS_I(ndir), &nname,
                           new_inode ? XFS_I(new_inode) : NULL, flags);
   }
   
@@@ -529,6 -537,7 +537,7 @@@ xfs_stat_blksize
   
   STATIC int
   xfs_vn_getattr(
+       struct user_namespace   *mnt_userns,
         const struct path       *path,
         struct kstat            *stat,
         u32                     request_mask,
@@@ -547,8 -556,8 +556,8 @@@
         stat->dev = inode->i_sb->s_dev;
         stat->mode = inode->i_mode;
         stat->nlink = inode->i_nlink;
-       stat->uid = inode->i_uid;
-       stat->gid = inode->i_gid;
+       stat->uid = i_uid_into_mnt(mnt_userns, inode);
+       stat->gid = i_gid_into_mnt(mnt_userns, inode);
         stat->ino = ip->i_ino;
         stat->atime = inode->i_atime;
         stat->mtime = inode->i_mtime;
@@@ -626,8 -635,9 +635,9 @@@ xfs_setattr_time
   
   static int
   xfs_vn_change_ok(
-       struct dentry   *dentry,
-       struct iattr    *iattr)
+       struct user_namespace   *mnt_userns,
+       struct dentry           *dentry,
+       struct iattr            *iattr)
   {
         struct xfs_mount        *mp = XFS_I(d_inode(dentry))->i_mount;
   
@@@ -637,7 -647,7 +647,7 @@@
         if (XFS_FORCED_SHUTDOWN(mp))
                 return -EIO;
   
-       return setattr_prepare(dentry, iattr);
+       return setattr_prepare(mnt_userns, dentry, iattr);
   }
   
   /*
@@@ -648,6 -658,7 +658,7 @@@
    */
   static int
   xfs_setattr_nonsize(
+       struct user_namespace   *mnt_userns,
         struct xfs_inode        *ip,
         struct iattr            *iattr)
   {
@@@ -700,11 -711,13 +711,11 @@@
                         return error;
         }
   
- -      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+ +      error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
+ +                      capable(CAP_FOWNER), &tp);
         if (error)
                 goto out_dqrele;
   
- -      xfs_ilock(ip, XFS_ILOCK_EXCL);
- -      xfs_trans_ijoin(tp, ip, 0);
- -
         /*
          * Change file ownership.  Must be the owner or privileged.
          */
@@@ -721,6 -734,21 +732,6 @@@
                 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
   
                 /*
- -               * Do a quota reservation only if uid/gid is actually
- -               * going to change.
- -               */
- -              if (XFS_IS_QUOTA_RUNNING(mp) &&
- -                  ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
- -                   (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
- -                      ASSERT(tp);
- -                      error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
- -                                              NULL, capable(CAP_FOWNER) ?
- -                                              XFS_QMOPT_FORCE_RES : 0);
- -                      if (error)      /* out of quota */
- -                              goto out_cancel;
- -              }
- -
- -              /*
                  * CAP_FSETID overrides the following restrictions:
                  *
                  * The set-user-ID and set-group-ID bits of a file will be
@@@ -769,6 -797,8 +780,6 @@@
                 xfs_trans_set_sync(tp);
         error = xfs_trans_commit(tp);
   
- -      xfs_iunlock(ip, XFS_ILOCK_EXCL);
- -
         /*
          * Release any dquot(s) the inode had kept before chown.
          */
@@@ -788,13 -818,16 +799,13 @@@
          *           Posix ACL code seems to care about this issue either.
          */
         if (mask & ATTR_MODE) {
-               error = posix_acl_chmod(inode, inode->i_mode);
+               error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
                 if (error)
                         return error;
         }
   
         return 0;
   
- -out_cancel:
- -      xfs_trans_cancel(tp);
- -      xfs_iunlock(ip, XFS_ILOCK_EXCL);
   out_dqrele:
         xfs_qm_dqrele(udqp);
         xfs_qm_dqrele(gdqp);
@@@ -809,6 -842,7 +820,7 @@@
    */
   STATIC int
   xfs_setattr_size(
+       struct user_namespace   *mnt_userns,
         struct xfs_inode        *ip,
         struct iattr            *iattr)
   {
@@@ -824,7 -858,7 +836,7 @@@
         ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
         ASSERT(S_ISREG(inode->i_mode));
         ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- -              ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+ +              ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
   
         oldsize = inode->i_size;
         newsize = iattr->ia_size;
@@@ -840,7 -874,7 +852,7 @@@
                  * Use the regular setattr path to update the timestamps.
                  */
                 iattr->ia_valid &= ~ATTR_SIZE;
-               return xfs_setattr_nonsize(ip, iattr);
+               return xfs_setattr_nonsize(mnt_userns, ip, iattr);
         }
   
         /*
@@@ -1009,6 -1043,7 +1021,7 @@@ out_trans_cancel
   
   int
   xfs_vn_setattr_size(
+       struct user_namespace   *mnt_userns,
         struct dentry           *dentry,
         struct iattr            *iattr)
   {
@@@ -1017,14 -1052,15 +1030,15 @@@
   
         trace_xfs_setattr(ip);
   
-       error = xfs_vn_change_ok(dentry, iattr);
+       error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
         if (error)
                 return error;
-       return xfs_setattr_size(ip, iattr);
+       return xfs_setattr_size(mnt_userns, ip, iattr);
   }
   
   STATIC int
   xfs_vn_setattr(
+       struct user_namespace   *mnt_userns,
         struct dentry           *dentry,
         struct iattr            *iattr)
   {
@@@ -1044,14 -1080,14 +1058,14 @@@
                         return error;
                 }
   
-               error = xfs_vn_setattr_size(dentry, iattr);
+               error = xfs_vn_setattr_size(mnt_userns, dentry, iattr);
                 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
         } else {
                 trace_xfs_setattr(ip);
   
-               error = xfs_vn_change_ok(dentry, iattr);
+               error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
                 if (!error)
-                       error = xfs_setattr_nonsize(ip, iattr);
+                       error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
         }
   
         return error;
@@@ -1122,11 -1158,12 +1136,12 @@@ xfs_vn_fiemap
   
   STATIC int
   xfs_vn_tmpfile(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       umode_t         mode)
+       struct user_namespace   *mnt_userns,
+       struct inode            *dir,
+       struct dentry           *dentry,
+       umode_t                 mode)
   {
-       return xfs_generic_create(dir, dentry, mode, 0, true);
+       return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true);
   }
   
   static const struct inode_operations xfs_inode_operations = {
diff --combined fs/xfs/xfs_qm.c

index 742d141,1b7b139..bfa4164
--- 1/fs/xfs/xfs_qm.c
--- 2/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@@ -787,7 -787,8 +787,8 @@@ xfs_qm_qino_alloc
                 return error;
   
         if (need_alloc) {
-               error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ipp);
+               error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0,
+                                      0, ipp);
                 if (error) {
                         xfs_trans_cancel(tp);
                         return error;
@@@ -1786,35 -1787,105 +1787,35 @@@ xfs_qm_vop_chown
         xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
   
         /*
- -       * Take an extra reference, because the inode is going to keep
- -       * this dquot pointer even after the trans_commit.
+ +       * Back when we made quota reservations for the chown, we reserved the
+ +       * ondisk blocks + delalloc blocks with the new dquot.  Now that we've
+ +       * switched the dquots, decrease the new dquot's block reservation
+ +       * (having already bumped up the real counter) so that we don't have
+ +       * any reservation to give back when we commit.
          */
- -      *IO_olddq = xfs_qm_dqhold(newdq);
- -
- -      return prevdq;
- -}
- -
- -/*
- - * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- - */
- -int
- -xfs_qm_vop_chown_reserve(
- -      struct xfs_trans        *tp,
- -      struct xfs_inode        *ip,
- -      struct xfs_dquot        *udqp,
- -      struct xfs_dquot        *gdqp,
- -      struct xfs_dquot        *pdqp,
- -      uint                    flags)
- -{
- -      struct xfs_mount        *mp = ip->i_mount;
- -      uint64_t                delblks;
- -      unsigned int            blkflags;
- -      struct xfs_dquot        *udq_unres = NULL;
- -      struct xfs_dquot        *gdq_unres = NULL;
- -      struct xfs_dquot        *pdq_unres = NULL;
- -      struct xfs_dquot        *udq_delblks = NULL;
- -      struct xfs_dquot        *gdq_delblks = NULL;
- -      struct xfs_dquot        *pdq_delblks = NULL;
- -      int                     error;
- -
- -
- -      ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- -      ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- -
- -      delblks = ip->i_delayed_blks;
- -      blkflags = XFS_IS_REALTIME_INODE(ip) ?
- -                      XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
+ +      xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ +                      -ip->i_delayed_blks);
   
- -      if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- -          i_uid_read(VFS_I(ip)) != udqp->q_id) {
- -              udq_delblks = udqp;
- -              /*
- -               * If there are delayed allocation blocks, then we have to
- -               * unreserve those from the old dquot, and add them to the
- -               * new dquot.
- -               */
- -              if (delblks) {
- -                      ASSERT(ip->i_udquot);
- -                      udq_unres = ip->i_udquot;
- -              }
- -      }
- -      if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- -          i_gid_read(VFS_I(ip)) != gdqp->q_id) {
- -              gdq_delblks = gdqp;
- -              if (delblks) {
- -                      ASSERT(ip->i_gdquot);
- -                      gdq_unres = ip->i_gdquot;
- -              }
- -      }
- -
- -      if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- -          ip->i_d.di_projid != pdqp->q_id) {
- -              pdq_delblks = pdqp;
- -              if (delblks) {
- -                      ASSERT(ip->i_pdquot);
- -                      pdq_unres = ip->i_pdquot;
- -              }
- -      }
- -
- -      error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- -                              udq_delblks, gdq_delblks, pdq_delblks,
- -                              ip->i_d.di_nblocks, 1, flags | blkflags);
- -      if (error)
- -              return error;
+ +      /*
+ +       * Give the incore reservation for delalloc blocks back to the old
+ +       * dquot.  We don't normally handle delalloc quota reservations
+ +       * transactionally, so just lock the dquot and subtract from the
+ +       * reservation.  Dirty the transaction because it's too late to turn
+ +       * back now.
+ +       */
+ +      tp->t_flags |= XFS_TRANS_DIRTY;
+ +      xfs_dqlock(prevdq);
+ +      ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ +      prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ +      xfs_dqunlock(prevdq);
   
         /*
- -       * Do the delayed blks reservations/unreservations now. Since, these
- -       * are done without the help of a transaction, if a reservation fails
- -       * its previous reservations won't be automatically undone by trans
- -       * code. So, we have to do it manually here.
+ +       * Take an extra reference, because the inode is going to keep
+ +       * this dquot pointer even after the trans_commit.
          */
- -      if (delblks) {
- -              /*
- -               * Do the reservations first. Unreservation can't fail.
- -               */
- -              ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
- -              ASSERT(udq_unres || gdq_unres || pdq_unres);
- -              error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- -                          udq_delblks, gdq_delblks, pdq_delblks,
- -                          (xfs_qcnt_t)delblks, 0, flags | blkflags);
- -              if (error)
- -                      return error;
- -              xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- -                              udq_unres, gdq_unres, pdq_unres,
- -                              -((xfs_qcnt_t)delblks), 0, blkflags);
- -      }
+ +      *IO_olddq = xfs_qm_dqhold(newdq);
   
- -      return 0;
+ +      return prevdq;
   }
   
   int
diff --combined fs/xfs/xfs_super.c

index 586d423,e95c1ef..e5e0713
--- 1/fs/xfs/xfs_super.c
--- 2/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@@ -35,7 -35,6 +35,7 @@@
   #include "xfs_refcount_item.h"
   #include "xfs_bmap_item.h"
   #include "xfs_reflink.h"
+ +#include "xfs_pwork.h"
   
   #include <linux/magic.h>
   #include <linux/fs_context.h>
@@@ -343,7 -342,7 +343,7 @@@ voi
   xfs_blkdev_issue_flush(
         xfs_buftarg_t           *buftarg)
   {
- -      blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
+ +      blkdev_issue_flush(buftarg->bt_bdev);
   }
   
   STATIC void
@@@ -496,44 -495,40 +496,44 @@@ xfs_init_mount_workqueues
         struct xfs_mount        *mp)
   {
         mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- -                      WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id);
+ +                      XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ +                      1, mp->m_super->s_id);
         if (!mp->m_buf_workqueue)
                 goto out;
   
         mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- -                      WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ +                      XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ +                      0, mp->m_super->s_id);
         if (!mp->m_unwritten_workqueue)
                 goto out_destroy_buf;
   
         mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- -                      WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+ +                      XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
                         0, mp->m_super->s_id);
         if (!mp->m_cil_workqueue)
                 goto out_destroy_unwritten;
   
         mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- -                      WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
+ +                      XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ +                      0, mp->m_super->s_id);
         if (!mp->m_reclaim_workqueue)
                 goto out_destroy_cil;
   
- -      mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- -                      WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
- -      if (!mp->m_eofblocks_workqueue)
+ +      mp->m_blockgc_workqueue = alloc_workqueue("xfs-blockgc/%s",
+ +                      WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM,
+ +                      0, mp->m_super->s_id);
+ +      if (!mp->m_blockgc_workqueue)
                 goto out_destroy_reclaim;
   
- -      mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
- -                                             mp->m_super->s_id);
+ +      mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
+ +                      XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
         if (!mp->m_sync_workqueue)
                 goto out_destroy_eofb;
   
         return 0;
   
   out_destroy_eofb:
- -      destroy_workqueue(mp->m_eofblocks_workqueue);
+ +      destroy_workqueue(mp->m_blockgc_workqueue);
   out_destroy_reclaim:
         destroy_workqueue(mp->m_reclaim_workqueue);
   out_destroy_cil:
@@@ -551,7 -546,7 +551,7 @@@ xfs_destroy_mount_workqueues
         struct xfs_mount        *mp)
   {
         destroy_workqueue(mp->m_sync_workqueue);
- -      destroy_workqueue(mp->m_eofblocks_workqueue);
+ +      destroy_workqueue(mp->m_blockgc_workqueue);
         destroy_workqueue(mp->m_reclaim_workqueue);
         destroy_workqueue(mp->m_cil_workqueue);
         destroy_workqueue(mp->m_unwritten_workqueue);
@@@ -873,6 -868,39 +873,6 @@@ xfs_restore_resvblks(struct xfs_mount *
   }
   
   /*
- - * Trigger writeback of all the dirty metadata in the file system.
- - *
- - * This ensures that the metadata is written to their location on disk rather
- - * than just existing in transactions in the log. This means after a quiesce
- - * there is no log replay required to write the inodes to disk - this is the
- - * primary difference between a sync and a quiesce.
- - *
- - * We cancel log work early here to ensure all transactions the log worker may
- - * run have finished before we clean up and log the superblock and write an
- - * unmount record. The unfreeze process is responsible for restarting the log
- - * worker correctly.
- - */
- -void
- -xfs_quiesce_attr(
- -      struct xfs_mount        *mp)
- -{
- -      int     error = 0;
- -
- -      cancel_delayed_work_sync(&mp->m_log->l_work);
- -
- -      /* force the log to unpin objects from the now complete transactions */
- -      xfs_log_force(mp, XFS_LOG_SYNC);
- -
- -
- -      /* Push the superblock and write an unmount record */
- -      error = xfs_log_sbcount(mp);
- -      if (error)
- -              xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
- -                              "Frozen image may not be consistent.");
- -      xfs_log_quiesce(mp);
- -}
- -
- -/*
    * Second stage of a freeze. The data is already frozen so we only
    * need to take care of the metadata. Once that's done sync the superblock
    * to the log to dirty it in case of a crash while frozen. This ensures that we
@@@ -892,9 -920,10 +892,9 @@@ xfs_fs_freeze
          * set a GFP_NOFS context here to avoid recursion deadlocks.
          */
         flags = memalloc_nofs_save();
- -      xfs_stop_block_reaping(mp);
+ +      xfs_blockgc_stop(mp);
         xfs_save_resvblks(mp);
- -      xfs_quiesce_attr(mp);
- -      ret = xfs_sync_sb(mp, true);
+ +      ret = xfs_log_quiesce(mp);
         memalloc_nofs_restore(flags);
         return ret;
   }
@@@ -907,7 -936,7 +907,7 @@@ xfs_fs_unfreeze
   
         xfs_restore_resvblks(mp);
         xfs_log_work_queue(mp);
- -      xfs_start_block_reaping(mp);
+ +      xfs_blockgc_start(mp);
         return 0;
   }
   
@@@ -1691,7 -1720,7 +1691,7 @@@ xfs_remount_rw
                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                 return error;
         }
- -      xfs_start_block_reaping(mp);
+ +      xfs_blockgc_start(mp);
   
         /* Create the per-AG metadata reservation pool .*/
         error = xfs_fs_reserve_ag_blocks(mp);
@@@ -1711,10 -1740,10 +1711,10 @@@ xfs_remount_ro
          * Cancel background eofb scanning so it cannot race with the final
          * log force+buftarg wait and deadlock the remount.
          */
- -      xfs_stop_block_reaping(mp);
+ +      xfs_blockgc_stop(mp);
   
         /* Get rid of any leftover CoW reservations... */
- -      error = xfs_icache_free_cowblocks(mp, NULL);
+ +      error = xfs_blockgc_free_space(mp, NULL);
         if (error) {
                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                 return error;
@@@ -1736,7 -1765,7 +1736,7 @@@
          */
         xfs_save_resvblks(mp);
   
- -      xfs_quiesce_attr(mp);
+ +      xfs_log_clean(mp);
         mp->m_flags |= XFS_MOUNT_RDONLY;
   
         return 0;
@@@ -1843,6 -1872,8 +1843,6 @@@ static int xfs_init_fs_context
         mutex_init(&mp->m_growlock);
         INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
         INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
- -      INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
- -      INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
         mp->m_kobj.kobject.kset = xfs_kset;
         /*
          * We don't create the finobt per-ag space reservation until after log
@@@ -1881,7 -1912,7 +1881,7 @@@ static struct file_system_type xfs_fs_t
         .init_fs_context        = xfs_init_fs_context,
         .parameters             = xfs_fs_parameters,
         .kill_sb                = kill_block_super,
-       .fs_flags               = FS_REQUIRES_DEV,
+       .fs_flags               = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
   };
   MODULE_ALIAS_FS("xfs");
   
@@@ -2088,12 -2119,11 +2088,12 @@@ xfs_init_workqueues(void
          * max_active value for this workqueue.
          */
         xfs_alloc_wq = alloc_workqueue("xfsalloc",
- -                      WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
+ +                      XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
         if (!xfs_alloc_wq)
                 return -ENOMEM;
   
- -      xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+ +      xfs_discard_wq = alloc_workqueue("xfsdiscard", XFS_WQFLAGS(WQ_UNBOUND),
+ +                      0);
         if (!xfs_discard_wq)
                 goto out_free_alloc_wq;
   
diff --combined fs/xfs/xfs_symlink.c

index 8565663,77c8ea3..1379013
--- 1/fs/xfs/xfs_symlink.c
--- 2/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@@ -134,6 -134,7 +134,7 @@@ xfs_readlink
   
   int
   xfs_symlink(
+       struct user_namespace   *mnt_userns,
         struct xfs_inode        *dp,
         struct xfs_name         *link_name,
         const char              *target_path,
@@@ -197,10 -198,9 +198,10 @@@
                 fs_blocks = xfs_symlink_blocks(mp, pathlen);
         resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
   
- -      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp);
+ +      error = xfs_trans_alloc_icreate(mp, &M_RES(mp)->tr_symlink, udqp, gdqp,
+ +                      pdqp, resblks, &tp);
         if (error)
- -              goto out_release_inode;
+ +              goto out_release_dquots;
   
         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
         unlock_dp_on_error = true;
@@@ -213,16 -213,19 +214,16 @@@
                 goto out_trans_cancel;
         }
   
- -      /*
- -       * Reserve disk quota : blocks and inode.
- -       */
- -      error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
- -                                              pdqp, resblks, 1, 0);
+ +      error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
+ +                      XFS_IEXT_DIR_MANIP_CNT(mp));
         if (error)
                 goto out_trans_cancel;
   
         /*
          * Allocate an inode for the symlink.
          */
-       error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
-                              prid, &ip);
+       error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT),
+                              1, 0, prid, &ip);
         if (error)
                 goto out_trans_cancel;
   
@@@ -298,7 -301,6 +299,7 @@@
                 }
                 ASSERT(pathlen == 0);
         }
+ +      i_size_write(VFS_I(ip), ip->i_d.di_size);
   
         /*
          * Create the directory entry for the symlink.
@@@ -341,7 -343,7 +342,7 @@@ out_release_inode
                 xfs_finish_inode_setup(ip);
                 xfs_irele(ip);
         }
- -
+ +out_release_dquots:
         xfs_qm_dqrele(udqp);
         xfs_qm_dqrele(gdqp);
         xfs_qm_dqrele(pdqp);
diff --combined fs/zonefs/super.c

index f311543,76e45d6..b6ff4a2
--- 1/fs/zonefs/super.c
--- 2/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@@ -24,9 -24,6 +24,9 @@@
   
   #include "zonefs.h"
   
+ +#define CREATE_TRACE_POINTS
+ +#include "trace.h"
+ +
   static inline int zonefs_zone_mgmt(struct inode *inode,
                                    enum req_opf op)
   {
@@@ -35,7 -32,6 +35,7 @@@
   
         lockdep_assert_held(&zi->i_truncate_mutex);
   
+ +      trace_zonefs_zone_mgmt(inode, op);
         ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
                                zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
         if (ret) {
@@@ -104,8 -100,6 +104,8 @@@ static int zonefs_iomap_begin(struct in
         iomap->bdev = inode->i_sb->s_bdev;
         iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
   
+ +      trace_zonefs_iomap_begin(inode, iomap);
+ +
         return 0;
   }
   
@@@ -256,9 -250,6 +256,9 @@@ static loff_t zonefs_check_zone_conditi
                 }
                 inode->i_mode &= ~0222;
                 return i_size_read(inode);
+ +      case BLK_ZONE_COND_FULL:
+ +              /* The write pointer of full zones is invalid. */
+ +              return zi->i_max_size;
         default:
                 if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
                         return zi->i_max_size;
@@@ -489,7 -480,8 +489,8 @@@ unlock
         return ret;
   }
   
- static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr)
+ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
+                               struct dentry *dentry, struct iattr *iattr)
   {
         struct inode *inode = d_inode(dentry);
         int ret;
@@@ -497,7 -489,7 +498,7 @@@
         if (unlikely(IS_IMMUTABLE(inode)))
                 return -EPERM;
   
-       ret = setattr_prepare(dentry, iattr);
+       ret = setattr_prepare(&init_user_ns, dentry, iattr);
         if (ret)
                 return ret;
   
@@@ -525,7 -517,7 +526,7 @@@
                         return ret;
         }
   
-       setattr_copy(inode, iattr);
+       setattr_copy(&init_user_ns, inode, iattr);
   
         return 0;
   }
@@@ -550,7 -542,7 +551,7 @@@ static int zonefs_file_fsync(struct fil
         if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
                 ret = file_write_and_wait_range(file, start, end);
         if (!ret)
- -              ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
+ +              ret = blkdev_issue_flush(inode->i_sb->s_bdev);
   
         if (ret)
                 zonefs_io_error(inode, true);
@@@ -687,7 -679,7 +688,7 @@@ static ssize_t zonefs_file_dio_append(s
         if (!nr_pages)
                 return 0;
   
- -      bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+ +      bio = bio_alloc(GFP_NOFS, nr_pages);
         if (!bio)
                 return -ENOMEM;
   
@@@ -712,7 -704,6 +713,7 @@@
         ret = submit_bio_wait(bio);
   
         zonefs_file_write_dio_end_io(iocb, size, ret, 0);
+ +      trace_zonefs_file_dio_append(inode, size, ret);
   
   out_release:
         bio_release_pages(bio, false);
@@@ -790,7 -781,7 +791,7 @@@ static ssize_t zonefs_file_dio_write(st
                 ret = zonefs_file_dio_append(iocb, from);
         else
                 ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- -                                 &zonefs_write_dio_ops, sync);
+ +                                 &zonefs_write_dio_ops, 0);
         if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
             (ret > 0 || ret == -EIOCBQUEUED)) {
                 if (ret > 0)
@@@ -927,7 -918,7 +928,7 @@@ static ssize_t zonefs_file_read_iter(st
                 }
                 file_accessed(iocb->ki_filp);
                 ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
- -                                 &zonefs_read_dio_ops, is_sync_kiocb(iocb));
+ +                                 &zonefs_read_dio_ops, 0);
         } else {
                 ret = generic_file_read_iter(iocb, to);
                 if (ret == -EIO)
@@@ -1233,7 -1224,7 +1234,7 @@@ static void zonefs_init_dir_inode(struc
         struct super_block *sb = parent->i_sb;
   
         inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
-       inode_init_owner(inode, parent, S_IFDIR | 0555);
+       inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
         inode->i_op = &zonefs_dir_inode_operations;
         inode->i_fop = &simple_dir_operations;
         set_nlink(inode, 2);
@@@ -1591,11 -1582,12 +1592,11 @@@ static int zonefs_fill_super(struct sup
         sb->s_time_gran = 1;
   
         /*
- -       * The block size is set to the device physical sector size to ensure
- -       * that write operations on 512e devices (512B logical block and 4KB
- -       * physical block) are always aligned to the device physical blocks,
- -       * as mandated by the ZBC/ZAC specifications.
+ +       * The block size is set to the device zone write granularity to ensure
+ +       * that write operations are always aligned according to the device
+ +       * interface constraints.
          */
- -      sb_set_blocksize(sb, bdev_physical_block_size(sb->s_bdev));
+ +      sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
         sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
         sbi->s_uid = GLOBAL_ROOT_UID;
         sbi->s_gid = GLOBAL_ROOT_GID;
diff --combined include/linux/fs.h

index 43ba79d,7762d3d..418b772
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -39,6 -39,8 +39,8 @@@
   #include <linux/fs_types.h>
   #include <linux/build_bug.h>
   #include <linux/stddef.h>
+ #include <linux/mount.h>
+ #include <linux/cred.h>
   
   #include <asm/byteorder.h>
   #include <uapi/linux/fs.h>
@@@ -1572,6 -1574,52 +1574,52 @@@ static inline void i_gid_write(struct i
         inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
   }
   
+ static inline kuid_t kuid_into_mnt(struct user_namespace *mnt_userns,
+                                  kuid_t kuid)
+ {
+       return make_kuid(mnt_userns, __kuid_val(kuid));
+ }
+ 
+ static inline kgid_t kgid_into_mnt(struct user_namespace *mnt_userns,
+                                  kgid_t kgid)
+ {
+       return make_kgid(mnt_userns, __kgid_val(kgid));
+ }
+ 
+ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
+                                   const struct inode *inode)
+ {
+       return kuid_into_mnt(mnt_userns, inode->i_uid);
+ }
+ 
+ static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
+                                   const struct inode *inode)
+ {
+       return kgid_into_mnt(mnt_userns, inode->i_gid);
+ }
+ 
+ static inline kuid_t kuid_from_mnt(struct user_namespace *mnt_userns,
+                                  kuid_t kuid)
+ {
+       return KUIDT_INIT(from_kuid(mnt_userns, kuid));
+ }
+ 
+ static inline kgid_t kgid_from_mnt(struct user_namespace *mnt_userns,
+                                  kgid_t kgid)
+ {
+       return KGIDT_INIT(from_kgid(mnt_userns, kgid));
+ }
+ 
+ static inline kuid_t fsuid_into_mnt(struct user_namespace *mnt_userns)
+ {
+       return kuid_from_mnt(mnt_userns, current_fsuid());
+ }
+ 
+ static inline kgid_t fsgid_into_mnt(struct user_namespace *mnt_userns)
+ {
+       return kgid_from_mnt(mnt_userns, current_fsgid());
+ }
+ 
   extern struct timespec64 current_time(struct inode *inode);
   
   /*
@@@ -1714,28 -1762,48 +1762,48 @@@ static inline bool sb_start_intwrite_tr
         return __sb_start_write_trylock(sb, SB_FREEZE_FS);
   }
   
- 
- extern bool inode_owner_or_capable(const struct inode *inode);
+ bool inode_owner_or_capable(struct user_namespace *mnt_userns,
+                           const struct inode *inode);
   
   /*
    * VFS helper functions..
    */
- extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
- extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
- extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
- extern int vfs_symlink(struct inode *, struct dentry *, const char *);
- extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
- extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+ int vfs_create(struct user_namespace *, struct inode *,
+              struct dentry *, umode_t, bool);
+ int vfs_mkdir(struct user_namespace *, struct inode *,
+             struct dentry *, umode_t);
+ int vfs_mknod(struct user_namespace *, struct inode *, struct dentry *,
+               umode_t, dev_t);
+ int vfs_symlink(struct user_namespace *, struct inode *,
+               struct dentry *, const char *);
+ int vfs_link(struct dentry *, struct user_namespace *, struct inode *,
+            struct dentry *, struct inode **);
+ int vfs_rmdir(struct user_namespace *, struct inode *, struct dentry *);
+ int vfs_unlink(struct user_namespace *, struct inode *, struct dentry *,
+              struct inode **);
+ 
+ struct renamedata {
+       struct user_namespace *old_mnt_userns;
+       struct inode *old_dir;
+       struct dentry *old_dentry;
+       struct user_namespace *new_mnt_userns;
+       struct inode *new_dir;
+       struct dentry *new_dentry;
+       struct inode **delegated_inode;
+       unsigned int flags;
+ } __randomize_layout;
   
- static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+ int vfs_rename(struct renamedata *);
+ 
+ static inline int vfs_whiteout(struct user_namespace *mnt_userns,
+                              struct inode *dir, struct dentry *dentry)
   {
-       return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+       return vfs_mknod(mnt_userns, dir, dentry, S_IFCHR | WHITEOUT_MODE,
+                        WHITEOUT_DEV);
   }
   
- extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
-                                 int open_flag);
+ struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns,
+                          struct dentry *dentry, umode_t mode, int open_flag);
   
   int vfs_mkobj(struct dentry *, umode_t,
                 int (*f)(struct dentry *, umode_t, void *),
@@@ -1757,8 -1825,8 +1825,8 @@@ extern long compat_ptr_ioctl(struct fil
   /*
    * VFS file helper functions.
    */
- extern void inode_init_owner(struct inode *inode, const struct inode *dir,
-                       umode_t mode);
+ void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode,
+                     const struct inode *dir, umode_t mode);
   extern bool may_open_dev(const struct path *path);
   
   /*
@@@ -1862,22 -1930,28 +1930,28 @@@ struct file_operations 
   struct inode_operations {
         struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
         const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
-       int (*permission) (struct inode *, int);
+       int (*permission) (struct user_namespace *, struct inode *, int);
         struct posix_acl * (*get_acl)(struct inode *, int);
   
         int (*readlink) (struct dentry *, char __user *,int);
   
-       int (*create) (struct inode *,struct dentry *, umode_t, bool);
+       int (*create) (struct user_namespace *, struct inode *,struct dentry *,
+                      umode_t, bool);
         int (*link) (struct dentry *,struct inode *,struct dentry *);
         int (*unlink) (struct inode *,struct dentry *);
-       int (*symlink) (struct inode *,struct dentry *,const char *);
-       int (*mkdir) (struct inode *,struct dentry *,umode_t);
+       int (*symlink) (struct user_namespace *, struct inode *,struct dentry *,
+                       const char *);
+       int (*mkdir) (struct user_namespace *, struct inode *,struct dentry *,
+                     umode_t);
         int (*rmdir) (struct inode *,struct dentry *);
-       int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
-       int (*rename) (struct inode *, struct dentry *,
+       int (*mknod) (struct user_namespace *, struct inode *,struct dentry *,
+                     umode_t,dev_t);
+       int (*rename) (struct user_namespace *, struct inode *, struct dentry *,
                         struct inode *, struct dentry *, unsigned int);
-       int (*setattr) (struct dentry *, struct iattr *);
-       int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+       int (*setattr) (struct user_namespace *, struct dentry *,
+                       struct iattr *);
+       int (*getattr) (struct user_namespace *, const struct path *,
+                       struct kstat *, u32, unsigned int);
         ssize_t (*listxattr) (struct dentry *, char *, size_t);
         int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                       u64 len);
@@@ -1885,8 -1959,10 +1959,10 @@@
         int (*atomic_open)(struct inode *, struct dentry *,
                            struct file *, unsigned open_flag,
                            umode_t create_mode);
-       int (*tmpfile) (struct inode *, struct dentry *, umode_t);
-       int (*set_acl)(struct inode *, struct posix_acl *, int);
+       int (*tmpfile) (struct user_namespace *, struct inode *,
+                       struct dentry *, umode_t);
+       int (*set_acl)(struct user_namespace *, struct inode *,
+                      struct posix_acl *, int);
   } ____cacheline_aligned;
   
   static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
@@@ -2035,9 -2111,11 +2111,11 @@@ static inline bool sb_rdonly(const stru
   #define IS_WHITEOUT(inode)    (S_ISCHR(inode->i_mode) && \
                                  (inode)->i_rdev == WHITEOUT_DEV)
   
- static inline bool HAS_UNMAPPED_ID(struct inode *inode)
+ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
+                                  struct inode *inode)
   {
-       return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid);
+       return !uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
+              !gid_valid(i_gid_into_mnt(mnt_userns, inode));
   }
   
   static inline enum rw_hint file_write_hint(struct file *file)
@@@ -2084,8 -2162,8 +2162,8 @@@ static inline void kiocb_clone(struct k
   /*
    * Inode state bits.  Protected by inode->i_lock
    *
- - * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
- - * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
+ + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
+ + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
    *
    * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
    * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
@@@ -2094,20 -2172,12 +2172,20 @@@
    * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
    *
    * I_DIRTY_SYNC               Inode is dirty, but doesn't have to be written on
- - *                    fdatasync().  i_atime is the usual cause.
- - * I_DIRTY_DATASYNC   Data-related inode changes pending. We keep track of
+ + *                    fdatasync() (unless I_DIRTY_DATASYNC is also set).
+ + *                    Timestamp updates are the usual cause.
+ + * I_DIRTY_DATASYNC   Data-related inode changes pending.  We keep track of
    *                    these changes separately from I_DIRTY_SYNC so that we
    *                    don't have to write inode on fdatasync() when only
- - *                    mtime has changed in it.
+ + *                    e.g. the timestamps have changed.
    * I_DIRTY_PAGES      Inode has dirty pages.  Inode itself may be clean.
+ + * I_DIRTY_TIME               The inode itself only has dirty timestamps, and the
+ + *                    lazytime mount option is enabled.  We keep track of this
+ + *                    separately from I_DIRTY_SYNC in order to implement
+ + *                    lazytime.  This gets cleared if I_DIRTY_INODE
+ + *                    (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set.  I.e.
+ + *                    either I_DIRTY_TIME *or* I_DIRTY_INODE can be set in
+ + *                    i_state, but not both.  I_DIRTY_PAGES may still be set.
    * I_NEW              Serves as both a mutex and completion notification.
    *                    New inodes set I_NEW.  If two processes both create
    *                    the same inode, one of them will release its inode and
@@@ -2194,21 -2264,6 +2272,21 @@@ static inline void mark_inode_dirty_syn
         __mark_inode_dirty(inode, I_DIRTY_SYNC);
   }
   
+ +/*
+ + * Returns true if the given inode itself only has dirty timestamps (its pages
+ + * may still be dirty) and isn't currently being allocated or freed.
+ + * Filesystems should call this if, when writing an inode while lazytime is
+ + * enabled, they want to opportunistically write the timestamps of other inodes
+ + * located very nearby on-disk, e.g. in the same inode block.  This returns true
+ + * if the given inode is in need of such an opportunistic update.  Requires
+ + * i_lock, or at least later re-checking under i_lock.
+ + */
+ +static inline bool inode_is_dirtytime_only(struct inode *inode)
+ +{
+ +      return (inode->i_state & (I_DIRTY_TIME | I_NEW |
+ +                                I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
+ +}
+ +
   extern void inc_nlink(struct inode *inode);
   extern void drop_nlink(struct inode *inode);
   extern void clear_nlink(struct inode *inode);
@@@ -2254,6 -2309,7 +2332,7 @@@ struct file_system_type 
   #define FS_HAS_SUBTYPE                4
   #define FS_USERNS_MOUNT               8       /* Can be mounted by userns root */
   #define FS_DISALLOW_NOTIFY_PERM       16      /* Disable fanotify permission events */
+ #define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
   #define FS_THP_SUPPORT                8192    /* Remove once all fs converted */
   #define FS_RENAME_DOES_D_MOVE 32768   /* FS will handle d_move() during rename() internally. */
         int (*init_fs_context)(struct fs_context *);
@@@ -2540,9 -2596,13 +2619,13 @@@ struct filename 
   };
   static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);
   
+ static inline struct user_namespace *file_mnt_user_ns(struct file *file)
+ {
+       return mnt_user_ns(file->f_path.mnt);
+ }
   extern long vfs_truncate(const struct path *, loff_t);
- extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
-                      struct file *filp);
+ int do_truncate(struct user_namespace *, struct dentry *, loff_t start,
+               unsigned int time_attrs, struct file *filp);
   extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                         loff_t len);
   extern long do_sys_open(int dfd, const char __user *filename, int flags,
@@@ -2779,10 -2839,22 +2862,22 @@@ static inline int bmap(struct inode *in
   }
   #endif
   
- extern int notify_change(struct dentry *, struct iattr *, struct inode **);
- extern int inode_permission(struct inode *, int);
- extern int generic_permission(struct inode *, int);
- extern int __check_sticky(struct inode *dir, struct inode *inode);
+ int notify_change(struct user_namespace *, struct dentry *,
+                 struct iattr *, struct inode **);
+ int inode_permission(struct user_namespace *, struct inode *, int);
+ int generic_permission(struct user_namespace *, struct inode *, int);
+ static inline int file_permission(struct file *file, int mask)
+ {
+       return inode_permission(file_mnt_user_ns(file),
+                               file_inode(file), mask);
+ }
+ static inline int path_permission(const struct path *path, int mask)
+ {
+       return inode_permission(mnt_user_ns(path->mnt),
+                               d_inode(path->dentry), mask);
+ }
+ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
+                  struct inode *inode);
   
   static inline bool execute_ok(struct inode *inode)
   {
@@@ -3113,7 -3185,7 +3208,7 @@@ extern int __page_symlink(struct inode 
   extern int page_symlink(struct inode *inode, const char *symname, int len);
   extern const struct inode_operations page_symlink_inode_operations;
   extern void kfree_link(void *);
- extern void generic_fillattr(struct inode *, struct kstat *);
+ void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *);
   extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
   extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
   void __inode_add_bytes(struct inode *inode, loff_t bytes);
@@@ -3163,15 -3235,18 +3258,18 @@@ extern int dcache_dir_open(struct inod
   extern int dcache_dir_close(struct inode *, struct file *);
   extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
   extern int dcache_readdir(struct file *, struct dir_context *);
- extern int simple_setattr(struct dentry *, struct iattr *);
- extern int simple_getattr(const struct path *, struct kstat *, u32, unsigned int);
+ extern int simple_setattr(struct user_namespace *, struct dentry *,
+                         struct iattr *);
+ extern int simple_getattr(struct user_namespace *, const struct path *,
+                         struct kstat *, u32, unsigned int);
   extern int simple_statfs(struct dentry *, struct kstatfs *);
   extern int simple_open(struct inode *inode, struct file *file);
   extern int simple_link(struct dentry *, struct inode *, struct dentry *);
   extern int simple_unlink(struct inode *, struct dentry *);
   extern int simple_rmdir(struct inode *, struct dentry *);
- extern int simple_rename(struct inode *, struct dentry *,
-                        struct inode *, struct dentry *, unsigned int);
+ extern int simple_rename(struct user_namespace *, struct inode *,
+                        struct dentry *, struct inode *, struct dentry *,
+                        unsigned int);
   extern void simple_recursive_removal(struct dentry *,
                                 void (*callback)(struct dentry *));
   extern int noop_fsync(struct file *, loff_t, loff_t, int);
@@@ -3215,6 -3290,11 +3313,6 @@@ extern int generic_file_fsync(struct fi
   
   extern int generic_check_addressable(unsigned, u64);
   
- -#ifdef CONFIG_UNICODE
- -extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
- -extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- -                              const char *str, const struct qstr *name);
- -#endif
   extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
   
   #ifdef CONFIG_MIGRATION
@@@ -3229,9 -3309,10 +3327,10 @@@ extern int buffer_migrate_page_norefs(s
   #define buffer_migrate_page_norefs NULL
   #endif
   
- extern int setattr_prepare(struct dentry *, struct iattr *);
+ int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *);
   extern int inode_newsize_ok(const struct inode *, loff_t offset);
- extern void setattr_copy(struct inode *inode, const struct iattr *attr);
+ void setattr_copy(struct user_namespace *, struct inode *inode,
+                 const struct iattr *attr);
   
   extern int file_update_time(struct file *file);
   
@@@ -3395,12 -3476,13 +3494,13 @@@ static inline bool is_sxid(umode_t mode
         return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
   }
   
- static inline int check_sticky(struct inode *dir, struct inode *inode)
+ static inline int check_sticky(struct user_namespace *mnt_userns,
+                              struct inode *dir, struct inode *inode)
   {
         if (!(dir->i_mode & S_ISVTX))
                 return 0;
   
-       return __check_sticky(dir, inode);
+       return __check_sticky(mnt_userns, dir, inode);
   }
   
   static inline void inode_has_no_xattr(struct inode *inode)
diff --combined include/linux/ima.h

index 2ac834b,5486312..61d5723
--- 1/include/linux/ima.h
--- 2/include/linux/ima.h
+++ b/include/linux/ima.h
@@@ -16,7 -16,8 +16,8 @@@ struct linux_binprm
   #ifdef CONFIG_IMA
   extern int ima_bprm_check(struct linux_binprm *bprm);
   extern int ima_file_check(struct file *file, int mask);
- extern void ima_post_create_tmpfile(struct inode *inode);
+ extern void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+                                   struct inode *inode);
   extern void ima_file_free(struct file *file);
   extern int ima_file_mmap(struct file *file, unsigned long prot);
   extern int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot);
@@@ -27,14 -28,11 +28,15 @@@ extern int ima_read_file(struct file *f
                          bool contents);
   extern int ima_post_read_file(struct file *file, void *buf, loff_t size,
                               enum kernel_read_file_id id);
- extern void ima_post_path_mknod(struct dentry *dentry);
+ extern void ima_post_path_mknod(struct user_namespace *mnt_userns,
+                               struct dentry *dentry);
   extern int ima_file_hash(struct file *file, char *buf, size_t buf_size);
   extern int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size);
   extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size);
+ +extern void ima_measure_critical_data(const char *event_label,
+ +                                    const char *event_name,
+ +                                    const void *buf, size_t buf_len,
+ +                                    bool hash);
   
   #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
   extern void ima_appraise_parse_cmdline(void);
@@@ -72,7 -70,8 +74,8 @@@ static inline int ima_file_check(struc
         return 0;
   }
   
- static inline void ima_post_create_tmpfile(struct inode *inode)
+ static inline void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+                                          struct inode *inode)
   {
   }
   
@@@ -116,7 -115,8 +119,8 @@@ static inline int ima_post_read_file(st
         return 0;
   }
   
- static inline void ima_post_path_mknod(struct dentry *dentry)
+ static inline void ima_post_path_mknod(struct user_namespace *mnt_userns,
+                                      struct dentry *dentry)
   {
         return;
   }
@@@ -132,12 -132,6 +136,12 @@@ static inline int ima_inode_hash(struc
   }
   
   static inline void ima_kexec_cmdline(int kernel_fd, const void *buf, int size) {}
+ +
+ +static inline void ima_measure_critical_data(const char *event_label,
+ +                                           const char *event_name,
+ +                                           const void *buf, size_t buf_len,
+ +                                           bool hash) {}
+ +
   #endif /* CONFIG_IMA */
   
   #ifndef CONFIG_IMA_KEXEC
@@@ -163,7 -157,8 +167,8 @@@ static inline void ima_post_key_create_
   
   #ifdef CONFIG_IMA_APPRAISE
   extern bool is_ima_appraise_enabled(void);
- extern void ima_inode_post_setattr(struct dentry *dentry);
+ extern void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+                                  struct dentry *dentry);
   extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
                        const void *xattr_value, size_t xattr_value_len);
   extern int ima_inode_removexattr(struct dentry *dentry, const char *xattr_name);
@@@ -173,7 -168,8 +178,8 @@@ static inline bool is_ima_appraise_enab
         return 0;
   }
   
- static inline void ima_inode_post_setattr(struct dentry *dentry)
+ static inline void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+                                         struct dentry *dentry)
   {
         return;
   }
diff --combined include/linux/lsm_hook_defs.h

index dfd261d,df4cdad..477a597
--- 1/include/linux/lsm_hook_defs.h
--- 2/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@@ -113,8 -113,6 +113,8 @@@ LSM_HOOK(void, LSM_RET_VOID, inode_free
   LSM_HOOK(int, 0, inode_init_security, struct inode *inode,
          struct inode *dir, const struct qstr *qstr, const char **name,
          void **value, size_t *len)
+ +LSM_HOOK(int, 0, inode_init_security_anon, struct inode *inode,
+ +       const struct qstr *name, const struct inode *context_inode)
   LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry,
          umode_t mode)
   LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir,
@@@ -135,17 -133,20 +135,20 @@@ LSM_HOOK(int, 0, inode_follow_link, str
   LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask)
   LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr)
   LSM_HOOK(int, 0, inode_getattr, const struct path *path)
- LSM_HOOK(int, 0, inode_setxattr, struct dentry *dentry, const char *name,
-        const void *value, size_t size, int flags)
+ LSM_HOOK(int, 0, inode_setxattr, struct user_namespace *mnt_userns,
+        struct dentry *dentry, const char *name, const void *value,
+        size_t size, int flags)
   LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry,
          const char *name, const void *value, size_t size, int flags)
   LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name)
   LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry)
- LSM_HOOK(int, 0, inode_removexattr, struct dentry *dentry, const char *name)
+ LSM_HOOK(int, 0, inode_removexattr, struct user_namespace *mnt_userns,
+        struct dentry *dentry, const char *name)
   LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry)
- LSM_HOOK(int, 0, inode_killpriv, struct dentry *dentry)
- LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct inode *inode,
-        const char *name, void **buffer, bool alloc)
+ LSM_HOOK(int, 0, inode_killpriv, struct user_namespace *mnt_userns,
+        struct dentry *dentry)
+ LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct user_namespace *mnt_userns,
+        struct inode *inode, const char *name, void **buffer, bool alloc)
   LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
          const char *name, const void *value, size_t size, int flags)
   LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
diff --combined include/linux/lsm_hooks.h

index bdfc8a7,98a5e26..fb7f319
--- 1/include/linux/lsm_hooks.h
--- 2/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@@ -233,15 -233,6 +233,15 @@@
    *    Returns 0 if @name and @value have been successfully set,
    *    -EOPNOTSUPP if no security attribute is needed, or
    *    -ENOMEM on memory allocation failure.
+ + * @inode_init_security_anon:
+ + *      Set up the incore security field for the new anonymous inode
+ + *      and return whether the inode creation is permitted by the security
+ + *      module or not.
+ + *      @inode contains the inode structure
+ + *      @name name of the anonymous inode class
+ + *      @context_inode optional related inode
+ + *    Returns 0 on success, -EACCES if the security module denies the
+ + *    creation of this inode, or another -errno upon other errors.
    * @inode_create:
    *    Check permission to create a regular file.
    *    @dir contains inode structure of the parent of the new file.
@@@ -453,6 -444,7 +453,7 @@@
    * @inode_killpriv:
    *    The setuid bit is being removed.  Remove similar security labels.
    *    Called with the dentry->d_inode->i_mutex held.
+  *    @mnt_userns: user namespace of the mount
    *    @dentry is the dentry being changed.
    *    Return 0 on success.  If error is returned, then the operation
    *    causing setuid bit removal is failed.
diff --combined include/linux/security.h

index b0d14f0,4e4f6c3..8aeebd6
--- 1/include/linux/security.h
--- 2/include/linux/security.h
+++ b/include/linux/security.h
@@@ -145,13 -145,16 +145,16 @@@ extern int cap_capset(struct cred *new
                       const kernel_cap_t *inheritable,
                       const kernel_cap_t *permitted);
   extern int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file);
- extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
-                             const void *value, size_t size, int flags);
- extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
- extern int cap_inode_need_killpriv(struct dentry *dentry);
- extern int cap_inode_killpriv(struct dentry *dentry);
- extern int cap_inode_getsecurity(struct inode *inode, const char *name,
-                                void **buffer, bool alloc);
+ int cap_inode_setxattr(struct dentry *dentry, const char *name,
+                      const void *value, size_t size, int flags);
+ int cap_inode_removexattr(struct user_namespace *mnt_userns,
+                         struct dentry *dentry, const char *name);
+ int cap_inode_need_killpriv(struct dentry *dentry);
+ int cap_inode_killpriv(struct user_namespace *mnt_userns,
+                      struct dentry *dentry);
+ int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+                         struct inode *inode, const char *name, void **buffer,
+                         bool alloc);
   extern int cap_mmap_addr(unsigned long addr);
   extern int cap_mmap_file(struct file *file, unsigned long reqprot,
                          unsigned long prot, unsigned long flags);
@@@ -324,9 -327,6 +327,9 @@@ void security_inode_free(struct inode *
   int security_inode_init_security(struct inode *inode, struct inode *dir,
                                  const struct qstr *qstr,
                                  initxattrs initxattrs, void *fs_data);
+ +int security_inode_init_security_anon(struct inode *inode,
+ +                                    const struct qstr *name,
+ +                                    const struct inode *context_inode);
   int security_old_inode_init_security(struct inode *inode, struct inode *dir,
                                      const struct qstr *qstr, const char **name,
                                      void **value, size_t *len);
@@@ -348,16 -348,21 +351,21 @@@ int security_inode_follow_link(struct d
   int security_inode_permission(struct inode *inode, int mask);
   int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
   int security_inode_getattr(const struct path *path);
- int security_inode_setxattr(struct dentry *dentry, const char *name,
+ int security_inode_setxattr(struct user_namespace *mnt_userns,
+                           struct dentry *dentry, const char *name,
                             const void *value, size_t size, int flags);
   void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                   const void *value, size_t size, int flags);
   int security_inode_getxattr(struct dentry *dentry, const char *name);
   int security_inode_listxattr(struct dentry *dentry);
- int security_inode_removexattr(struct dentry *dentry, const char *name);
+ int security_inode_removexattr(struct user_namespace *mnt_userns,
+                              struct dentry *dentry, const char *name);
   int security_inode_need_killpriv(struct dentry *dentry);
- int security_inode_killpriv(struct dentry *dentry);
- int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc);
+ int security_inode_killpriv(struct user_namespace *mnt_userns,
+                           struct dentry *dentry);
+ int security_inode_getsecurity(struct user_namespace *mnt_userns,
+                              struct inode *inode, const char *name,
+                              void **buffer, bool alloc);
   int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
   int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
   void security_inode_getsecid(struct inode *inode, u32 *secid);
@@@ -741,13 -746,6 +749,13 @@@ static inline int security_inode_init_s
         return 0;
   }
   
+ +static inline int security_inode_init_security_anon(struct inode *inode,
+ +                                                  const struct qstr *name,
+ +                                                  const struct inode *context_inode)
+ +{
+ +      return 0;
+ +}
+ +
   static inline int security_old_inode_init_security(struct inode *inode,
                                                    struct inode *dir,
                                                    const struct qstr *qstr,
@@@ -841,8 -839,9 +849,9 @@@ static inline int security_inode_getatt
         return 0;
   }
   
- static inline int security_inode_setxattr(struct dentry *dentry,
-               const char *name, const void *value, size_t size, int flags)
+ static inline int security_inode_setxattr(struct user_namespace *mnt_userns,
+               struct dentry *dentry, const char *name, const void *value,
+               size_t size, int flags)
   {
         return cap_inode_setxattr(dentry, name, value, size, flags);
   }
@@@ -862,10 -861,11 +871,11 @@@ static inline int security_inode_listxa
         return 0;
   }
   
- static inline int security_inode_removexattr(struct dentry *dentry,
-                       const char *name)
+ static inline int security_inode_removexattr(struct user_namespace *mnt_userns,
+                                            struct dentry *dentry,
+                                            const char *name)
   {
-       return cap_inode_removexattr(dentry, name);
+       return cap_inode_removexattr(mnt_userns, dentry, name);
   }
   
   static inline int security_inode_need_killpriv(struct dentry *dentry)
@@@ -873,14 -873,18 +883,18 @@@
         return cap_inode_need_killpriv(dentry);
   }
   
- static inline int security_inode_killpriv(struct dentry *dentry)
+ static inline int security_inode_killpriv(struct user_namespace *mnt_userns,
+                                         struct dentry *dentry)
   {
-       return cap_inode_killpriv(dentry);
+       return cap_inode_killpriv(mnt_userns, dentry);
   }
   
- static inline int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+ static inline int security_inode_getsecurity(struct user_namespace *mnt_userns,
+                                            struct inode *inode,
+                                            const char *name, void **buffer,
+                                            bool alloc)
   {
-       return cap_inode_getsecurity(inode, name, buffer, alloc);
+       return cap_inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
   }
   
   static inline int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
diff --combined include/linux/syscalls.h

index f93f927,cd7b5c8..2839dc9
--- 1/include/linux/syscalls.h
--- 2/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@@ -68,6 -68,7 +68,7 @@@ union bpf_attr
   struct io_uring_params;
   struct clone_args;
   struct open_how;
+ struct mount_attr;
   
   #include <linux/types.h>
   #include <linux/aio_abi.h>
@@@ -607,11 -608,11 +608,11 @@@ asmlinkage long sys_unshare(unsigned lo
   
   /* kernel/futex.c */
   asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
- -                      struct __kernel_timespec __user *utime, u32 __user *uaddr2,
- -                      u32 val3);
+ +                        const struct __kernel_timespec __user *utime,
+ +                        u32 __user *uaddr2, u32 val3);
   asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val,
- -                      struct old_timespec32 __user *utime, u32 __user *uaddr2,
- -                      u32 val3);
+ +                               const struct old_timespec32 __user *utime,
+ +                               u32 __user *uaddr2, u32 val3);
   asmlinkage long sys_get_robust_list(int pid,
                                     struct robust_list_head __user * __user *head_ptr,
                                     size_t __user *len_ptr);
@@@ -1028,6 -1029,9 +1029,9 @@@ asmlinkage long sys_open_tree(int dfd, 
   asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
                                int to_dfd, const char __user *to_path,
                                unsigned int ms_flags);
+ asmlinkage long sys_mount_setattr(int dfd, const char __user *path,
+                                 unsigned int flags,
+                                 struct mount_attr __user *uattr, size_t usize);
   asmlinkage long sys_fsopen(const char __user *fs_name, unsigned int flags);
   asmlinkage long sys_fsconfig(int fs_fd, unsigned int cmd, const char __user *key,
                              const void __user *value, int aux);
diff --combined kernel/auditsc.c

index 434337a,fdfdd71..47fb48f
--- 1/kernel/auditsc.c
--- 2/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@@ -799,12 -799,12 +799,12 @@@ static int audit_in_mask(const struct a
         return rule->mask[word] & bit;
   }
   
- -/* At syscall entry and exit time, this filter is called if the
- - * audit_state is not low enough that auditing cannot take place, but is
- - * also not high enough that we already know we have to write an audit
- - * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ +/* At syscall exit time, this filter is called if the audit_state is
+ + * not low enough that auditing cannot take place, but is also not
+ + * high enough that we already know we have to write an audit record
+ + * (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
    */
- -static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+ +static void audit_filter_syscall(struct task_struct *tsk,
                                              struct audit_context *ctx,
                                              struct list_head *list)
   {
@@@ -812,7 -812,7 +812,7 @@@
         enum audit_state state;
   
         if (auditd_test_task(tsk))
- -              return AUDIT_DISABLED;
+ +              return;
   
         rcu_read_lock();
         list_for_each_entry_rcu(e, list, list) {
@@@ -821,11 -821,11 +821,11 @@@
                                        &state, false)) {
                         rcu_read_unlock();
                         ctx->current_state = state;
- -                      return state;
+ +                      return;
                 }
         }
         rcu_read_unlock();
- -      return AUDIT_BUILD_CONTEXT;
+ +      return;
   }
   
   /*
@@@ -1930,7 -1930,7 +1930,7 @@@ static inline int audit_copy_fcaps(stru
         if (!dentry)
                 return 0;
   
-       rc = get_vfs_caps_from_disk(dentry, &caps);
+       rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
         if (rc)
                 return rc;
   
@@@ -2481,7 -2481,8 +2481,8 @@@ int __audit_log_bprm_fcaps(struct linux
         ax->d.next = context->aux;
         context->aux = (void *)ax;
   
-       get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+       get_vfs_caps_from_disk(&init_user_ns,
+                              bprm->file->f_path.dentry, &vcaps);
   
         ax->fcap.permitted = vcaps.permitted;
         ax->fcap.inheritable = vcaps.inheritable;
diff --combined kernel/cgroup/cgroup.c

index c80fe99,091ffb5..9153b20
--- 1/kernel/cgroup/cgroup.c
--- 2/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@@ -3564,7 -3564,6 +3564,7 @@@ static ssize_t cgroup_pressure_write(st
   {
         struct psi_trigger *new;
         struct cgroup *cgrp;
+ +      struct psi_group *psi;
   
         cgrp = cgroup_kn_lock_live(of->kn, false);
         if (!cgrp)
@@@ -3573,8 -3572,7 +3573,8 @@@
         cgroup_get(cgrp);
         cgroup_kn_unlock(of->kn);
   
- -      new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ +      psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ +      new = psi_trigger_create(psi, buf, nbytes, res);
         if (IS_ERR(new)) {
                 cgroup_put(cgrp);
                 return PTR_ERR(new);
@@@ -4672,7 -4670,7 +4672,7 @@@ static int cgroup_may_write(const struc
         if (!inode)
                 return -ENOMEM;
   
-       ret = inode_permission(inode, MAY_WRITE);
+       ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
         iput(inode);
         return ret;
   }
@@@ -4728,8 -4726,8 +4728,8 @@@ static int cgroup_attach_permissions(st
         return ret;
   }
   
- -static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- -                                char *buf, size_t nbytes, loff_t off)
+ +static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ +                                  bool threadgroup)
   {
         struct cgroup *src_cgrp, *dst_cgrp;
         struct task_struct *task;
@@@ -4740,7 -4738,7 +4740,7 @@@
         if (!dst_cgrp)
                 return -ENODEV;
   
- -      task = cgroup_procs_write_start(buf, true, &locked);
+ +      task = cgroup_procs_write_start(buf, threadgroup, &locked);
         ret = PTR_ERR_OR_ZERO(task);
         if (ret)
                 goto out_unlock;
@@@ -4750,26 -4748,19 +4750,26 @@@
         src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
         spin_unlock_irq(&css_set_lock);
   
+ +      /* process and thread migrations follow same delegation rule */
         ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- -                                      of->file->f_path.dentry->d_sb, true);
+ +                                      of->file->f_path.dentry->d_sb, threadgroup);
         if (ret)
                 goto out_finish;
   
- -      ret = cgroup_attach_task(dst_cgrp, task, true);
+ +      ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
   
   out_finish:
         cgroup_procs_write_finish(task, locked);
   out_unlock:
         cgroup_kn_unlock(of->kn);
   
- -      return ret ?: nbytes;
+ +      return ret;
+ +}
+ +
+ +static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ +                                char *buf, size_t nbytes, loff_t off)
+ +{
+ +      return __cgroup_procs_write(of, buf, true) ?: nbytes;
   }
   
   static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
@@@ -4780,7 -4771,41 +4780,7 @@@
   static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
                                     char *buf, size_t nbytes, loff_t off)
   {
- -      struct cgroup *src_cgrp, *dst_cgrp;
- -      struct task_struct *task;
- -      ssize_t ret;
- -      bool locked;
- -
- -      buf = strstrip(buf);
- -
- -      dst_cgrp = cgroup_kn_lock_live(of->kn, false);
- -      if (!dst_cgrp)
- -              return -ENODEV;
- -
- -      task = cgroup_procs_write_start(buf, false, &locked);
- -      ret = PTR_ERR_OR_ZERO(task);
- -      if (ret)
- -              goto out_unlock;
- -
- -      /* find the source cgroup */
- -      spin_lock_irq(&css_set_lock);
- -      src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- -      spin_unlock_irq(&css_set_lock);
- -
- -      /* thread migrations follow the cgroup.procs delegation rule */
- -      ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- -                                      of->file->f_path.dentry->d_sb, false);
- -      if (ret)
- -              goto out_finish;
- -
- -      ret = cgroup_attach_task(dst_cgrp, task, false);
- -
- -out_finish:
- -      cgroup_procs_write_finish(task, locked);
- -out_unlock:
- -      cgroup_kn_unlock(of->kn);
- -
- -      return ret ?: nbytes;
+ +      return __cgroup_procs_write(of, buf, false) ?: nbytes;
   }
   
   /* cgroup core interface files for the default hierarchy */
diff --combined kernel/sys.c

index 6928d23,138fb25..8bb46e5
--- 1/kernel/sys.c
--- 2/kernel/sys.c
+++ b/kernel/sys.c
@@@ -24,6 -24,7 +24,6 @@@
   #include <linux/times.h>
   #include <linux/posix-timers.h>
   #include <linux/security.h>
- -#include <linux/dcookies.h>
   #include <linux/suspend.h>
   #include <linux/tty.h>
   #include <linux/signal.h>
@@@ -1847,7 -1848,7 +1847,7 @@@ static int prctl_set_mm_exe_file(struc
         if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
                 goto exit;
   
-       err = inode_permission(inode, MAY_EXEC);
+       err = file_permission(exe.file, MAY_EXEC);
         if (err)
                 goto exit;
   
diff --combined mm/madvise.c

index 0938fd3,d4f5eec..df692d2
--- 1/mm/madvise.c
--- 2/mm/madvise.c
+++ b/mm/madvise.c
@@@ -506,9 -506,9 +506,9 @@@ static long madvise_cold(struct vm_area
                 return -EINVAL;
   
         lru_add_drain();
- -      tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ +      tlb_gather_mmu(&tlb, mm);
         madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
- -      tlb_finish_mmu(&tlb, start_addr, end_addr);
+ +      tlb_finish_mmu(&tlb);
   
         return 0;
   }
@@@ -539,8 -539,9 +539,9 @@@ static inline bool can_do_pageout(struc
          * otherwise we'd be including shared non-exclusive mappings, which
          * opens a side channel.
          */
-       return inode_owner_or_capable(file_inode(vma->vm_file)) ||
-               inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+       return inode_owner_or_capable(&init_user_ns,
+                                     file_inode(vma->vm_file)) ||
+              file_permission(vma->vm_file, MAY_WRITE) == 0;
   }
   
   static long madvise_pageout(struct vm_area_struct *vma,
@@@ -558,9 -559,9 +559,9 @@@
                 return 0;
   
         lru_add_drain();
- -      tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ +      tlb_gather_mmu(&tlb, mm);
         madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
- -      tlb_finish_mmu(&tlb, start_addr, end_addr);
+ +      tlb_finish_mmu(&tlb);
   
         return 0;
   }
@@@ -723,7 -724,7 +724,7 @@@ static int madvise_free_single_vma(stru
                                 range.start, range.end);
   
         lru_add_drain();
- -      tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ +      tlb_gather_mmu(&tlb, mm);
         update_hiwater_rss(mm);
   
         mmu_notifier_invalidate_range_start(&range);
@@@ -732,7 -733,7 +733,7 @@@
                         &madvise_free_walk_ops, &tlb);
         tlb_end_vma(&tlb, vma);
         mmu_notifier_invalidate_range_end(&range);
- -      tlb_finish_mmu(&tlb, range.start, range.end);
+ +      tlb_finish_mmu(&tlb);
   
         return 0;
   }
diff --combined mm/memcontrol.c

index 913c2b9,cf9076f..0b9bd35
--- 1/mm/memcontrol.c
--- 2/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -3115,7 -3115,9 +3115,7 @@@ void __memcg_kmem_uncharge(struct mem_c
         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                 page_counter_uncharge(&memcg->kmem, nr_pages);
   
- -      page_counter_uncharge(&memcg->memory, nr_pages);
- -      if (do_memsw_account())
- -              page_counter_uncharge(&memcg->memsw, nr_pages);
+ +      refill_stock(memcg, nr_pages);
   }
   
   /**
@@@ -4897,7 -4899,7 +4897,7 @@@ static ssize_t memcg_write_event_contro
   
         /* the process need read permission on control file */
         /* AV: shouldn't we check that it's been opened for read instead? */
-       ret = inode_permission(file_inode(cfile.file), MAY_READ);
+       ret = file_permission(cfile.file, MAY_READ);
         if (ret < 0)
                 goto out_put_cfile;
   
@@@ -6271,8 -6273,6 +6271,8 @@@ static ssize_t memory_high_write(struc
         if (err)
                 return err;
   
+ +      page_counter_set_high(&memcg->memory, high);
+ +
         for (;;) {
                 unsigned long nr_pages = page_counter_read(&memcg->memory);
                 unsigned long reclaimed;
@@@ -6296,7 -6296,10 +6296,7 @@@
                         break;
         }
   
- -      page_counter_set_high(&memcg->memory, high);
- -
         memcg_wb_domain_size_changed(memcg);
- -
         return nbytes;
   }
   
diff --combined mm/shmem.c

index 1b254fb,facdd1a..7924b3b
--- 1/mm/shmem.c
--- 2/mm/shmem.c
+++ b/mm/shmem.c
@@@ -1060,7 -1060,8 +1060,8 @@@ void shmem_truncate_range(struct inode 
   }
   EXPORT_SYMBOL_GPL(shmem_truncate_range);
   
- static int shmem_getattr(const struct path *path, struct kstat *stat,
+ static int shmem_getattr(struct user_namespace *mnt_userns,
+                        const struct path *path, struct kstat *stat,
                          u32 request_mask, unsigned int query_flags)
   {
         struct inode *inode = path->dentry->d_inode;
@@@ -1072,7 -1073,7 +1073,7 @@@
                 shmem_recalc_inode(inode);
                 spin_unlock_irq(&info->lock);
         }
-       generic_fillattr(inode, stat);
+       generic_fillattr(&init_user_ns, inode, stat);
   
         if (is_huge_enabled(sb_info))
                 stat->blksize = HPAGE_PMD_SIZE;
@@@ -1080,14 -1081,15 +1081,15 @@@
         return 0;
   }
   
- static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+ static int shmem_setattr(struct user_namespace *mnt_userns,
+                        struct dentry *dentry, struct iattr *attr)
   {
         struct inode *inode = d_inode(dentry);
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
         int error;
   
-       error = setattr_prepare(dentry, attr);
+       error = setattr_prepare(&init_user_ns, dentry, attr);
         if (error)
                 return error;
   
@@@ -1141,9 -1143,9 +1143,9 @@@
                 }
         }
   
-       setattr_copy(inode, attr);
+       setattr_copy(&init_user_ns, inode, attr);
         if (attr->ia_valid & ATTR_MODE)
-               error = posix_acl_chmod(inode, inode->i_mode);
+               error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
         return error;
   }
   
@@@ -1520,11 -1522,11 +1522,11 @@@ static struct page *shmem_swapin(swp_en
   {
         struct vm_area_struct pvma;
         struct page *page;
- -      struct vm_fault vmf;
+ +      struct vm_fault vmf = {
+ +              .vma = &pvma,
+ +      };
   
         shmem_pseudo_vma_init(&pvma, info, index);
- -      vmf.vma = &pvma;
- -      vmf.address = 0;
         page = swap_cluster_readahead(swap, gfp, &vmf);
         shmem_pseudo_vma_destroy(&pvma);
   
@@@ -2303,7 -2305,7 +2305,7 @@@ static struct inode *shmem_get_inode(st
         inode = new_inode(sb);
         if (inode) {
                 inode->i_ino = ino;
-               inode_init_owner(inode, dir, mode);
+               inode_init_owner(&init_user_ns, inode, dir, mode);
                 inode->i_blocks = 0;
                 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
                 inode->i_generation = prandom_u32();
@@@ -2917,7 -2919,8 +2919,8 @@@ static int shmem_statfs(struct dentry *
    * File creation. Allocate an inode, and we're done..
    */
   static int
- shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+           struct dentry *dentry, umode_t mode, dev_t dev)
   {
         struct inode *inode;
         int error = -ENOSPC;
@@@ -2946,7 -2949,8 +2949,8 @@@ out_iput
   }
   
   static int
- shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+ shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+             struct dentry *dentry, umode_t mode)
   {
         struct inode *inode;
         int error = -ENOSPC;
@@@ -2969,20 -2973,22 +2973,22 @@@ out_iput
         return error;
   }
   
- static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+                      struct dentry *dentry, umode_t mode)
   {
         int error;
   
-       if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+       if ((error = shmem_mknod(&init_user_ns, dir, dentry,
+                                mode | S_IFDIR, 0)))
                 return error;
         inc_nlink(dir);
         return 0;
   }
   
- static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-               bool excl)
+ static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+                       struct dentry *dentry, umode_t mode, bool excl)
   {
-       return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+       return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
   }
   
   /*
@@@ -3062,7 -3068,8 +3068,8 @@@ static int shmem_exchange(struct inode 
         return 0;
   }
   
- static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+ static int shmem_whiteout(struct user_namespace *mnt_userns,
+                         struct inode *old_dir, struct dentry *old_dentry)
   {
         struct dentry *whiteout;
         int error;
@@@ -3071,7 -3078,7 +3078,7 @@@
         if (!whiteout)
                 return -ENOMEM;
   
-       error = shmem_mknod(old_dir, whiteout,
+       error = shmem_mknod(&init_user_ns, old_dir, whiteout,
                             S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
         dput(whiteout);
         if (error)
@@@ -3094,7 -3101,10 +3101,10 @@@
    * it exists so that the VFS layer correctly free's it when it
    * gets overwritten.
    */
- static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+ static int shmem_rename2(struct user_namespace *mnt_userns,
+                        struct inode *old_dir, struct dentry *old_dentry,
+                        struct inode *new_dir, struct dentry *new_dentry,
+                        unsigned int flags)
   {
         struct inode *inode = d_inode(old_dentry);
         int they_are_dirs = S_ISDIR(inode->i_mode);
@@@ -3111,7 -3121,7 +3121,7 @@@
         if (flags & RENAME_WHITEOUT) {
                 int error;
   
-               error = shmem_whiteout(old_dir, old_dentry);
+               error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
                 if (error)
                         return error;
         }
@@@ -3135,7 -3145,8 +3145,8 @@@
         return 0;
   }
   
- static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+                        struct dentry *dentry, const char *symname)
   {
         int error;
         int len;
@@@ -3273,6 -3284,7 +3284,7 @@@ static int shmem_xattr_handler_get(cons
   }
   
   static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+                                  struct user_namespace *mnt_userns,
                                    struct dentry *unused, struct inode *inode,
                                    const char *name, const void *value,
                                    size_t size, int flags)
diff --combined net/socket.c

index 7f0617a,2826698..23c7842
--- 1/net/socket.c
--- 2/net/socket.c
+++ b/net/socket.c
@@@ -334,6 -334,7 +334,7 @@@ static const struct xattr_handler sockf
   };
   
   static int sockfs_security_xattr_set(const struct xattr_handler *handler,
+                                    struct user_namespace *mnt_userns,
                                      struct dentry *dentry, struct inode *inode,
                                      const char *suffix, const void *value,
                                      size_t size, int flags)
@@@ -537,9 -538,10 +538,10 @@@ static ssize_t sockfs_listxattr(struct 
         return used;
   }
   
- static int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+ static int sockfs_setattr(struct user_namespace *mnt_userns,
+                         struct dentry *dentry, struct iattr *iattr)
   {
-       int err = simple_setattr(dentry, iattr);
+       int err = simple_setattr(&init_user_ns, dentry, iattr);
   
         if (!err && (iattr->ia_valid & ATTR_UID)) {
                 struct socket *sock = SOCKET_I(d_inode(dentry));
@@@ -2126,9 -2128,6 +2128,9 @@@ SYSCALL_DEFINE5(setsockopt, int, fd, in
         return __sys_setsockopt(fd, level, optname, optval, optlen);
   }
   
+ +INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
+ +                                                       int optname));
+ +
   /*
    *    Get a socket option. Because we don't know the option lengths we have
    *    to pass a user mode parameter for the protocols to sort out.
diff --combined security/commoncap.c

index 78598be,234b074..28f4d25
--- 1/security/commoncap.c
--- 2/security/commoncap.c
+++ b/security/commoncap.c
@@@ -303,17 -303,25 +303,25 @@@ int cap_inode_need_killpriv(struct dent
   
   /**
    * cap_inode_killpriv - Erase the security markings on an inode
-  * @dentry: The inode/dentry to alter
+  *
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dentry:   The inode/dentry to alter
    *
    * Erase the privilege-enhancing security markings on an inode.
    *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then
+  * take care to map the inode according to @mnt_userns before checking
+  * permissions. On non-idmapped mounts or if permission checking is to be
+  * performed on the raw inode simply pass init_user_ns.
+  *
    * Returns 0 if successful, -ve on error.
    */
- int cap_inode_killpriv(struct dentry *dentry)
+ int cap_inode_killpriv(struct user_namespace *mnt_userns, struct dentry *dentry)
   {
         int error;
   
-       error = __vfs_removexattr(dentry, XATTR_NAME_CAPS);
+       error = __vfs_removexattr(mnt_userns, dentry, XATTR_NAME_CAPS);
         if (error == -EOPNOTSUPP)
                 error = 0;
         return error;
@@@ -366,16 -374,16 +374,17 @@@ static bool is_v3header(size_t size, co
    * by the integrity subsystem, which really wants the unconverted values -
    * so that's good.
    */
- int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
+ int cap_inode_getsecurity(struct user_namespace *mnt_userns,
+                         struct inode *inode, const char *name, void **buffer,
                           bool alloc)
   {
         int size, ret;
         kuid_t kroot;
+ +      u32 nsmagic, magic;
         uid_t root, mappedroot;
         char *tmpbuf = NULL;
         struct vfs_cap_data *cap;
- -      struct vfs_ns_cap_data *nscap;
+ +      struct vfs_ns_cap_data *nscap = NULL;
         struct dentry *dentry;
         struct user_namespace *fs_ns;
   
@@@ -387,8 -395,8 +396,8 @@@
                 return -EINVAL;
   
         size = sizeof(struct vfs_ns_cap_data);
-       ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
-                                &tmpbuf, size, GFP_NOFS);
+       ret = (int)vfs_getxattr_alloc(mnt_userns, dentry, XATTR_NAME_CAPS,
+                                     &tmpbuf, size, GFP_NOFS);
         dput(dentry);
   
         if (ret < 0)
@@@ -397,61 -405,49 +406,64 @@@
         fs_ns = inode->i_sb->s_user_ns;
         cap = (struct vfs_cap_data *) tmpbuf;
         if (is_v2header((size_t) ret, cap)) {
- -              /* If this is sizeof(vfs_cap_data) then we're ok with the
- -               * on-disk value, so return that.  */
- -              if (alloc)
- -                      *buffer = tmpbuf;
- -              else
- -                      kfree(tmpbuf);
- -              return ret;
- -      } else if (!is_v3header((size_t) ret, cap)) {
- -              kfree(tmpbuf);
- -              return -EINVAL;
+ +              root = 0;
+ +      } else if (is_v3header((size_t) ret, cap)) {
+ +              nscap = (struct vfs_ns_cap_data *) tmpbuf;
+ +              root = le32_to_cpu(nscap->rootid);
+ +      } else {
+ +              size = -EINVAL;
+ +              goto out_free;
         }
   
- -      nscap = (struct vfs_ns_cap_data *) tmpbuf;
- -      root = le32_to_cpu(nscap->rootid);
         kroot = make_kuid(fs_ns, root);
   
+       /* If this is an idmapped mount shift the kuid. */
+       kroot = kuid_into_mnt(mnt_userns, kroot);
+ 
         /* If the root kuid maps to a valid uid in current ns, then return
          * this as a nscap. */
         mappedroot = from_kuid(current_user_ns(), kroot);
         if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
+ +              size = sizeof(struct vfs_ns_cap_data);
                 if (alloc) {
- -                      *buffer = tmpbuf;
+ +                      if (!nscap) {
+ +                              /* v2 -> v3 conversion */
+ +                              nscap = kzalloc(size, GFP_ATOMIC);
+ +                              if (!nscap) {
+ +                                      size = -ENOMEM;
+ +                                      goto out_free;
+ +                              }
+ +                              nsmagic = VFS_CAP_REVISION_3;
+ +                              magic = le32_to_cpu(cap->magic_etc);
+ +                              if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+ +                                      nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+ +                              memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+ +                              nscap->magic_etc = cpu_to_le32(nsmagic);
+ +                      } else {
+ +                              /* use allocated v3 buffer */
+ +                              tmpbuf = NULL;
+ +                      }
                         nscap->rootid = cpu_to_le32(mappedroot);
- -              } else
- -                      kfree(tmpbuf);
- -              return size;
+ +                      *buffer = nscap;
+ +              }
+ +              goto out_free;
         }
   
         if (!rootid_owns_currentns(kroot)) {
- -              kfree(tmpbuf);
- -              return -EOPNOTSUPP;
+ +              size = -EOVERFLOW;
+ +              goto out_free;
         }
   
         /* This comes from a parent namespace.  Return as a v2 capability */
         size = sizeof(struct vfs_cap_data);
         if (alloc) {
- -              *buffer = kmalloc(size, GFP_ATOMIC);
- -              if (*buffer) {
- -                      struct vfs_cap_data *cap = *buffer;
- -                      __le32 nsmagic, magic;
+ +              if (nscap) {
+ +                      /* v3 -> v2 conversion */
+ +                      cap = kzalloc(size, GFP_ATOMIC);
+ +                      if (!cap) {
+ +                              size = -ENOMEM;
+ +                              goto out_free;
+ +                      }
                         magic = VFS_CAP_REVISION_2;
                         nsmagic = le32_to_cpu(nscap->magic_etc);
                         if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
@@@ -459,26 -455,40 +471,43 @@@
                         memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                         cap->magic_etc = cpu_to_le32(magic);
                 } else {
- -                      size = -ENOMEM;
+ +                      /* use unconverted v2 */
+ +                      tmpbuf = NULL;
                 }
+ +              *buffer = cap;
         }
+ +out_free:
         kfree(tmpbuf);
         return size;
   }
   
+ /**
+  * rootid_from_xattr - translate root uid of vfs caps
+  *
+  * @value:    vfs caps value which may be modified by this function
+  * @size:     size of @value
+  * @task_ns:  user namespace of the caller
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then
+  * take care to map the inode according to @mnt_userns before checking
+  * permissions. On non-idmapped mounts or if permission checking is to be
+  * performed on the raw inode simply pass init_user_ns.
+  */
   static kuid_t rootid_from_xattr(const void *value, size_t size,
-                               struct user_namespace *task_ns)
+                               struct user_namespace *task_ns,
+                               struct user_namespace *mnt_userns)
   {
         const struct vfs_ns_cap_data *nscap = value;
+       kuid_t rootkid;
         uid_t rootid = 0;
   
         if (size == XATTR_CAPS_SZ_3)
                 rootid = le32_to_cpu(nscap->rootid);
   
-       return make_kuid(task_ns, rootid);
+       rootkid = make_kuid(task_ns, rootid);
+       return kuid_from_mnt(mnt_userns, rootkid);
   }
   
   static bool validheader(size_t size, const struct vfs_cap_data *cap)
@@@ -486,13 -496,27 +515,27 @@@
         return is_v2header(size, cap) || is_v3header(size, cap);
   }
   
- /*
+ /**
+  * cap_convert_nscap - check vfs caps
+  *
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dentry:   used to retrieve inode to check permissions on
+  * @ivalue:   vfs caps value which may be modified by this function
+  * @size:     size of @ivalue
+  *
    * User requested a write of security.capability.  If needed, update the
    * xattr to change from v2 to v3, or to fixup the v3 rootid.
    *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then
+  * take care to map the inode according to @mnt_userns before checking
+  * permissions. On non-idmapped mounts or if permission checking is to be
+  * performed on the raw inode simply pass init_user_ns.
+  *
    * If all is ok, we return the new size, on error return < 0.
    */
- int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size)
+ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry,
+                     const void **ivalue, size_t size)
   {
         struct vfs_ns_cap_data *nscap;
         uid_t nsrootid;
@@@ -500,8 -524,7 +543,8 @@@
         __u32 magic, nsmagic;
         struct inode *inode = d_backing_inode(dentry);
         struct user_namespace *task_ns = current_user_ns(),
- -              *fs_ns = inode->i_sb->s_user_ns;
+ +              *fs_ns = inode->i_sb->s_user_ns,
+ +              *ancestor;
         kuid_t rootid;
         size_t newsize;
   
@@@ -509,14 -532,14 +552,14 @@@
                 return -EINVAL;
         if (!validheader(size, cap))
                 return -EINVAL;
-       if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+       if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
                 return -EPERM;
-       if (size == XATTR_CAPS_SZ_2)
+       if (size == XATTR_CAPS_SZ_2 && (mnt_userns == &init_user_ns))
                 if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
                         /* user is privileged, just write the v2 */
                         return size;
   
-       rootid = rootid_from_xattr(*ivalue, size, task_ns);
+       rootid = rootid_from_xattr(*ivalue, size, task_ns, mnt_userns);
         if (!uid_valid(rootid))
                 return -EINVAL;
   
@@@ -524,15 -547,6 +567,15 @@@
         if (nsrootid == -1)
                 return -EINVAL;
   
+ +      /*
+ +       * Do not allow adding a v3 filesystem capability xattr
+ +       * if the rootid field is ambiguous.
+ +       */
+ +      for (ancestor = task_ns->parent; ancestor; ancestor = ancestor->parent) {
+ +              if (from_kuid(ancestor, rootid) == 0)
+ +                      return -EINVAL;
+ +      }
+ +
         newsize = sizeof(struct vfs_ns_cap_data);
         nscap = kmalloc(newsize, GFP_ATOMIC);
         if (!nscap)
@@@ -593,10 -607,24 +636,24 @@@ static inline int bprm_caps_from_vfs_ca
         return *effective ? ret : 0;
   }
   
- /*
+ /**
+  * get_vfs_caps_from_disk - retrieve vfs caps from disk
+  *
+  * @mnt_userns:       user namespace of the mount the inode was found from
+  * @dentry:   dentry from which @inode is retrieved
+  * @cpu_caps: vfs capabilities
+  *
    * Extract the on-exec-apply capability sets for an executable file.
+  *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then
+  * take care to map the inode according to @mnt_userns before checking
+  * permissions. On non-idmapped mounts or if permission checking is to be
+  * performed on the raw inode simply pass init_user_ns.
    */
- int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+ int get_vfs_caps_from_disk(struct user_namespace *mnt_userns,
+                          const struct dentry *dentry,
+                          struct cpu_vfs_cap_data *cpu_caps)
   {
         struct inode *inode = d_backing_inode(dentry);
         __u32 magic_etc;
@@@ -652,6 -680,7 +709,7 @@@
         /* Limit the caps to the mounter of the filesystem
          * or the more limited uid specified in the xattr.
          */
+       rootkuid = kuid_into_mnt(mnt_userns, rootkuid);
         if (!rootid_owns_currentns(rootkuid))
                 return -ENODATA;
   
@@@ -697,7 -726,8 +755,8 @@@ static int get_file_caps(struct linux_b
         if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                 return 0;
   
-       rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);
+       rc = get_vfs_caps_from_disk(file_mnt_user_ns(file),
+                                   file->f_path.dentry, &vcaps);
         if (rc < 0) {
                 if (rc == -EINVAL)
                         printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
@@@ -962,16 -992,25 +1021,25 @@@ int cap_inode_setxattr(struct dentry *d
   
   /**
    * cap_inode_removexattr - Determine whether an xattr may be removed
-  * @dentry: The inode/dentry being altered
-  * @name: The name of the xattr to be changed
+  *
+  * @mnt_userns:       User namespace of the mount the inode was found from
+  * @dentry:   The inode/dentry being altered
+  * @name:     The name of the xattr to be changed
    *
    * Determine whether an xattr may be removed from an inode, returning 0 if
    * permission is granted, -ve if denied.
    *
+  * If the inode has been found through an idmapped mount the user namespace of
+  * the vfsmount must be passed through @mnt_userns. This function will then
+  * take care to map the inode according to @mnt_userns before checking
+  * permissions. On non-idmapped mounts or if permission checking is to be
+  * performed on the raw inode simply pass init_user_ns.
+  *
    * This is used to make sure security xattrs don't get removed by those who
    * aren't privileged to remove them.
    */
- int cap_inode_removexattr(struct dentry *dentry, const char *name)
+ int cap_inode_removexattr(struct user_namespace *mnt_userns,
+                         struct dentry *dentry, const char *name)
   {
         struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
   
@@@ -985,7 -1024,7 +1053,7 @@@
                 struct inode *inode = d_backing_inode(dentry);
                 if (!inode)
                         return -EINVAL;
-               if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+               if (!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_SETFCAP))
                         return -EPERM;
                 return 0;
         }
diff --combined security/integrity/evm/evm_crypto.c

index a6dd47e,f720f78..d76b006
--- 1/security/integrity/evm/evm_crypto.c
--- 2/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@@ -73,7 -73,7 +73,7 @@@ static struct shash_desc *init_desc(cha
   {
         long rc;
         const char *algo;
- -      struct crypto_shash **tfm, *tmp_tfm;
+ +      struct crypto_shash **tfm, *tmp_tfm = NULL;
         struct shash_desc *desc;
   
         if (type == EVM_XATTR_HMAC) {
@@@ -118,16 -118,13 +118,16 @@@ unlock
   alloc:
         desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(*tfm),
                         GFP_KERNEL);
- -      if (!desc)
+ +      if (!desc) {
+ +              crypto_free_shash(tmp_tfm);
                 return ERR_PTR(-ENOMEM);
+ +      }
   
         desc->tfm = *tfm;
   
         rc = crypto_shash_init(desc);
         if (rc) {
+ +              crypto_free_shash(tmp_tfm);
                 kfree(desc);
                 return ERR_PTR(rc);
         }
@@@ -225,7 -222,7 +225,7 @@@ static int evm_calc_hmac_or_hash(struc
                                 ima_present = true;
                         continue;
                 }
-               size = vfs_getxattr_alloc(dentry, xattr->name,
+               size = vfs_getxattr_alloc(&init_user_ns, dentry, xattr->name,
                                           &xattr_value, xattr_size, GFP_NOFS);
                 if (size == -ENOMEM) {
                         error = -ENOMEM;
@@@ -278,8 -275,8 +278,8 @@@ static int evm_is_immutable(struct dent
                 return 1;
   
         /* Do this the hard way */
-       rc = vfs_getxattr_alloc(dentry, XATTR_NAME_EVM, (char **)&xattr_data, 0,
-                               GFP_NOFS);
+       rc = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_EVM,
+                               (char **)&xattr_data, 0, GFP_NOFS);
         if (rc <= 0) {
                 if (rc == -ENODATA)
                         return 0;
@@@ -322,11 -319,12 +322,12 @@@ int evm_update_evmxattr(struct dentry *
                            xattr_value_len, &data);
         if (rc == 0) {
                 data.hdr.xattr.sha1.type = EVM_XATTR_HMAC;
-               rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_EVM,
+               rc = __vfs_setxattr_noperm(&init_user_ns, dentry,
+                                          XATTR_NAME_EVM,
                                            &data.hdr.xattr.data[1],
                                            SHA1_DIGEST_SIZE + 1, 0);
         } else if (rc == -ENODATA && (inode->i_opflags & IOP_XATTR)) {
-               rc = __vfs_removexattr(dentry, XATTR_NAME_EVM);
+               rc = __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_EVM);
         }
         return rc;
   }
diff --combined security/integrity/ima/ima.h

index aa31247,b87c900..8e8b525
--- 1/security/integrity/ima/ima.h
--- 2/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@@ -201,7 -201,6 +201,7 @@@ static inline unsigned int ima_hash_key
         hook(POLICY_CHECK, policy)                      \
         hook(KEXEC_CMDLINE, kexec_cmdline)              \
         hook(KEY_CHECK, key)                            \
+ +      hook(CRITICAL_DATA, critical_data)              \
         hook(MAX_CHECK, none)
   
   #define __ima_hook_enumify(ENUM, str) ENUM,
@@@ -254,10 -253,11 +254,11 @@@ static inline void ima_process_queued_k
   #endif /* CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS */
   
   /* LIM API function definitions */
- int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
-                  int mask, enum ima_hooks func, int *pcr,
+ int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+                  const struct cred *cred, u32 secid, int mask,
+                  enum ima_hooks func, int *pcr,
                    struct ima_template_desc **template_desc,
- -                 const char *keyring);
+ +                 const char *func_data);
   int ima_must_measure(struct inode *inode, int mask, enum ima_hooks func);
   int ima_collect_measurement(struct integrity_iint_cache *iint,
                             struct file *file, void *buf, loff_t size,
@@@ -267,10 -267,10 +268,11 @@@ void ima_store_measurement(struct integ
                            struct evm_ima_xattr_data *xattr_value,
                            int xattr_len, const struct modsig *modsig, int pcr,
                            struct ima_template_desc *template_desc);
- void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+ void process_buffer_measurement(struct user_namespace *mnt_userns,
+                               struct inode *inode, const void *buf, int size,
                                 const char *eventname, enum ima_hooks func,
- -                              int pcr, const char *keyring);
+ +                              int pcr, const char *func_data,
+ +                              bool buf_hash);
   void ima_audit_measurement(struct integrity_iint_cache *iint,
                            const unsigned char *filename);
   int ima_alloc_init_template(struct ima_event_data *event_data,
@@@ -283,10 -283,11 +285,11 @@@ void ima_free_template_entry(struct ima
   const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
   
   /* IMA policy related functions */
- int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
-                    enum ima_hooks func, int mask, int flags, int *pcr,
+ int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+                    const struct cred *cred, u32 secid, enum ima_hooks func,
+                    int mask, int flags, int *pcr,
                      struct ima_template_desc **template_desc,
- -                   const char *keyring);
+ +                   const char *func_data);
   void ima_init_policy(void);
   void ima_update_policy(void);
   void ima_update_policy_flag(void);
@@@ -315,7 -316,8 +318,8 @@@ int ima_appraise_measurement(enum ima_h
                              struct file *file, const unsigned char *filename,
                              struct evm_ima_xattr_data *xattr_value,
                              int xattr_len, const struct modsig *modsig);
- int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func);
+ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+                     int mask, enum ima_hooks func);
   void ima_update_xattr(struct integrity_iint_cache *iint, struct file *file);
   enum integrity_status ima_get_cache_status(struct integrity_iint_cache *iint,
                                            enum ima_hooks func);
@@@ -342,7 -344,8 +346,8 @@@ static inline int ima_appraise_measurem
         return INTEGRITY_UNKNOWN;
   }
   
- static inline int ima_must_appraise(struct inode *inode, int mask,
+ static inline int ima_must_appraise(struct user_namespace *mnt_userns,
+                                   struct inode *inode, int mask,
                                     enum ima_hooks func)
   {
         return 0;
diff --combined security/integrity/ima/ima_api.c

index 1dd70dc,ed410ef..d8e321c
--- 1/security/integrity/ima/ima_api.c
--- 2/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@@ -162,6 -162,7 +162,7 @@@ err_out
   
   /**
    * ima_get_action - appraise & measure decision based on policy.
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode: pointer to the inode associated with the object being validated
    * @cred: pointer to credentials structure to validate
    * @secid: secid of the task being validated
@@@ -170,30 -171,31 +171,31 @@@
    * @func: caller identifier
    * @pcr: pointer filled in if matched measure policy sets pcr=
    * @template_desc: pointer filled in if matched measure policy sets template=
- - * @keyring: keyring name used to determine the action
+ + * @func_data: func specific data, may be NULL
    *
    * The policy is defined in terms of keypairs:
    *            subj=, obj=, type=, func=, mask=, fsmagic=
    *    subj,obj, and type: are LSM specific.
    *    func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK
- - *    | KEXEC_CMDLINE | KEY_CHECK
+ + *    | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA
    *    mask: contains the permission mask
    *    fsmagic: hex value
    *
    * Returns IMA_MEASURE, IMA_APPRAISE mask.
    *
    */
- int ima_get_action(struct inode *inode, const struct cred *cred, u32 secid,
-                  int mask, enum ima_hooks func, int *pcr,
+ int ima_get_action(struct user_namespace *mnt_userns, struct inode *inode,
+                  const struct cred *cred, u32 secid, int mask,
+                  enum ima_hooks func, int *pcr,
                    struct ima_template_desc **template_desc,
- -                 const char *keyring)
+ +                 const char *func_data)
   {
         int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH;
   
         flags &= ima_policy_flag;
   
-       return ima_match_policy(inode, cred, secid, func, mask, flags, pcr,
-                               template_desc, func_data);
+       return ima_match_policy(mnt_userns, inode, cred, secid, func, mask,
- -                              flags, pcr, template_desc, keyring);
++                              flags, pcr, template_desc, func_data);
   }
   
   /*
diff --combined security/integrity/ima/ima_appraise.c

index 46ffa38,2e64b9f..565e33f
--- 1/security/integrity/ima/ima_appraise.c
--- 2/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@@ -68,7 -68,8 +68,8 @@@ bool is_ima_appraise_enabled(void
    *
    * Return 1 to appraise or hash
    */
- int ima_must_appraise(struct inode *inode, int mask, enum ima_hooks func)
+ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode,
+                     int mask, enum ima_hooks func)
   {
         u32 secid;
   
@@@ -76,8 -77,8 +77,8 @@@
                 return 0;
   
         security_task_getsecid(current, &secid);
-       return ima_match_policy(inode, current_cred(), secid, func, mask,
-                               IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
+       return ima_match_policy(mnt_userns, inode, current_cred(), secid, func,
+                               mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL);
   }
   
   static int ima_fix_xattr(struct dentry *dentry,
@@@ -94,7 -95,7 +95,7 @@@
                 iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG;
                 iint->ima_hash->xattr.ng.algo = algo;
         }
-       rc = __vfs_setxattr_noperm(dentry, XATTR_NAME_IMA,
+       rc = __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_IMA,
                                    &iint->ima_hash->xattr.data[offset],
                                    (sizeof(iint->ima_hash->xattr) - offset) +
                                    iint->ima_hash->length, 0);
@@@ -215,8 -216,8 +216,8 @@@ int ima_read_xattr(struct dentry *dentr
   {
         ssize_t ret;
   
-       ret = vfs_getxattr_alloc(dentry, XATTR_NAME_IMA, (char **)xattr_value,
-                                0, GFP_NOFS);
+       ret = vfs_getxattr_alloc(&init_user_ns, dentry, XATTR_NAME_IMA,
+                                (char **)xattr_value, 0, GFP_NOFS);
         if (ret == -EOPNOTSUPP)
                 ret = 0;
         return ret;
@@@ -350,9 -351,9 +351,9 @@@ int ima_check_blacklist(struct integrit
   
                 rc = is_binary_blacklisted(digest, digestsize);
                 if ((rc == -EPERM) && (iint->flags & IMA_MEASURE))
-                       process_buffer_measurement(NULL, digest, digestsize,
+                       process_buffer_measurement(&init_user_ns, NULL, digest, digestsize,
                                                    "blacklisted-hash", NONE,
- -                                                 pcr, NULL);
+ +                                                 pcr, NULL, false);
         }
   
         return rc;
@@@ -501,6 -502,7 +502,7 @@@ void ima_update_xattr(struct integrity_
   
   /**
    * ima_inode_post_setattr - reflect file metadata changes
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @dentry: pointer to the affected dentry
    *
    * Changes to a dentry's metadata might result in needing to appraise.
@@@ -508,7 -510,8 +510,8 @@@
    * This function is called from notify_change(), which expects the caller
    * to lock the inode's i_mutex.
    */
- void ima_inode_post_setattr(struct dentry *dentry)
+ void ima_inode_post_setattr(struct user_namespace *mnt_userns,
+                           struct dentry *dentry)
   {
         struct inode *inode = d_backing_inode(dentry);
         struct integrity_iint_cache *iint;
@@@ -518,9 -521,9 +521,9 @@@
             || !(inode->i_opflags & IOP_XATTR))
                 return;
   
-       action = ima_must_appraise(inode, MAY_ACCESS, POST_SETATTR);
+       action = ima_must_appraise(mnt_userns, inode, MAY_ACCESS, POST_SETATTR);
         if (!action)
-               __vfs_removexattr(dentry, XATTR_NAME_IMA);
+               __vfs_removexattr(&init_user_ns, dentry, XATTR_NAME_IMA);
         iint = integrity_iint_find(inode);
         if (iint) {
                 set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags);
diff --combined security/integrity/ima/ima_asymmetric_keys.c

index a740957,c4ef691..1fb0b0e
--- 1/security/integrity/ima/ima_asymmetric_keys.c
--- 2/security/integrity/ima/ima_asymmetric_keys.c
+++ b/security/integrity/ima/ima_asymmetric_keys.c
@@@ -10,6 -10,7 +10,7 @@@
    */
   
   #include <keys/asymmetric-type.h>
+ #include <linux/user_namespace.h>
   #include "ima.h"
   
   /**
@@@ -58,7 -59,7 +59,7 @@@ void ima_post_key_create_or_update(stru
          * if the IMA policy is configured to measure a key linked
          * to the given keyring.
          */
-       process_buffer_measurement(NULL, payload, payload_len,
+       process_buffer_measurement(&init_user_ns, NULL, payload, payload_len,
                                    keyring->description, KEY_CHECK, 0,
- -                                 keyring->description);
+ +                                 keyring->description, false);
   }
diff --combined security/integrity/ima/ima_main.c

index 6a42984,cb1c56e..9ef748e
--- 1/security/integrity/ima/ima_main.c
--- 2/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@@ -218,8 -218,8 +218,8 @@@ static int process_measurement(struct f
          * bitmask based on the appraise/audit/measurement policy.
          * Included is the appraise submask.
          */
-       action = ima_get_action(inode, cred, secid, mask, func, &pcr,
-                               &template_desc, NULL);
+       action = ima_get_action(file_mnt_user_ns(file), inode, cred, secid,
+                               mask, func, &pcr, &template_desc, NULL);
         violation_check = ((func == FILE_CHECK || func == MMAP_CHECK) &&
                            (ima_policy_flag & IMA_MEASURE));
         if (!action && !violation_check)
@@@ -431,8 -431,9 +431,9 @@@ int ima_file_mprotect(struct vm_area_st
   
         security_task_getsecid(current, &secid);
         inode = file_inode(vma->vm_file);
-       action = ima_get_action(inode, current_cred(), secid, MAY_EXEC,
-                               MMAP_CHECK, &pcr, &template, 0);
+       action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode,
+                               current_cred(), secid, MAY_EXEC, MMAP_CHECK,
+                               &pcr, &template, 0);
   
         /* Is the mmap'ed file in policy? */
         if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK)))
@@@ -592,18 -593,21 +593,21 @@@ EXPORT_SYMBOL_GPL(ima_inode_hash)
   
   /**
    * ima_post_create_tmpfile - mark newly created tmpfile as new
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @file : newly created tmpfile
    *
    * No measuring, appraising or auditing of newly created tmpfiles is needed.
    * Skip calling process_measurement(), but indicate which newly, created
    * tmpfiles are in policy.
    */
- void ima_post_create_tmpfile(struct inode *inode)
+ void ima_post_create_tmpfile(struct user_namespace *mnt_userns,
+                            struct inode *inode)
   {
         struct integrity_iint_cache *iint;
         int must_appraise;
   
-       must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+       must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+                                         FILE_CHECK);
         if (!must_appraise)
                 return;
   
@@@ -619,18 -623,21 +623,21 @@@
   
   /**
    * ima_post_path_mknod - mark as a new inode
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @dentry: newly created dentry
    *
    * Mark files created via the mknodat syscall as new, so that the
    * file data can be written later.
    */
- void ima_post_path_mknod(struct dentry *dentry)
+ void ima_post_path_mknod(struct user_namespace *mnt_userns,
+                        struct dentry *dentry)
   {
         struct integrity_iint_cache *iint;
         struct inode *inode = dentry->d_inode;
         int must_appraise;
   
-       must_appraise = ima_must_appraise(inode, MAY_ACCESS, FILE_CHECK);
+       must_appraise = ima_must_appraise(mnt_userns, inode, MAY_ACCESS,
+                                         FILE_CHECK);
         if (!must_appraise)
                 return;
   
@@@ -809,22 -816,22 +816,24 @@@ int ima_post_load_data(char *buf, loff_
   }
   
   /*
- - * process_buffer_measurement - Measure the buffer to ima log.
+ + * process_buffer_measurement - Measure the buffer or the buffer data hash
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode: inode associated with the object being measured (NULL for KEY_CHECK)
    * @buf: pointer to the buffer that needs to be added to the log.
    * @size: size of buffer(in bytes).
    * @eventname: event name to be used for the buffer entry.
    * @func: IMA hook
    * @pcr: pcr to extend the measurement
- - * @keyring: keyring name to determine the action to be performed
+ + * @func_data: func specific data, may be NULL
+ + * @buf_hash: measure buffer data hash
    *
- - * Based on policy, the buffer is measured into the ima log.
+ + * Based on policy, either the buffer data or buffer data hash is measured
    */
- void process_buffer_measurement(struct inode *inode, const void *buf, int size,
+ void process_buffer_measurement(struct user_namespace *mnt_userns,
+                               struct inode *inode, const void *buf, int size,
                                 const char *eventname, enum ima_hooks func,
- -                              int pcr, const char *keyring)
+ +                              int pcr, const char *func_data,
+ +                              bool buf_hash)
   {
         int ret = 0;
         const char *audit_cause = "ENOMEM";
@@@ -839,8 -846,6 +848,8 @@@
                 struct ima_digest_data hdr;
                 char digest[IMA_MAX_DIGEST_SIZE];
         } hash = {};
+ +      char digest_hash[IMA_MAX_DIGEST_SIZE];
+ +      int digest_hash_len = hash_digest_size[ima_hash_algo];
         int violation = 0;
         int action = 0;
         u32 secid;
@@@ -864,8 -869,9 +873,9 @@@
          */
         if (func) {
                 security_task_getsecid(current, &secid);
-               action = ima_get_action(inode, current_cred(), secid, 0, func,
-                                       &pcr, &template, func_data);
+               action = ima_get_action(mnt_userns, inode, current_cred(),
+                                       secid, 0, func, &pcr, &template,
- -                                      keyring);
++                                      func_data);
                 if (!(action & IMA_MEASURE))
                         return;
         }
@@@ -883,27 -889,13 +893,27 @@@
                 goto out;
         }
   
+ +      if (buf_hash) {
+ +              memcpy(digest_hash, hash.hdr.digest, digest_hash_len);
+ +
+ +              ret = ima_calc_buffer_hash(digest_hash, digest_hash_len,
+ +                                         iint.ima_hash);
+ +              if (ret < 0) {
+ +                      audit_cause = "hashing_error";
+ +                      goto out;
+ +              }
+ +
+ +              event_data.buf = digest_hash;
+ +              event_data.buf_len = digest_hash_len;
+ +      }
+ +
         ret = ima_alloc_init_template(&event_data, &entry, template);
         if (ret < 0) {
                 audit_cause = "alloc_entry";
                 goto out;
         }
   
- -      ret = ima_store_template(entry, violation, NULL, buf, pcr);
+ +      ret = ima_store_template(entry, violation, NULL, event_data.buf, pcr);
         if (ret < 0) {
                 audit_cause = "store_entry";
                 ima_free_template_entry(entry);
@@@ -937,38 -929,12 +947,38 @@@ void ima_kexec_cmdline(int kernel_fd, c
         if (!f.file)
                 return;
   
-       process_buffer_measurement(file_inode(f.file), buf, size,
-                                  "kexec-cmdline", KEXEC_CMDLINE, 0, NULL,
-                                  false);
+       process_buffer_measurement(file_mnt_user_ns(f.file), file_inode(f.file),
+                                  buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
- -                                 NULL);
++                                 NULL, false);
         fdput(f);
   }
   
-       process_buffer_measurement(NULL, buf, buf_len, event_name,
+ +/**
+ + * ima_measure_critical_data - measure kernel integrity critical data
+ + * @event_label: unique event label for grouping and limiting critical data
+ + * @event_name: event name for the record in the IMA measurement list
+ + * @buf: pointer to buffer data
+ + * @buf_len: length of buffer data (in bytes)
+ + * @hash: measure buffer data hash
+ + *
+ + * Measure data critical to the integrity of the kernel into the IMA log
+ + * and extend the pcr.  Examples of critical data could be various data
+ + * structures, policies, and states stored in kernel memory that can
+ + * impact the integrity of the system.
+ + */
+ +void ima_measure_critical_data(const char *event_label,
+ +                             const char *event_name,
+ +                             const void *buf, size_t buf_len,
+ +                             bool hash)
+ +{
+ +      if (!event_name || !event_label || !buf || !buf_len)
+ +              return;
+ +
++      process_buffer_measurement(&init_user_ns, NULL, buf, buf_len, event_name,
+ +                                 CRITICAL_DATA, 0, event_label,
+ +                                 hash);
+ +}
+ +
   static int __init init_ima(void)
   {
         int error;
diff --combined security/integrity/ima/ima_policy.c

index 9b45d06,e14426c..4f8cb15
--- 1/security/integrity/ima/ima_policy.c
--- 2/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@@ -34,7 -34,6 +34,7 @@@
   #define IMA_PCR               0x0100
   #define IMA_FSNAME    0x0200
   #define IMA_KEYRINGS  0x0400
+ +#define IMA_LABEL     0x0800
   
   #define UNKNOWN               0
   #define MEASURE               0x0001  /* same as IMA_MEASURE */
@@@ -86,7 -85,6 +86,7 @@@ struct ima_rule_entry 
         } lsm[MAX_LSM_RULES];
         char *fsname;
         struct ima_rule_opt_list *keyrings; /* Measure keys added to these keyrings */
+ +      struct ima_rule_opt_list *label; /* Measure data grouped under this label */
         struct ima_template_desc *template;
   };
   
@@@ -206,10 -204,6 +206,10 @@@ static struct ima_rule_entry secure_boo
          .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED},
   };
   
+ +static struct ima_rule_entry critical_data_rules[] __ro_after_init = {
+ +      {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC},
+ +};
+ +
   /* An array of architecture specific rules */
   static struct ima_rule_entry *arch_policy_entry __ro_after_init;
   
@@@ -232,7 -226,6 +232,7 @@@ __setup("ima_tcb", default_measure_poli
   
   static bool ima_use_appraise_tcb __initdata;
   static bool ima_use_secure_boot __initdata;
+ +static bool ima_use_critical_data __initdata;
   static bool ima_fail_unverifiable_sigs __ro_after_init;
   static int __init policy_setup(char *str)
   {
@@@ -247,8 -240,6 +247,8 @@@
                         ima_use_appraise_tcb = true;
                 else if (strcmp(p, "secure_boot") == 0)
                         ima_use_secure_boot = true;
+ +              else if (strcmp(p, "critical_data") == 0)
+ +                      ima_use_critical_data = true;
                 else if (strcmp(p, "fail_securely") == 0)
                         ima_fail_unverifiable_sigs = true;
                 else
@@@ -462,46 -453,30 +462,46 @@@ int ima_lsm_policy_change(struct notifi
   }
   
   /**
- - * ima_match_keyring - determine whether the keyring matches the measure rule
+ + * ima_match_rule_data - determine whether func_data matches the policy rule
    * @rule: a pointer to a rule
- - * @keyring: name of the keyring to match against the measure rule
+ + * @func_data: data to match against the measure rule data
    * @cred: a pointer to a credentials structure for user validation
    *
- - * Returns true if keyring matches one in the rule, false otherwise.
+ + * Returns true if func_data matches one in the rule, false otherwise.
    */
- -static bool ima_match_keyring(struct ima_rule_entry *rule,
- -                            const char *keyring, const struct cred *cred)
+ +static bool ima_match_rule_data(struct ima_rule_entry *rule,
+ +                              const char *func_data,
+ +                              const struct cred *cred)
   {
+ +      const struct ima_rule_opt_list *opt_list = NULL;
         bool matched = false;
         size_t i;
   
         if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid))
                 return false;
   
- -      if (!rule->keyrings)
- -              return true;
+ +      switch (rule->func) {
+ +      case KEY_CHECK:
+ +              if (!rule->keyrings)
+ +                      return true;
+ +
+ +              opt_list = rule->keyrings;
+ +              break;
+ +      case CRITICAL_DATA:
+ +              if (!rule->label)
+ +                      return true;
+ +
+ +              opt_list = rule->label;
+ +              break;
+ +      default:
+ +              return false;
+ +      }
   
- -      if (!keyring)
+ +      if (!func_data)
                 return false;
   
- -      for (i = 0; i < rule->keyrings->count; i++) {
- -              if (!strcmp(rule->keyrings->items[i], keyring)) {
+ +      for (i = 0; i < opt_list->count; i++) {
+ +              if (!strcmp(opt_list->items[i], func_data)) {
                         matched = true;
                         break;
                 }
@@@ -513,35 -488,31 +513,37 @@@
   /**
    * ima_match_rules - determine whether an inode matches the policy rule.
    * @rule: a pointer to a rule
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode: a pointer to an inode
    * @cred: a pointer to a credentials structure for user validation
    * @secid: the secid of the task to be validated
    * @func: LIM hook identifier
    * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
- - * @keyring: keyring name to check in policy for KEY_CHECK func
+ + * @func_data: func specific data, may be NULL
    *
    * Returns true on rule match, false on failure.
    */
- static bool ima_match_rules(struct ima_rule_entry *rule, struct inode *inode,
-                           const struct cred *cred, u32 secid,
-                           enum ima_hooks func, int mask,
+ static bool ima_match_rules(struct ima_rule_entry *rule,
+                           struct user_namespace *mnt_userns,
+                           struct inode *inode, const struct cred *cred,
+                           u32 secid, enum ima_hooks func, int mask,
- -                          const char *keyring)
+ +                          const char *func_data)
   {
         int i;
   
- -      if (func == KEY_CHECK) {
- -              return (rule->flags & IMA_FUNC) && (rule->func == func) &&
- -                     ima_match_keyring(rule, keyring, cred);
- -      }
         if ((rule->flags & IMA_FUNC) &&
             (rule->func != func && func != POST_SETATTR))
                 return false;
+ +
+ +      switch (func) {
+ +      case KEY_CHECK:
+ +      case CRITICAL_DATA:
+ +              return ((rule->func == func) &&
+ +                      ima_match_rule_data(rule, func_data, cred));
+ +      default:
+ +              break;
+ +      }
+ +
         if ((rule->flags & IMA_MASK) &&
             (rule->mask != mask && func != POST_SETATTR))
                 return false;
@@@ -570,7 -541,7 +572,7 @@@
         }
   
         if ((rule->flags & IMA_FOWNER) &&
-           !rule->fowner_op(inode->i_uid, rule->fowner))
+           !rule->fowner_op(i_uid_into_mnt(mnt_userns, inode), rule->fowner))
                 return false;
         for (i = 0; i < MAX_LSM_RULES; i++) {
                 int rc = 0;
@@@ -633,6 -604,7 +635,7 @@@ static int get_subaction(struct ima_rul
   
   /**
    * ima_match_policy - decision based on LSM and other conditions
+  * @mnt_userns:       user namespace of the mount the inode was found from
    * @inode: pointer to an inode for which the policy decision is being made
    * @cred: pointer to a credentials structure for which the policy decision is
    *        being made
@@@ -641,7 -613,8 +644,7 @@@
    * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
    * @pcr: set the pcr to extend
    * @template_desc: the template that should be used for this rule
- - * @keyring: the keyring name, if given, to be used to check in the policy.
- - *           keyring can be NULL if func is anything other than KEY_CHECK.
+ + * @func_data: func specific data, may be NULL
    *
    * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type)
    * conditions.
@@@ -650,10 -623,11 +653,11 @@@
    * list when walking it.  Reads are many orders of magnitude more numerous
    * than writes so ima_match_policy() is classical RCU candidate.
    */
- int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid,
-                    enum ima_hooks func, int mask, int flags, int *pcr,
+ int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode,
+                    const struct cred *cred, u32 secid, enum ima_hooks func,
+                    int mask, int flags, int *pcr,
                      struct ima_template_desc **template_desc,
- -                   const char *keyring)
+ +                   const char *func_data)
   {
         struct ima_rule_entry *entry;
         int action = 0, actmask = flags | (flags << 1);
@@@ -667,8 -641,8 +671,8 @@@
                 if (!(entry->action & actmask))
                         continue;
   
-               if (!ima_match_rules(entry, inode, cred, secid, func, mask,
-                                    func_data))
+               if (!ima_match_rules(entry, mnt_userns, inode, cred, secid,
- -                                   func, mask, keyring))
++                                   func, mask, func_data))
                         continue;
   
                 action |= entry->flags & IMA_ACTION_FLAGS;
@@@ -878,11 -852,6 +882,11 @@@ void __init ima_init_policy(void
                           ARRAY_SIZE(default_appraise_rules),
                           IMA_DEFAULT_POLICY);
   
+ +      if (ima_use_critical_data)
+ +              add_rules(critical_data_rules,
+ +                        ARRAY_SIZE(critical_data_rules),
+ +                        IMA_DEFAULT_POLICY);
+ +
         ima_update_policy_flag();
   }
   
@@@ -942,7 -911,7 +946,7 @@@ enum 
         Opt_uid_lt, Opt_euid_lt, Opt_fowner_lt,
         Opt_appraise_type, Opt_appraise_flag,
         Opt_permit_directio, Opt_pcr, Opt_template, Opt_keyrings,
- -      Opt_err
+ +      Opt_label, Opt_err
   };
   
   static const match_table_t policy_tokens = {
@@@ -979,7 -948,6 +983,7 @@@
         {Opt_pcr, "pcr=%s"},
         {Opt_template, "template=%s"},
         {Opt_keyrings, "keyrings=%s"},
+ +      {Opt_label, "label=%s"},
         {Opt_err, NULL}
   };
   
@@@ -1143,18 -1111,6 +1147,18 @@@ static bool ima_validate_rule(struct im
                         return false;
   
                 break;
+ +      case CRITICAL_DATA:
+ +              if (entry->action & ~(MEASURE | DONT_MEASURE))
+ +                      return false;
+ +
+ +              if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_PCR |
+ +                                   IMA_LABEL))
+ +                      return false;
+ +
+ +              if (ima_rule_contains_lsm_cond(entry))
+ +                      return false;
+ +
+ +              break;
         default:
                 return false;
         }
@@@ -1286,8 -1242,6 +1290,8 @@@ static int ima_parse_rule(char *rule, s
                         else if (IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) &&
                                  strcmp(args[0].from, "KEY_CHECK") == 0)
                                 entry->func = KEY_CHECK;
+ +                      else if (strcmp(args[0].from, "CRITICAL_DATA") == 0)
+ +                              entry->func = CRITICAL_DATA;
                         else
                                 result = -EINVAL;
                         if (!result)
@@@ -1358,23 -1312,6 +1362,23 @@@
   
                         entry->flags |= IMA_KEYRINGS;
                         break;
+ +              case Opt_label:
+ +                      ima_log_string(ab, "label", args[0].from);
+ +
+ +                      if (entry->label) {
+ +                              result = -EINVAL;
+ +                              break;
+ +                      }
+ +
+ +                      entry->label = ima_alloc_rule_opt_list(args);
+ +                      if (IS_ERR(entry->label)) {
+ +                              result = PTR_ERR(entry->label);
+ +                              entry->label = NULL;
+ +                              break;
+ +                      }
+ +
+ +                      entry->flags |= IMA_LABEL;
+ +                      break;
                 case Opt_fsuuid:
                         ima_log_string(ab, "fsuuid", args[0].from);
   
@@@ -1755,12 -1692,6 +1759,12 @@@ int ima_policy_show(struct seq_file *m
                 seq_puts(m, " ");
         }
   
+ +      if (entry->flags & IMA_LABEL) {
+ +              seq_puts(m, "label=");
+ +              ima_show_rule_opt_list(m, entry->label);
+ +              seq_puts(m, " ");
+ +      }
+ +
         if (entry->flags & IMA_PCR) {
                 snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr);
                 seq_printf(m, pt(Opt_pcr), tbuf);
diff --combined security/integrity/ima/ima_queue_keys.c

index c2f2ad3,ca3dea1..979ef6c
--- 1/security/integrity/ima/ima_queue_keys.c
--- 2/security/integrity/ima/ima_queue_keys.c
+++ b/security/integrity/ima/ima_queue_keys.c
@@@ -8,6 -8,7 +8,7 @@@
    *       Enables deferred processing of keys
    */
   
+ #include <linux/user_namespace.h>
   #include <linux/workqueue.h>
   #include <keys/asymmetric-type.h>
   #include "ima.h"
@@@ -158,12 -159,12 +159,13 @@@ void ima_process_queued_keys(void
   
         list_for_each_entry_safe(entry, tmp, &ima_keys, list) {
                 if (!timer_expired)
-                       process_buffer_measurement(NULL, entry->payload,
+                       process_buffer_measurement(&init_user_ns, NULL,
+                                                  entry->payload,
                                                    entry->payload_len,
                                                    entry->keyring_name,
                                                    KEY_CHECK, 0,
- -                                                 entry->keyring_name);
+ +                                                 entry->keyring_name,
+ +                                                 false);
                 list_del(&entry->list);
                 ima_free_key_entry(entry);
         }
diff --combined security/security.c

index 401663b,698a9f1..5ac96b1
--- 1/security/security.c
--- 2/security/security.c
+++ b/security/security.c
@@@ -1059,14 -1059,6 +1059,14 @@@ out
   }
   EXPORT_SYMBOL(security_inode_init_security);
   
+ +int security_inode_init_security_anon(struct inode *inode,
+ +                                    const struct qstr *name,
+ +                                    const struct inode *context_inode)
+ +{
+ +      return call_int_hook(inode_init_security_anon, 0, inode, name,
+ +                           context_inode);
+ +}
+ +
   int security_old_inode_init_security(struct inode *inode, struct inode *dir,
                                      const struct qstr *qstr, const char **name,
                                      void **value, size_t *len)
@@@ -1288,7 -1280,8 +1288,8 @@@ int security_inode_getattr(const struc
         return call_int_hook(inode_getattr, 0, path);
   }
   
- int security_inode_setxattr(struct dentry *dentry, const char *name,
+ int security_inode_setxattr(struct user_namespace *mnt_userns,
+                           struct dentry *dentry, const char *name,
                             const void *value, size_t size, int flags)
   {
         int ret;
@@@ -1299,8 -1292,8 +1300,8 @@@
          * SELinux and Smack integrate the cap call,
          * so assume that all LSMs supplying this call do so.
          */
-       ret = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
-                               flags);
+       ret = call_int_hook(inode_setxattr, 1, mnt_userns, dentry, name, value,
+                           size, flags);
   
         if (ret == 1)
                 ret = cap_inode_setxattr(dentry, name, value, size, flags);
@@@ -1335,7 -1328,8 +1336,8 @@@ int security_inode_listxattr(struct den
         return call_int_hook(inode_listxattr, 0, dentry);
   }
   
- int security_inode_removexattr(struct dentry *dentry, const char *name)
+ int security_inode_removexattr(struct user_namespace *mnt_userns,
+                              struct dentry *dentry, const char *name)
   {
         int ret;
   
@@@ -1345,9 -1339,9 +1347,9 @@@
          * SELinux and Smack integrate the cap call,
          * so assume that all LSMs supplying this call do so.
          */
-       ret = call_int_hook(inode_removexattr, 1, dentry, name);
+       ret = call_int_hook(inode_removexattr, 1, mnt_userns, dentry, name);
         if (ret == 1)
-               ret = cap_inode_removexattr(dentry, name);
+               ret = cap_inode_removexattr(mnt_userns, dentry, name);
         if (ret)
                 return ret;
         ret = ima_inode_removexattr(dentry, name);
@@@ -1361,12 -1355,15 +1363,15 @@@ int security_inode_need_killpriv(struc
         return call_int_hook(inode_need_killpriv, 0, dentry);
   }
   
- int security_inode_killpriv(struct dentry *dentry)
+ int security_inode_killpriv(struct user_namespace *mnt_userns,
+                           struct dentry *dentry)
   {
-       return call_int_hook(inode_killpriv, 0, dentry);
+       return call_int_hook(inode_killpriv, 0, mnt_userns, dentry);
   }
   
- int security_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+ int security_inode_getsecurity(struct user_namespace *mnt_userns,
+                              struct inode *inode, const char *name,
+                              void **buffer, bool alloc)
   {
         struct security_hook_list *hp;
         int rc;
@@@ -1377,7 -1374,7 +1382,7 @@@
          * Only one module will provide an attribute with a given name.
          */
         hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) {
-               rc = hp->hook.inode_getsecurity(inode, name, buffer, alloc);
+               rc = hp->hook.inode_getsecurity(mnt_userns, inode, name, buffer, alloc);
                 if (rc != LSM_RET_DEFAULT(inode_getsecurity))
                         return rc;
         }
diff --combined security/selinux/hooks.c

index af2994a,9719dd1..ddd0977
--- 1/security/selinux/hooks.c
--- 2/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@@ -484,55 -484,6 +484,55 @@@ static int selinux_is_sblabel_mnt(struc
         }
   }
   
+ +static int sb_check_xattr_support(struct super_block *sb)
+ +{
+ +      struct superblock_security_struct *sbsec = sb->s_security;
+ +      struct dentry *root = sb->s_root;
+ +      struct inode *root_inode = d_backing_inode(root);
+ +      u32 sid;
+ +      int rc;
+ +
+ +      /*
+ +       * Make sure that the xattr handler exists and that no
+ +       * error other than -ENODATA is returned by getxattr on
+ +       * the root directory.  -ENODATA is ok, as this may be
+ +       * the first boot of the SELinux kernel before we have
+ +       * assigned xattr values to the filesystem.
+ +       */
+ +      if (!(root_inode->i_opflags & IOP_XATTR)) {
+ +              pr_warn("SELinux: (dev %s, type %s) has no xattr support\n",
+ +                      sb->s_id, sb->s_type->name);
+ +              goto fallback;
+ +      }
+ +
+ +      rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
+ +      if (rc < 0 && rc != -ENODATA) {
+ +              if (rc == -EOPNOTSUPP) {
+ +                      pr_warn("SELinux: (dev %s, type %s) has no security xattr handler\n",
+ +                              sb->s_id, sb->s_type->name);
+ +                      goto fallback;
+ +              } else {
+ +                      pr_warn("SELinux: (dev %s, type %s) getxattr errno %d\n",
+ +                              sb->s_id, sb->s_type->name, -rc);
+ +                      return rc;
+ +              }
+ +      }
+ +      return 0;
+ +
+ +fallback:
+ +      /* No xattr support - try to fallback to genfs if possible. */
+ +      rc = security_genfs_sid(&selinux_state, sb->s_type->name, "/",
+ +                              SECCLASS_DIR, &sid);
+ +      if (rc)
+ +              return -EOPNOTSUPP;
+ +
+ +      pr_warn("SELinux: (dev %s, type %s) falling back to genfs\n",
+ +              sb->s_id, sb->s_type->name);
+ +      sbsec->behavior = SECURITY_FS_USE_GENFS;
+ +      sbsec->sid = sid;
+ +      return 0;
+ +}
+ +
   static int sb_finish_set_opts(struct super_block *sb)
   {
         struct superblock_security_struct *sbsec = sb->s_security;
@@@ -541,9 -492,30 +541,9 @@@
         int rc = 0;
   
         if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
- -              /* Make sure that the xattr handler exists and that no
- -                 error other than -ENODATA is returned by getxattr on
- -                 the root directory.  -ENODATA is ok, as this may be
- -                 the first boot of the SELinux kernel before we have
- -                 assigned xattr values to the filesystem. */
- -              if (!(root_inode->i_opflags & IOP_XATTR)) {
- -                      pr_warn("SELinux: (dev %s, type %s) has no "
- -                             "xattr support\n", sb->s_id, sb->s_type->name);
- -                      rc = -EOPNOTSUPP;
- -                      goto out;
- -              }
- -
- -              rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
- -              if (rc < 0 && rc != -ENODATA) {
- -                      if (rc == -EOPNOTSUPP)
- -                              pr_warn("SELinux: (dev %s, type "
- -                                     "%s) has no security xattr handler\n",
- -                                     sb->s_id, sb->s_type->name);
- -                      else
- -                              pr_warn("SELinux: (dev %s, type "
- -                                     "%s) getxattr errno %d\n", sb->s_id,
- -                                     sb->s_type->name, -rc);
- -                      goto out;
- -              }
+ +              rc = sb_check_xattr_support(sb);
+ +              if (rc)
+ +                      return rc;
         }
   
         sbsec->flags |= SE_SBINITIALIZED;
@@@ -582,6 -554,7 +582,6 @@@
                 spin_lock(&sbsec->isec_lock);
         }
         spin_unlock(&sbsec->isec_lock);
- -out:
         return rc;
   }
   
@@@ -1147,8 -1120,7 +1147,8 @@@ static inline u16 inode_mode_to_securit
   
   static inline int default_protocol_stream(int protocol)
   {
- -      return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP);
+ +      return (protocol == IPPROTO_IP || protocol == IPPROTO_TCP ||
+ +              protocol == IPPROTO_MPTCP);
   }
   
   static inline int default_protocol_dgram(int protocol)
@@@ -2962,62 -2934,6 +2962,62 @@@ static int selinux_inode_init_security(
         return 0;
   }
   
+ +static int selinux_inode_init_security_anon(struct inode *inode,
+ +                                          const struct qstr *name,
+ +                                          const struct inode *context_inode)
+ +{
+ +      const struct task_security_struct *tsec = selinux_cred(current_cred());
+ +      struct common_audit_data ad;
+ +      struct inode_security_struct *isec;
+ +      int rc;
+ +
+ +      if (unlikely(!selinux_initialized(&selinux_state)))
+ +              return 0;
+ +
+ +      isec = selinux_inode(inode);
+ +
+ +      /*
+ +       * We only get here once per ephemeral inode.  The inode has
+ +       * been initialized via inode_alloc_security but is otherwise
+ +       * untouched.
+ +       */
+ +
+ +      if (context_inode) {
+ +              struct inode_security_struct *context_isec =
+ +                      selinux_inode(context_inode);
+ +              if (context_isec->initialized != LABEL_INITIALIZED) {
+ +                      pr_err("SELinux:  context_inode is not initialized");
+ +                      return -EACCES;
+ +              }
+ +
+ +              isec->sclass = context_isec->sclass;
+ +              isec->sid = context_isec->sid;
+ +      } else {
+ +              isec->sclass = SECCLASS_ANON_INODE;
+ +              rc = security_transition_sid(
+ +                      &selinux_state, tsec->sid, tsec->sid,
+ +                      isec->sclass, name, &isec->sid);
+ +              if (rc)
+ +                      return rc;
+ +      }
+ +
+ +      isec->initialized = LABEL_INITIALIZED;
+ +      /*
+ +       * Now that we've initialized security, check whether we're
+ +       * allowed to actually create this type of anonymous inode.
+ +       */
+ +
+ +      ad.type = LSM_AUDIT_DATA_INODE;
+ +      ad.u.inode = inode;
+ +
+ +      return avc_has_perm(&selinux_state,
+ +                          tsec->sid,
+ +                          isec->sid,
+ +                          isec->sclass,
+ +                          FILE__CREATE,
+ +                          &ad);
+ +}
+ +
   static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
   {
         return may_create(dir, dentry, SECCLASS_FILE);
@@@ -3203,7 -3119,8 +3203,8 @@@ static bool has_cap_mac_admin(bool audi
         return true;
   }
   
- static int selinux_inode_setxattr(struct dentry *dentry, const char *name,
+ static int selinux_inode_setxattr(struct user_namespace *mnt_userns,
+                                 struct dentry *dentry, const char *name,
                                   const void *value, size_t size, int flags)
   {
         struct inode *inode = d_backing_inode(dentry);
@@@ -3224,13 -3141,13 +3225,13 @@@
         }
   
         if (!selinux_initialized(&selinux_state))
-               return (inode_owner_or_capable(inode) ? 0 : -EPERM);
+               return (inode_owner_or_capable(mnt_userns, inode) ? 0 : -EPERM);
   
         sbsec = inode->i_sb->s_security;
         if (!(sbsec->flags & SBLABEL_MNT))
                 return -EOPNOTSUPP;
   
-       if (!inode_owner_or_capable(inode))
+       if (!inode_owner_or_capable(mnt_userns, inode))
                 return -EPERM;
   
         ad.type = LSM_AUDIT_DATA_DENTRY;
@@@ -3351,10 -3268,11 +3352,11 @@@ static int selinux_inode_listxattr(stru
         return dentry_has_perm(cred, dentry, FILE__GETATTR);
   }
   
- static int selinux_inode_removexattr(struct dentry *dentry, const char *name)
+ static int selinux_inode_removexattr(struct user_namespace *mnt_userns,
+                                    struct dentry *dentry, const char *name)
   {
         if (strcmp(name, XATTR_NAME_SELINUX)) {
-               int rc = cap_inode_removexattr(dentry, name);
+               int rc = cap_inode_removexattr(mnt_userns, dentry, name);
                 if (rc)
                         return rc;
   
@@@ -3420,7 -3338,9 +3422,9 @@@ static int selinux_path_notify(const st
    *
    * Permission check is handled by selinux_inode_getxattr hook.
    */
- static int selinux_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+ static int selinux_inode_getsecurity(struct user_namespace *mnt_userns,
+                                    struct inode *inode, const char *name,
+                                    void **buffer, bool alloc)
   {
         u32 size;
         int error;
@@@ -3497,10 -3417,6 +3501,10 @@@ static int selinux_inode_setsecurity(st
   static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
   {
         const int len = sizeof(XATTR_NAME_SELINUX);
+ +
+ +      if (!selinux_initialized(&selinux_state))
+ +              return 0;
+ +
         if (buffer && len <= buffer_size)
                 memcpy(buffer, XATTR_NAME_SELINUX, len);
         return len;
@@@ -6614,14 -6530,15 +6618,15 @@@ static int selinux_inode_notifysecctx(s
    */
   static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
   {
-       return __vfs_setxattr_noperm(dentry, XATTR_NAME_SELINUX, ctx, ctxlen, 0);
+       return __vfs_setxattr_noperm(&init_user_ns, dentry, XATTR_NAME_SELINUX,
+                                    ctx, ctxlen, 0);
   }
   
   static int selinux_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
   {
         int len = 0;
-       len = selinux_inode_getsecurity(inode, XATTR_SELINUX_SUFFIX,
-                                               ctx, true);
+       len = selinux_inode_getsecurity(&init_user_ns, inode,
+                                       XATTR_SELINUX_SUFFIX, ctx, true);
         if (len < 0)
                 return len;
         *ctxlen = len;
@@@ -7088,7 -7005,6 +7093,7 @@@ static struct security_hook_list selinu
   
         LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security),
         LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security),
+ +      LSM_HOOK_INIT(inode_init_security_anon, selinux_inode_init_security_anon),
         LSM_HOOK_INIT(inode_create, selinux_inode_create),
         LSM_HOOK_INIT(inode_link, selinux_inode_link),
         LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink),
diff --combined tools/testing/selftests/Makefile

index 41f0a0a,157179a..6c575cf
--- 1/tools/testing/selftests/Makefile
--- 2/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@@ -33,8 -33,8 +33,9 @@@ TARGETS += memf
   TARGETS += memory-hotplug
   TARGETS += mincore
   TARGETS += mount
+ TARGETS += mount_setattr
   TARGETS += mqueue
+ +TARGETS += nci
   TARGETS += net
   TARGETS += net/forwarding
   TARGETS += net/mptcp
@@@ -126,6 -126,15 +127,6 @@@ ARCH           ?= $(SUBARCH
   export KSFT_KHDR_INSTALL_DONE := 1
   export BUILD
   
- -# build and run gpio when output directory is the src dir.
- -# gpio has dependency on tools/gpio and builds tools/gpio
- -# objects in the src directory in all cases making the src
- -# repo dirty even when objects are relocated.
- -ifneq (1,$(DEFAULT_INSTALL_HDR_PATH))
- -      TMP := $(filter-out gpio, $(TARGETS))
- -      TARGETS := $(TMP)
- -endif
- -
   # set default goal to all, so make without a target runs all, even when
   # all isn't the first target in the file.
   .DEFAULT_GOAL := all
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 23 Feb 2021 21:39:45 +0000 (13:39 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 23 Feb 2021 21:39:45 +0000 (13:39 -0800)
		1	2
Documentation/filesystems/porting.rst	patch \|	diff1 \|	diff2 \|	blob \| history
Documentation/filesystems/vfs.rst	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/syscalls/syscall.tbl	patch \|	diff1 \|	diff2 \|	blob \| history
fs/affs/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/ctree.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/btrfs/ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ceph/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ceph/super.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/cifs/cifsfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/cifs/dir.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ecryptfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exfat/exfat_fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exfat/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/ialloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/f2fs/acl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/f2fs/f2fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/f2fs/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/f2fs/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/f2fs/xattr.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/fat/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/fcntl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/gfs2/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/hfsplus/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/hostfs/hostfs_kern.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/hugetlbfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/internal.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/libfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfsd/export.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfsd/nfs2acl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfsd/nfs3acl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfsd/nfsfh.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfsd/nfsproc.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/nfsd/vfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/notify/fanotify/fanotify_user.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/notify/inotify/inotify_user.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/open.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/copy_up.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/dir.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/overlayfs.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/util.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc/proc_sysctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ubifs/xattr.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_file.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_iops.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_qm.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_symlink.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/zonefs/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/ima.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/lsm_hook_defs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/lsm_hooks.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/security.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/syscalls.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/auditsc.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sys.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/madvise.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/shmem.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/socket.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/commoncap.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/evm/evm_crypto.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/ima/ima.h	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/ima/ima_api.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/ima/ima_appraise.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/ima/ima_asymmetric_keys.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/ima/ima_main.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/ima/ima_policy.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/ima/ima_queue_keys.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/security.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/selinux/hooks.c	patch \|	diff1 \|	diff2 \|	blob \| history
tools/testing/selftests/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history