Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 16:33:08 +0000 (09:33 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 16:33:08 +0000 (09:33 -0700)
Pull vfs dedup fixes from Dave Chinner:
 "This reworks the vfs data cloning infrastructure.

  We discovered many issues with these interfaces late in the 4.19 cycle
  - the worst of them (data corruption, setuid stripping) were fixed for
  XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
  the problems was needed. That rework is the contents of this pull
  request.

  Rework the vfs_clone_file_range and vfs_dedupe_file_range
  infrastructure to use a common .remap_file_range method and supply
  generic bounds and sanity checking functions that are shared with the
  data write path. The current VFS infrastructure has problems with
  rlimit, LFS file sizes, file time stamps, maximum filesystem file
  sizes, stripping setuid bits, etc., all of which are addressed in
  these commits.

  We also introduce the ability for the ->remap_file_range methods to
  return short clones, so that clones issued via vfs_copy_file_range()
  don't get rejected if the entire range can't be cloned. It also
  allows filesystems to silently skip deduplication of partial EOF
  blocks when they cannot handle them, rather than forcing errors out
  to userspace.

  Existing filesystems are converted to use the new ->remap_file_range
  method, and both XFS and ocfs2 are modified to make use of the new
  generic checking infrastructure"
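
For reference, the unified method described above has the following
shape. This is a minimal sketch modelled on the btrfs conversion in the
diff below; example_clone_extents() and example_dedupe_extents() are
hypothetical stand-ins for a filesystem's own remap machinery:

	loff_t example_remap_file_range(struct file *file_in, loff_t pos_in,
					struct file *file_out, loff_t pos_out,
					loff_t len, unsigned int remap_flags)
	{
		int ret;

		/* Reject any remap flags this filesystem doesn't understand. */
		if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
			return -EINVAL;

		if (remap_flags & REMAP_FILE_DEDUP)
			ret = example_dedupe_extents(file_in, pos_in,
						     file_out, pos_out, len);
		else
			ret = example_clone_extents(file_in, pos_in,
						    file_out, pos_out, len);

		/* On success, report how many bytes were actually remapped. */
		return ret < 0 ? ret : len;
	}

Callers that cannot tolerate a short operation now check the returned
byte count themselves, as the FICLONERANGE path in the fs/ioctl.c hunk
below does:

	loff_t cloned;
	int ret;

	cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
				      olen, 0);
	if (cloned < 0)
		ret = cloned;
	else if (olen && cloned != olen)
		ret = -EINVAL;	/* caller cannot accept a short clone */
	else
		ret = 0;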

* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
  xfs: remove [cm]time update from reflink calls
  xfs: remove xfs_reflink_remap_range
  xfs: remove redundant remap partial EOF block checks
  xfs: support returning partial reflink results
  xfs: clean up xfs_reflink_remap_blocks call site
  xfs: fix pagecache truncation prior to reflink
  ocfs2: remove ocfs2_reflink_remap_range
  ocfs2: support partial clone range and dedupe range
  ocfs2: fix pagecache truncation prior to reflink
  ocfs2: truncate page cache for clone destination file before remapping
  vfs: clean up generic_remap_file_range_prep return value
  vfs: hide file range comparison function
  vfs: enable remap callers that can handle short operations
  vfs: plumb remap flags through the vfs dedupe functions
  vfs: plumb remap flags through the vfs clone functions
  vfs: make remap_file_range functions take and return bytes completed
  vfs: remap helper should update destination inode metadata
  vfs: pass remap flags to generic_remap_checks
  vfs: pass remap flags to generic_remap_file_range_prep
  vfs: combine the clone and dedupe into a single remap_file_range
  ...

14 files changed:
Documentation/filesystems/porting
fs/btrfs/ctree.h
fs/btrfs/file.c
fs/btrfs/ioctl.c
fs/cifs/cifsfs.c
fs/ioctl.c
fs/nfsd/vfs.c
fs/ocfs2/refcounttree.c
fs/overlayfs/copy_up.c
fs/read_write.c
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h
include/linux/fs.h
mm/filemap.c

diff --combined Documentation/filesystems/porting
@@@ -623,13 -623,7 +623,18 @@@ in your dentry operations instead
        On success you get a new struct file sharing the mount/dentry with the
        original, on failure - ERR_PTR().
  --
+ [mandatory]
+       ->clone_file_range() and ->dedupe_file_range have been replaced with
+       ->remap_file_range().  See Documentation/filesystems/vfs.txt for more
+       information.
++--
 +[recommended]
 +      ->lookup() instances doing an equivalent of
 +              if (IS_ERR(inode))
 +                      return ERR_CAST(inode);
 +              return d_splice_alias(inode, dentry);
 +      don't need to bother with the check - d_splice_alias() will do the
 +      right thing when given ERR_PTR(...) as inode.  Moreover, passing NULL
 +      inode to d_splice_alias() will also do the right thing (equivalent of
 +      d_add(dentry, NULL); return NULL;), so that kind of special cases
 +      also doesn't need a separate treatment.
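
Following that recommendation, a ->lookup() instance can collapse to
something like the sketch below; foo_lookup() and foo_get_inode() are
hypothetical names standing in for a filesystem's own lookup machinery:

	static struct dentry *foo_lookup(struct inode *dir,
					 struct dentry *dentry,
					 unsigned int flags)
	{
		struct inode *inode;

		/* May return a valid inode, NULL, or ERR_PTR(...). */
		inode = foo_get_inode(dir, dentry);

		/* d_splice_alias() handles NULL and ERR_PTR(inode) itself. */
		return d_splice_alias(inode, dentry);
	}
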
diff --combined fs/btrfs/ctree.h
@@@ -41,6 -41,12 +41,6 @@@ extern struct kmem_cache *btrfs_path_ca
  extern struct kmem_cache *btrfs_free_space_cachep;
  struct btrfs_ordered_sum;
  
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -#define STATIC noinline
 -#else
 -#define STATIC static noinline
 -#endif
 -
  #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
  
  #define BTRFS_MAX_MIRRORS 3
@@@ -361,13 -367,11 +361,13 @@@ struct btrfs_dev_replace 
  
        struct mutex lock_finishing_cancel_unmount;
        rwlock_t lock;
 -      atomic_t read_locks;
        atomic_t blocking_readers;
        wait_queue_head_t read_lock_wq;
  
        struct btrfs_scrub_progress scrub_progress;
 +
 +      struct percpu_counter bio_counter;
 +      wait_queue_head_t replace_wait;
  };
  
  /* For raid type sysfs entries */
@@@ -1090,6 -1094,9 +1090,6 @@@ struct btrfs_fs_info 
        /* device replace state */
        struct btrfs_dev_replace dev_replace;
  
 -      struct percpu_counter bio_counter;
 -      wait_queue_head_t replace_wait;
 -
        struct semaphore uuid_tree_rescan_sem;
  
        /* Used to reclaim the metadata space in the background. */
@@@ -1195,12 -1202,18 +1195,12 @@@ struct btrfs_root 
        int last_log_commit;
        pid_t log_start_pid;
  
 -      u64 objectid;
        u64 last_trans;
  
        u32 type;
  
        u64 highest_objectid;
  
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -      /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
 -      u64 alloc_bytenr;
 -#endif
 -
        u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;
        spinlock_t qgroup_meta_rsv_lock;
        u64 qgroup_meta_rsv_pertrans;
        u64 qgroup_meta_rsv_prealloc;
 +
 +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 +      u64 alloc_bytenr;
 +#endif
  };
  
  struct btrfs_file_private {
@@@ -2598,8 -2607,10 +2598,8 @@@ static inline u64 btrfs_calc_trunc_meta
        return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
  }
  
 -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_fs_info *fs_info);
 -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_fs_info *fs_info);
 +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
 +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
  void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
                                         const u64 start);
  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@@ -2760,7 -2771,7 +2760,7 @@@ int btrfs_block_rsv_refill(struct btrfs
                           enum btrfs_reserve_flush_enum flush);
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
 -                          int update_size);
 +                          bool update_size);
  int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *dest, u64 num_bytes,
                             int min_factor);
@@@ -2866,6 -2877,8 +2866,6 @@@ void btrfs_release_path(struct btrfs_pa
  struct btrfs_path *btrfs_alloc_path(void);
  void btrfs_free_path(struct btrfs_path *p);
  void btrfs_set_path_blocking(struct btrfs_path *p);
 -void btrfs_clear_path_blocking(struct btrfs_path *p,
 -                             struct extent_buffer *held, int held_rw);
  void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
  
  int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@@ -3008,7 -3021,8 +3008,7 @@@ int btrfs_uuid_tree_iterate(struct btrf
  /* dir-item.c */
  int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
                          const char *name, int name_len);
 -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 -                        struct btrfs_root *root, const char *name,
 +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
                          int name_len, struct btrfs_inode *dir,
                          struct btrfs_key *location, u8 type, u64 index);
  struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
@@@ -3166,8 -3180,8 +3166,8 @@@ void __cold btrfs_destroy_cachep(void)
  struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                         struct btrfs_root *root, int *was_new);
  struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 -              struct page *page, size_t pg_offset,
 -              u64 start, u64 end, int create);
 +                                  struct page *page, size_t pg_offset,
 +                                  u64 start, u64 end, int create);
  int btrfs_update_inode(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct inode *inode);
@@@ -3187,6 -3201,9 +3187,6 @@@ int btrfs_prealloc_file_range_trans(str
                                    u64 start, u64 num_bytes, u64 min_size,
                                    loff_t actual_len, u64 *alloc_hint);
  extern const struct dentry_operations btrfs_dentry_operations;
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -void btrfs_test_inode_set_ops(struct inode *inode);
 -#endif
  
  /* ioctl.c */
  long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@@ -3201,9 -3218,6 +3201,6 @@@ void btrfs_get_block_group_info(struct 
                                struct btrfs_ioctl_space_info *space);
  void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_balance_args *bargs);
- int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-                           struct file *dst_file, loff_t dst_loff,
-                           u64 olen);
  
  /* file.c */
  int __init btrfs_auto_defrag_init(void);
@@@ -3233,8 -3247,9 +3230,9 @@@ int btrfs_dirty_pages(struct inode *ino
                      size_t num_pages, loff_t pos, size_t write_bytes,
                      struct extent_state **cached);
  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
- int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
-                          struct file *file_out, loff_t pos_out, u64 len);
+ loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+                             struct file *file_out, loff_t pos_out,
+                             loff_t len, unsigned int remap_flags);
  
  /* tree-defrag.c */
  int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@@ -3699,19 -3714,18 +3697,19 @@@ static inline int btrfs_defrag_cancelle
  
  /* Sanity test specific functions */
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 +void btrfs_test_inode_set_ops(struct inode *inode);
  void btrfs_test_destroy_inode(struct inode *inode);
 -#endif
  
  static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
  {
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -      if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
 -                            &fs_info->fs_state)))
 -              return 1;
 -#endif
 +      return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
 +}
 +#else
 +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
 +{
        return 0;
  }
 +#endif
  
  static inline void cond_wake_up(struct wait_queue_head *wq)
  {
diff --combined fs/btrfs/file.c
@@@ -531,14 -531,6 +531,14 @@@ int btrfs_dirty_pages(struct inode *ino
  
        end_of_last_block = start_pos + num_bytes - 1;
  
 +      /*
 +       * The pages may have already been dirty, clear out old accounting so
 +       * we can set things up properly
 +       */
 +      clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
 +                       EXTENT_DIRTY | EXTENT_DELALLOC |
 +                       EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
 +
        if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
                if (start_pos >= isize &&
                    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@@ -1508,27 -1500,18 +1508,27 @@@ lock_and_cleanup_extent_if_need(struct 
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
 -              clear_extent_bit(&inode->io_tree, start_pos, last_pos,
 -                               EXTENT_DIRTY | EXTENT_DELALLOC |
 -                               EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 -                               0, 0, cached_state);
 +
                *lockstart = start_pos;
                *lockend = last_pos;
                ret = 1;
        }
  
 +      /*
 +       * It's possible the pages are dirty right now, but we don't want
 +       * to clean them yet because copy_from_user may catch a page fault
 +       * and we might have to fall back to one page at a time.  If that
 +       * happens, we'll unlock these pages and we'd have a window where
 +       * reclaim could sneak in and drop the once-dirty page on the floor
 +       * without writing it.
 +       *
 +       * We have the pages locked and the extent range locked, so there's
 +       * no way someone can start IO on any dirty pages in this range.
 +       *
 +       * We'll call btrfs_dirty_pages() later on, and that will flip around
 +       * delalloc bits and dirty the pages as required.
 +       */
        for (i = 0; i < num_pages; i++) {
 -              if (clear_page_dirty_for_io(pages[i]))
 -                      account_page_redirty(pages[i]);
                set_page_extent_mapped(pages[i]);
                WARN_ON(!PageLocked(pages[i]));
        }
@@@ -2078,14 -2061,6 +2078,14 @@@ int btrfs_sync_file(struct file *file, 
                goto out;
  
        inode_lock(inode);
 +
 +      /*
 +       * We take the dio_sem here because the tree log stuff can race with
 +       * lockless dio writes and get an extent map logged for an extent we
 +       * never waited on.  We need it this high up for lockdep reasons.
 +       */
 +      down_write(&BTRFS_I(inode)->dio_sem);
 +
        atomic_inc(&root->log_batch);
  
        /*
         */
        ret = btrfs_wait_ordered_range(inode, start, len);
        if (ret) {
 +              up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
                 * checked called fsync.
                 */
                ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
 +              up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
 +              up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
 +      up_write(&BTRFS_I(inode)->dio_sem);
        inode_unlock(inode);
  
        /*
@@@ -2573,7 -2544,7 +2573,7 @@@ static int btrfs_punch_hole(struct inod
        }
  
        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
 -                                    min_size, 0);
 +                                    min_size, false);
        BUG_ON(ret);
        trans->block_rsv = rsv;
  
                }
  
                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
 -                                            rsv, min_size, 0);
 +                                            rsv, min_size, false);
                BUG_ON(ret);    /* shouldn't happen */
                trans->block_rsv = rsv;
  
@@@ -3298,8 -3269,7 +3298,7 @@@ const struct file_operations btrfs_file
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_compat_ioctl,
  #endif
-       .clone_file_range = btrfs_clone_file_range,
-       .dedupe_file_range = btrfs_dedupe_file_range,
+       .remap_file_range = btrfs_remap_file_range,
  };
  
  void __cold btrfs_auto_defrag_exit(void)
diff --combined fs/btrfs/ioctl.c
@@@ -491,6 -491,7 +491,6 @@@ static noinline int btrfs_ioctl_fitrim(
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
 -      u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EOPNOTSUPP;
        if (copy_from_user(&range, arg, sizeof(range)))
                return -EFAULT;
 -      if (range.start > total_bytes ||
 -          range.len < fs_info->sb->s_blocksize)
 +
 +      /*
 +       * NOTE: Don't truncate the range using super->total_bytes.  Bytenr of
 +       * block group is in the logical address space, which can be any
 +       * sectorsize aligned bytenr in  the range [0, U64_MAX].
 +       */
 +      if (range.len < fs_info->sb->s_blocksize)
                return -EINVAL;
  
 -      range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
        ret = btrfs_trim_fs(fs_info, &range);
        if (ret < 0)
@@@ -689,7 -686,8 +689,7 @@@ static noinline int create_subvol(struc
                goto fail;
        }
  
 -      ret = btrfs_insert_dir_item(trans, root,
 -                                  name, namelen, BTRFS_I(dir), &key,
 +      ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
                                    BTRFS_FT_DIR, index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
@@@ -1326,7 -1324,7 +1326,7 @@@ again
  
        if (i_done != page_cnt) {
                spin_lock(&BTRFS_I(inode)->lock);
 -              BTRFS_I(inode)->outstanding_extents++;
 +              btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
                spin_unlock(&BTRFS_I(inode)->lock);
                btrfs_delalloc_release_space(inode, data_reserved,
                                start_index << PAGE_SHIFT,
@@@ -3629,26 -3627,6 +3629,6 @@@ out_unlock
        return ret;
  }
  
- int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-                           struct file *dst_file, loff_t dst_loff,
-                           u64 olen)
- {
-       struct inode *src = file_inode(src_file);
-       struct inode *dst = file_inode(dst_file);
-       u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-       if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
-               /*
-                * Btrfs does not support blocksize < page_size. As a
-                * result, btrfs_cmp_data() won't correctly handle
-                * this situation without an update.
-                */
-               return -EINVAL;
-       }
-       return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
- }
  static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     u64 endoff,
@@@ -4350,10 -4328,34 +4330,34 @@@ out_unlock
        return ret;
  }
  
- int btrfs_clone_file_range(struct file *src_file, loff_t off,
-               struct file *dst_file, loff_t destoff, u64 len)
+ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+               struct file *dst_file, loff_t destoff, loff_t len,
+               unsigned int remap_flags)
  {
-       return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+       int ret;
+       if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+               return -EINVAL;
+       if (remap_flags & REMAP_FILE_DEDUP) {
+               struct inode *src = file_inode(src_file);
+               struct inode *dst = file_inode(dst_file);
+               u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+               if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
+                       /*
+                        * Btrfs does not support blocksize < page_size. As a
+                        * result, btrfs_cmp_data() won't correctly handle
+                        * this situation without an update.
+                        */
+                       return -EINVAL;
+               }
+               ret = btrfs_extent_same(src, off, len, dst, destoff);
+       } else {
+               ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+       }
+       return ret < 0 ? ret : len;
  }
  
  static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
                ret = PTR_ERR(new_root);
                goto out;
        }
 -      if (!is_fstree(new_root->objectid)) {
 +      if (!is_fstree(new_root->root_key.objectid)) {
                ret = -ENOENT;
                goto out;
        }
diff --combined fs/cifs/cifsfs.c
@@@ -81,14 -81,6 +81,14 @@@ module_param(cifs_max_pending, uint, 04
  MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
                                   "CIFS/SMB1 dialect (N/A for SMB3) "
                                   "Default: 32767 Range: 2 to 32767.");
 +#ifdef CONFIG_CIFS_STATS2
 +unsigned int slow_rsp_threshold = 1;
 +module_param(slow_rsp_threshold, uint, 0644);
 +MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait "
 +                                 "before logging that a response is delayed. "
 +                                 "Default: 1 (if set to 0 disables msg).");
 +#endif /* STATS2 */
 +
  module_param(enable_oplocks, bool, 0644);
  MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
  
@@@ -500,8 -492,6 +500,8 @@@ cifs_show_options(struct seq_file *s, s
                seq_puts(s, ",unix");
        else
                seq_puts(s, ",nounix");
 +      if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
 +              seq_puts(s, ",nodfs");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
                seq_puts(s, ",posixpaths");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@@ -717,14 -707,7 +717,14 @@@ cifs_smb3_do_mount(struct file_system_t
        struct cifs_mnt_data mnt_data;
        struct dentry *root;
  
 -      cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
 +      /*
 +       * Prints in Kernel / CIFS log the attempted mount operation
 +       *      If CIFS_DEBUG && cifs_FYI
 +       */
 +      if (cifsFYI)
 +              cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
 +      else
 +              cifs_info("Attempting to mount %s\n", dev_name);
  
        volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3);
        if (IS_ERR(volume_info))
@@@ -992,8 -975,9 +992,9 @@@ const struct inode_operations cifs_syml
        .listxattr = cifs_listxattr,
  };
  
- static int cifs_clone_file_range(struct file *src_file, loff_t off,
-               struct file *dst_file, loff_t destoff, u64 len)
+ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
+               struct file *dst_file, loff_t destoff, loff_t len,
+               unsigned int remap_flags)
  {
        struct inode *src_inode = file_inode(src_file);
        struct inode *target_inode = file_inode(dst_file);
        unsigned int xid;
        int rc;
  
+       if (remap_flags & ~REMAP_FILE_ADVISORY)
+               return -EINVAL;
        cifs_dbg(FYI, "clone range\n");
  
        xid = get_xid();
        unlock_two_nondirectories(src_inode, target_inode);
  out:
        free_xid(xid);
-       return rc;
+       return rc < 0 ? rc : len;
  }
  
  ssize_t cifs_file_copychunk_range(unsigned int xid,
@@@ -1151,7 -1138,7 +1155,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1170,7 -1157,7 +1174,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1189,7 -1176,7 +1193,7 @@@ const struct file_operations cifs_file_
        .splice_write = iter_file_splice_write,
        .unlocked_ioctl  = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .llseek = cifs_llseek,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@@ -1208,7 -1195,7 +1212,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1226,7 -1213,7 +1230,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1244,7 -1231,7 +1248,7 @@@ const struct file_operations cifs_file_
        .splice_write = iter_file_splice_write,
        .unlocked_ioctl  = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .llseek = cifs_llseek,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@@ -1256,7 -1243,7 +1260,7 @@@ const struct file_operations cifs_dir_o
        .read    = generic_read_dir,
        .unlocked_ioctl  = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .llseek = generic_file_llseek,
        .fsync = cifs_dir_fsync,
  };
@@@ -1435,11 -1422,6 +1439,11 @@@ init_cifs(void
  #ifdef CONFIG_CIFS_STATS2
        atomic_set(&totBufAllocCount, 0);
        atomic_set(&totSmBufAllocCount, 0);
 +      if (slow_rsp_threshold < 1)
 +              cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
 +      else if (slow_rsp_threshold > 32767)
 +              cifs_dbg(VFS,
 +                     "slow response threshold set higher than recommended (0 to 32767)\n");
  #endif /* CONFIG_CIFS_STATS2 */
  
        atomic_set(&midCount, 0);
@@@ -1560,11 -1542,11 +1564,11 @@@ exit_cifs(void
        cifs_proc_clean();
  }
  
 -MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
 +MODULE_AUTHOR("Steve French");
  MODULE_LICENSE("GPL");        /* combination of LGPL + GPL source behaves as GPL */
  MODULE_DESCRIPTION
 -    ("VFS to access servers complying with the SNIA CIFS Specification "
 -     "e.g. Samba and Windows");
 +      ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
 +      "also older servers complying with the SNIA CIFS Specification)");
  MODULE_VERSION(CIFS_VERSION);
  MODULE_SOFTDEP("pre: arc4");
  MODULE_SOFTDEP("pre: des");
diff --combined fs/ioctl.c
@@@ -223,6 -223,7 +223,7 @@@ static long ioctl_file_clone(struct fil
                             u64 off, u64 olen, u64 destoff)
  {
        struct fd src_file = fdget(srcfd);
+       loff_t cloned;
        int ret;
  
        if (!src_file.file)
        ret = -EXDEV;
        if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
                goto fdput;
-       ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+       cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
+                                     olen, 0);
+       if (cloned < 0)
+               ret = cloned;
+       else if (olen && cloned != olen)
+               ret = -EINVAL;
+       else
+               ret = 0;
  fdput:
        fdput(src_file);
        return ret;
@@@ -669,9 -677,6 +677,9 @@@ int do_vfs_ioctl(struct file *filp, uns
                return ioctl_fiemap(filp, arg);
  
        case FIGETBSZ:
 +              /* anon_bdev filesystems may not have a block size */
 +              if (!inode->i_sb->s_blocksize)
 +                      return -EINVAL;
                return put_user(inode->i_sb->s_blocksize, argp);
  
        case FICLONE:
diff --combined fs/nfsd/vfs.c
@@@ -541,8 -541,12 +541,12 @@@ __be32 nfsd4_set_nfs4_label(struct svc_
  __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
                u64 dst_pos, u64 count)
  {
-       return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
-                                            count));
+       loff_t cloned;
+       cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+       if (count && cloned != count)
+               cloned = -EINVAL;
+       return nfserrno(cloned < 0 ? cloned : 0);
  }
  
  ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
@@@ -923,7 -927,7 +927,7 @@@ __be32 nfsd_readv(struct svc_rqst *rqst
        int host_err;
  
        trace_nfsd_read_vector(rqstp, fhp, offset, *count);
 -      iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count);
 +      iov_iter_kvec(&iter, READ, vec, vlen, *count);
        host_err = vfs_iter_read(file, &iter, &offset, 0);
        return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
  }
@@@ -999,7 -1003,7 +1003,7 @@@ nfsd_vfs_write(struct svc_rqst *rqstp, 
        if (stable && !use_wgather)
                flags |= RWF_SYNC;
  
 -      iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt);
 +      iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
        host_err = vfs_iter_write(file, &iter, &pos, flags);
        if (host_err < 0)
                goto out_nfserr;
@@@ -1276,6 -1280,7 +1280,6 @@@ nfsd_create(struct svc_rqst *rqstp, str
                int type, dev_t rdev, struct svc_fh *resfhp)
  {
        struct dentry   *dentry, *dchild = NULL;
 -      struct inode    *dirp;
        __be32          err;
        int             host_err;
  
                return err;
  
        dentry = fhp->fh_dentry;
 -      dirp = d_inode(dentry);
  
        host_err = fh_want_write(fhp);
        if (host_err)
@@@ -1407,7 -1413,6 +1411,7 @@@ do_nfsd_create(struct svc_rqst *rqstp, 
                                        *created = 1;
                                break;
                        }
 +                      /* fall through */
                case NFS4_CREATE_EXCLUSIVE4_1:
                        if (   d_inode(dchild)->i_mtime.tv_sec == v_mtime
                            && d_inode(dchild)->i_atime.tv_sec == v_atime
                                        *created = 1;
                                goto set_attr;
                        }
 -                       /* fallthru */
 +                      /* fall through */
                case NFS3_CREATE_GUARDED:
                        err = nfserr_exist;
                }
diff --combined fs/ocfs2/refcounttree.c
@@@ -4135,6 -4135,7 +4135,6 @@@ static int ocfs2_create_reflink_node(st
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
 -      struct ocfs2_refcount_block *rb;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
        struct ocfs2_refcount_tree *ref_tree;
  
                mlog_errno(ret);
                goto out;
        }
 -      rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
  
        ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
                                          &ref_tree->rf_ci, ref_root_bh,
@@@ -4466,9 -4468,9 +4466,9 @@@ out
  }
  
  /* Update destination inode size, if necessary. */
- static int ocfs2_reflink_update_dest(struct inode *dest,
-                                    struct buffer_head *d_bh,
-                                    loff_t newlen)
+ int ocfs2_reflink_update_dest(struct inode *dest,
+                             struct buffer_head *d_bh,
+                             loff_t newlen)
  {
        handle_t *handle;
        int ret;
@@@ -4505,14 -4507,14 +4505,14 @@@ out_commit
  }
  
  /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
- static int ocfs2_reflink_remap_extent(struct inode *s_inode,
-                                     struct buffer_head *s_bh,
-                                     loff_t pos_in,
-                                     struct inode *t_inode,
-                                     struct buffer_head *t_bh,
-                                     loff_t pos_out,
-                                     loff_t len,
-                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
+ static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+                                        struct buffer_head *s_bh,
+                                        loff_t pos_in,
+                                        struct inode *t_inode,
+                                        struct buffer_head *t_bh,
+                                        loff_t pos_out,
+                                        loff_t len,
+                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
  {
        struct ocfs2_extent_tree s_et;
        struct ocfs2_extent_tree t_et;
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *ref_tree;
        struct ocfs2_super *osb;
+       loff_t remapped_bytes = 0;
        loff_t pstart, plen;
-       u32 p_cluster, num_clusters, slast, spos, tpos;
+       u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
        unsigned int ext_flags;
        int ret = 0;
  
  next_loop:
                spos += num_clusters;
                tpos += num_clusters;
+               remapped_clus += num_clusters;
        }
  
- out:
-       return ret;
+       goto out;
  out_unlock_refcount:
        ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
        brelse(ref_root_bh);
-       return ret;
+ out:
+       remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+       remapped_bytes = min_t(loff_t, len, remapped_bytes);
+       return remapped_bytes > 0 ? remapped_bytes : ret;
  }
  
  /* Set up refcount tree and remap s_inode to t_inode. */
- static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
-                                     struct buffer_head *s_bh,
-                                     loff_t pos_in,
-                                     struct inode *t_inode,
-                                     struct buffer_head *t_bh,
-                                     loff_t pos_out,
-                                     loff_t len)
+ loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+                                 struct buffer_head *s_bh,
+                                 loff_t pos_in,
+                                 struct inode *t_inode,
+                                 struct buffer_head *t_bh,
+                                 loff_t pos_out,
+                                 loff_t len)
  {
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct ocfs2_super *osb;
        struct ocfs2_dinode *dis;
        struct ocfs2_dinode *dit;
-       int ret;
+       loff_t ret;
  
        osb = OCFS2_SB(s_inode->i_sb);
        dis = (struct ocfs2_dinode *)s_bh->b_data;
        /* Actually remap extents now. */
        ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
                                         pos_out, len, &dealloc);
-       if (ret) {
+       if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
  }
  
  /* Lock an inode and grab a bh pointing to the inode. */
- static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
-                                    struct buffer_head **bh1,
-                                    struct inode *t_inode,
-                                    struct buffer_head **bh2)
+ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+                             struct buffer_head **bh1,
+                             struct inode *t_inode,
+                             struct buffer_head **bh2)
  {
        struct inode *inode1;
        struct inode *inode2;
@@@ -4801,10 -4808,10 +4806,10 @@@ out_i1
  }
  
  /* Unlock both inodes and release buffers. */
- static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
-                                       struct buffer_head *s_bh,
-                                       struct inode *t_inode,
-                                       struct buffer_head *t_bh)
+ void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+                                struct buffer_head *s_bh,
+                                struct inode *t_inode,
+                                struct buffer_head *t_bh)
  {
        ocfs2_inode_unlock(s_inode, 1);
        ocfs2_rw_unlock(s_inode, 1);
        }
        unlock_two_nondirectories(s_inode, t_inode);
  }
- /* Link a range of blocks from one file to another. */
- int ocfs2_reflink_remap_range(struct file *file_in,
-                             loff_t pos_in,
-                             struct file *file_out,
-                             loff_t pos_out,
-                             u64 len,
-                             bool is_dedupe)
- {
-       struct inode *inode_in = file_inode(file_in);
-       struct inode *inode_out = file_inode(file_out);
-       struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
-       struct buffer_head *in_bh = NULL, *out_bh = NULL;
-       bool same_inode = (inode_in == inode_out);
-       ssize_t ret;
-       if (!ocfs2_refcount_tree(osb))
-               return -EOPNOTSUPP;
-       if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
-               return -EROFS;
-       /* Lock both files against IO */
-       ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
-       if (ret)
-               return ret;
-       /* Check file eligibility and prepare for block sharing. */
-       ret = -EINVAL;
-       if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
-           (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
-               goto out_unlock;
-       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-                       &len, is_dedupe);
-       if (ret <= 0)
-               goto out_unlock;
-       /* Lock out changes to the allocation maps and remap. */
-       down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
-       if (!same_inode)
-               down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
-                                 SINGLE_DEPTH_NESTING);
-       ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
-                                        out_bh, pos_out, len);
-       /* Zap any page cache for the destination file's range. */
-       if (!ret)
-               truncate_inode_pages_range(&inode_out->i_data, pos_out,
-                                          PAGE_ALIGN(pos_out + len) - 1);
-       up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
-       if (!same_inode)
-               up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_unlock;
-       }
-       /*
-        * Empty the extent map so that we may get the right extent
-        * record from the disk.
-        */
-       ocfs2_extent_map_trunc(inode_in, 0);
-       ocfs2_extent_map_trunc(inode_out, 0);
-       ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_unlock;
-       }
-       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
-       return 0;
- out_unlock:
-       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
-       return ret;
- }
diff --combined fs/overlayfs/copy_up.c
@@@ -125,6 -125,7 +125,7 @@@ static int ovl_copy_up_data(struct pat
        struct file *new_file;
        loff_t old_pos = 0;
        loff_t new_pos = 0;
+       loff_t cloned;
        int error = 0;
  
        if (len == 0)
        }
  
        /* Try to use clone_file_range to clone up within the same fs */
-       error = do_clone_file_range(old_file, 0, new_file, 0, len);
-       if (!error)
+       cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+       if (cloned == len)
                goto out;
        /* Couldn't clone, so now we try to copy the data */
-       error = 0;
  
        /* FIXME: copy up sparse files efficiently */
        while (len) {
@@@ -395,6 -395,7 +395,6 @@@ struct ovl_copy_up_ctx 
        struct dentry *destdir;
        struct qstr destname;
        struct dentry *workdir;
 -      bool tmpfile;
        bool origin;
        bool indexed;
        bool metacopy;
@@@ -439,6 -440,63 +439,6 @@@ static int ovl_link_up(struct ovl_copy_
        return err;
  }
  
 -static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
 -                          struct dentry **newdentry)
 -{
 -      int err;
 -      struct dentry *upper;
 -      struct inode *udir = d_inode(c->destdir);
 -
 -      upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
 -      if (IS_ERR(upper))
 -              return PTR_ERR(upper);
 -
 -      if (c->tmpfile)
 -              err = ovl_do_link(temp, udir, upper);
 -      else
 -              err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
 -
 -      if (!err)
 -              *newdentry = dget(c->tmpfile ? upper : temp);
 -      dput(upper);
 -
 -      return err;
 -}
 -
 -static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
 -{
 -      int err;
 -      struct dentry *temp;
 -      const struct cred *old_creds = NULL;
 -      struct cred *new_creds = NULL;
 -      struct ovl_cattr cattr = {
 -              /* Can't properly set mode on creation because of the umask */
 -              .mode = c->stat.mode & S_IFMT,
 -              .rdev = c->stat.rdev,
 -              .link = c->link
 -      };
 -
 -      err = security_inode_copy_up(c->dentry, &new_creds);
 -      temp = ERR_PTR(err);
 -      if (err < 0)
 -              goto out;
 -
 -      if (new_creds)
 -              old_creds = override_creds(new_creds);
 -
 -      if (c->tmpfile)
 -              temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
 -      else
 -              temp = ovl_create_temp(c->workdir, &cattr);
 -out:
 -      if (new_creds) {
 -              revert_creds(old_creds);
 -              put_cred(new_creds);
 -      }
 -
 -      return temp;
 -}
 -
  static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
  {
        int err;
        return err;
  }
  
 -static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
 +struct ovl_cu_creds {
 +      const struct cred *old;
 +      struct cred *new;
 +};
 +
 +static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc)
 +{
 +      int err;
 +
 +      cc->old = cc->new = NULL;
 +      err = security_inode_copy_up(dentry, &cc->new);
 +      if (err < 0)
 +              return err;
 +
 +      if (cc->new)
 +              cc->old = override_creds(cc->new);
 +
 +      return 0;
 +}
 +
 +static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
 +{
 +      if (cc->new) {
 +              revert_creds(cc->old);
 +              put_cred(cc->new);
 +      }
 +}
 +
 +/*
 + * Copyup using workdir to prepare temp file.  Used when copying up directories,
 + * special files or when upper fs doesn't support O_TMPFILE.
 + */
 +static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
  {
 -      struct inode *udir = c->destdir->d_inode;
        struct inode *inode;
 -      struct dentry *newdentry = NULL;
 -      struct dentry *temp;
 +      struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
 +      struct dentry *temp, *upper;
 +      struct ovl_cu_creds cc;
        int err;
 +      struct ovl_cattr cattr = {
 +              /* Can't properly set mode on creation because of the umask */
 +              .mode = c->stat.mode & S_IFMT,
 +              .rdev = c->stat.rdev,
 +              .link = c->link
 +      };
  
 -      temp = ovl_get_tmpfile(c);
 +      err = ovl_lock_rename_workdir(c->workdir, c->destdir);
 +      if (err)
 +              return err;
 +
 +      err = ovl_prep_cu_creds(c->dentry, &cc);
 +      if (err)
 +              goto unlock;
 +
 +      temp = ovl_create_temp(c->workdir, &cattr);
 +      ovl_revert_cu_creds(&cc);
 +
 +      err = PTR_ERR(temp);
        if (IS_ERR(temp))
 -              return PTR_ERR(temp);
 +              goto unlock;
  
        err = ovl_copy_up_inode(c, temp);
        if (err)
 -              goto out;
 +              goto cleanup;
  
        if (S_ISDIR(c->stat.mode) && c->indexed) {
                err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
                if (err)
 -                      goto out;
 +                      goto cleanup;
        }
  
 -      if (c->tmpfile) {
 -              inode_lock_nested(udir, I_MUTEX_PARENT);
 -              err = ovl_install_temp(c, temp, &newdentry);
 -              inode_unlock(udir);
 -      } else {
 -              err = ovl_install_temp(c, temp, &newdentry);
 -      }
 +      upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
 +      err = PTR_ERR(upper);
 +      if (IS_ERR(upper))
 +              goto cleanup;
 +
 +      err = ovl_do_rename(wdir, temp, udir, upper, 0);
 +      dput(upper);
        if (err)
 -              goto out;
 +              goto cleanup;
  
        if (!c->metacopy)
                ovl_set_upperdata(d_inode(c->dentry));
        inode = d_inode(c->dentry);
 -      ovl_inode_update(inode, newdentry);
 +      ovl_inode_update(inode, temp);
        if (S_ISDIR(inode->i_mode))
                ovl_set_flag(OVL_WHITEOUTS, inode);
 +unlock:
 +      unlock_rename(c->workdir, c->destdir);
  
 -out:
 -      if (err && !c->tmpfile)
 -              ovl_cleanup(d_inode(c->workdir), temp);
 -      dput(temp);
        return err;
  
 +cleanup:
 +      ovl_cleanup(wdir, temp);
 +      dput(temp);
 +      goto unlock;
 +}
 +
 +/* Copyup using O_TMPFILE which does not require cross dir locking */
 +static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
 +{
 +      struct inode *udir = d_inode(c->destdir);
 +      struct dentry *temp, *upper;
 +      struct ovl_cu_creds cc;
 +      int err;
 +
 +      err = ovl_prep_cu_creds(c->dentry, &cc);
 +      if (err)
 +              return err;
 +
 +      temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
 +      ovl_revert_cu_creds(&cc);
 +
 +      if (IS_ERR(temp))
 +              return PTR_ERR(temp);
 +
 +      err = ovl_copy_up_inode(c, temp);
 +      if (err)
 +              goto out_dput;
 +
 +      inode_lock_nested(udir, I_MUTEX_PARENT);
 +
 +      upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
 +      err = PTR_ERR(upper);
 +      if (!IS_ERR(upper)) {
 +              err = ovl_do_link(temp, udir, upper);
 +              dput(upper);
 +      }
 +      inode_unlock(udir);
 +
 +      if (err)
 +              goto out_dput;
 +
 +      if (!c->metacopy)
 +              ovl_set_upperdata(d_inode(c->dentry));
 +      ovl_inode_update(d_inode(c->dentry), temp);
 +
 +      return 0;
 +
 +out_dput:
 +      dput(temp);
 +      return err;
  }
  
  /*
@@@ -685,10 -646,18 +685,10 @@@ static int ovl_do_copy_up(struct ovl_co
        }
  
        /* Should we copyup with O_TMPFILE or with workdir? */
 -      if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
 -              c->tmpfile = true;
 -              err = ovl_copy_up_locked(c);
 -      } else {
 -              err = ovl_lock_rename_workdir(c->workdir, c->destdir);
 -              if (!err) {
 -                      err = ovl_copy_up_locked(c);
 -                      unlock_rename(c->workdir, c->destdir);
 -              }
 -      }
 -
 -
 +      if (S_ISREG(c->stat.mode) && ofs->tmpfile)
 +              err = ovl_copy_up_tmpfile(c);
 +      else
 +              err = ovl_copy_up_workdir(c);
        if (err)
                goto out;
  
diff --combined fs/read_write.c
@@@ -331,7 -331,7 +331,7 @@@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned 
  }
  #endif
  
 -#ifdef __ARCH_WANT_SYS_LLSEEK
 +#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                unsigned long, offset_low, loff_t __user *, result,
                unsigned int, whence)
@@@ -1407,6 -1407,7 +1407,6 @@@ static ssize_t do_sendfile(int out_fd, 
                goto fput_in;
        if (!(out.file->f_mode & FMODE_WRITE))
                goto fput_out;
 -      retval = -EINVAL;
        in_inode = file_inode(in.file);
        out_inode = file_inode(out.file);
        out_pos = out.file->f_pos;
@@@ -1587,11 -1588,15 +1587,15 @@@ ssize_t vfs_copy_file_range(struct fil
         * Try cloning first, this is supported by more file systems, and
         * more efficient if both clone and copy are supported (e.g. NFS).
         */
-       if (file_in->f_op->clone_file_range) {
-               ret = file_in->f_op->clone_file_range(file_in, pos_in,
-                               file_out, pos_out, len);
-               if (ret == 0) {
-                       ret = len;
+       if (file_in->f_op->remap_file_range) {
+               loff_t cloned;
+               cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+                               file_out, pos_out,
+                               min_t(loff_t, MAX_RW_COUNT, len),
+                               REMAP_FILE_CAN_SHORTEN);
+               if (cloned > 0) {
+                       ret = cloned;
                        goto done;
                }
        }
@@@ -1685,11 -1690,12 +1689,12 @@@ out2
        return ret;
  }
  
- static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
+                            bool write)
  {
        struct inode *inode = file_inode(file);
  
-       if (unlikely(pos < 0))
+       if (unlikely(pos < 0 || len < 0))
                return -EINVAL;
  
         if (unlikely((loff_t) (pos + len) < 0))
  
        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
  }
+ /*
+  * Ensure that we don't remap a partial EOF block in the middle of something
+  * else.  Assume that the offsets have already been checked for block
+  * alignment.
+  *
+  * For deduplication we always scale down to the previous block because we
+  * can't meaningfully compare post-EOF contents.
+  *
+  * For clone we only link a partial EOF block above the destination file's EOF.
+  *
+  * Shorten the request if possible.
+  */
+ static int generic_remap_check_len(struct inode *inode_in,
+                                  struct inode *inode_out,
+                                  loff_t pos_out,
+                                  loff_t *len,
+                                  unsigned int remap_flags)
+ {
+       u64 blkmask = i_blocksize(inode_in) - 1;
+       loff_t new_len = *len;
+       if ((*len & blkmask) == 0)
+               return 0;
+       if ((remap_flags & REMAP_FILE_DEDUP) ||
+           pos_out + *len < i_size_read(inode_out))
+               new_len &= ~blkmask;
+       if (new_len == *len)
+               return 0;
+       if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+               *len = new_len;
+               return 0;
+       }
+       return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
+ }
+ /*
+  * Read a page's worth of file data into the page cache.  Return the page
+  * locked.
+  */
+ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+ {
+       struct page *page;
+       page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
+       if (IS_ERR(page))
+               return page;
+       if (!PageUptodate(page)) {
+               put_page(page);
+               return ERR_PTR(-EIO);
+       }
+       lock_page(page);
+       return page;
+ }
+ /*
+  * Compare extents of two files to see if they are the same.
+  * Caller must have locked both inodes to prevent write races.
+  */
+ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+                                        struct inode *dest, loff_t destoff,
+                                        loff_t len, bool *is_same)
+ {
+       loff_t src_poff;
+       loff_t dest_poff;
+       void *src_addr;
+       void *dest_addr;
+       struct page *src_page;
+       struct page *dest_page;
+       loff_t cmp_len;
+       bool same;
+       int error;
+       error = -EINVAL;
+       same = true;
+       while (len) {
+               src_poff = srcoff & (PAGE_SIZE - 1);
+               dest_poff = destoff & (PAGE_SIZE - 1);
+               cmp_len = min(PAGE_SIZE - src_poff,
+                             PAGE_SIZE - dest_poff);
+               cmp_len = min(cmp_len, len);
+               if (cmp_len <= 0)
+                       goto out_error;
+               src_page = vfs_dedupe_get_page(src, srcoff);
+               if (IS_ERR(src_page)) {
+                       error = PTR_ERR(src_page);
+                       goto out_error;
+               }
+               dest_page = vfs_dedupe_get_page(dest, destoff);
+               if (IS_ERR(dest_page)) {
+                       error = PTR_ERR(dest_page);
+                       unlock_page(src_page);
+                       put_page(src_page);
+                       goto out_error;
+               }
+               src_addr = kmap_atomic(src_page);
+               dest_addr = kmap_atomic(dest_page);
+               flush_dcache_page(src_page);
+               flush_dcache_page(dest_page);
+               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+                       same = false;
+               kunmap_atomic(dest_addr);
+               kunmap_atomic(src_addr);
+               unlock_page(dest_page);
+               unlock_page(src_page);
+               put_page(dest_page);
+               put_page(src_page);
+               if (!same)
+                       break;
+               srcoff += cmp_len;
+               destoff += cmp_len;
+               len -= cmp_len;
+       }
+       *is_same = same;
+       return 0;
+ out_error:
+       return error;
+ }
  
  /*
   * Check that the two inodes are eligible for cloning, the ranges make
   * sense, and then flush all dirty data.  Caller must ensure that the
   * inodes have been locked against any other modifications.
   *
-  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
-  * the usual negative error code.
+  * If there's an error, then the usual negative error code is returned.
+  * Otherwise returns 0 with *len set to the request length.
   */
- int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-                              struct inode *inode_out, loff_t pos_out,
-                              u64 *len, bool is_dedupe)
+ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+                                 struct file *file_out, loff_t pos_out,
+                                 loff_t *len, unsigned int remap_flags)
  {
-       loff_t bs = inode_out->i_sb->s_blocksize;
-       loff_t blen;
-       loff_t isize;
+       struct inode *inode_in = file_inode(file_in);
+       struct inode *inode_out = file_inode(file_out);
        bool same_inode = (inode_in == inode_out);
        int ret;
  
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;
  
-       /* Are we going all the way to the end? */
-       isize = i_size_read(inode_in);
-       if (isize == 0)
-               return 0;
        /* Zero length dedupe exits immediately; reflink goes to EOF. */
        if (*len == 0) {
-               if (is_dedupe || pos_in == isize)
+               loff_t isize = i_size_read(inode_in);
+               if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
                        return 0;
                if (pos_in > isize)
                        return -EINVAL;
                *len = isize - pos_in;
+               if (*len == 0)
+                       return 0;
        }
  
-       /* Ensure offsets don't wrap and the input is inside i_size */
-       if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
-           pos_in + *len > isize)
-               return -EINVAL;
-       /* Don't allow dedupe past EOF in the dest file */
-       if (is_dedupe) {
-               loff_t  disize;
-               disize = i_size_read(inode_out);
-               if (pos_out >= disize || pos_out + *len > disize)
-                       return -EINVAL;
-       }
-       /* If we're linking to EOF, continue to the block boundary. */
-       if (pos_in + *len == isize)
-               blen = ALIGN(isize, bs) - pos_in;
-       else
-               blen = *len;
-       /* Only reflink if we're aligned to block boundaries */
-       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-               return -EINVAL;
-       /* Don't allow overlapped reflink within the same file */
-       if (same_inode) {
-               if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-                       return -EINVAL;
-       }
+       /* Check that we don't violate system file offset limits. */
+       ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+                       remap_flags);
+       if (ret)
+               return ret;
  
        /* Wait for the completion of any pending IOs on both files */
        inode_dio_wait(inode_in);
        /*
         * Check that the extents are the same.
         */
-       if (is_dedupe) {
+       if (remap_flags & REMAP_FILE_DEDUP) {
                bool            is_same = false;
  
                ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
                        return -EBADE;
        }
  
-       return 1;
+       ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+                       remap_flags);
+       if (ret)
+               return ret;
+
+       /* A dedupe can't alter the file contents, so we're done. */
+       if (!(remap_flags & REMAP_FILE_DEDUP)) {
+               /* Update the timestamps, since we can alter file contents. */
+               if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+                       ret = file_update_time(file_out);
+                       if (ret)
+                               return ret;
+               }
+               /*
+                * Clear the security bits if the process is not being run by
+                * root.  This keeps people from modifying setuid and setgid
+                * binaries.
+                */
+               ret = file_remove_privs(file_out);
+               if (ret)
+                       return ret;
+       }
+       return 0;
  }
- EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+ EXPORT_SYMBOL(generic_remap_file_range_prep);
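
A minimal sketch of how a filesystem's ->remap_file_range method might
drive this helper under the new contract (negative errno on error,
otherwise 0 with *len possibly shortened).  The myfs_* names are
hypothetical and the required inode locking is elided:

	static loff_t myfs_remap_file_range(struct file *file_in, loff_t pos_in,
					    struct file *file_out, loff_t pos_out,
					    loff_t len, unsigned int remap_flags)
	{
		loff_t ret;

		/* Validate the request and flush dirty data; may trim len. */
		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
						    pos_out, &len, remap_flags);
		if (ret < 0 || len == 0)
			return ret;

		/* myfs_remap_extents() stands in for the fs-specific remap. */
		ret = myfs_remap_extents(file_in, pos_in, file_out, pos_out, len);
		if (ret < 0)
			return ret;

		return len;	/* bytes remapped, per the byte-count contract */
	}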
  
- int do_clone_file_range(struct file *file_in, loff_t pos_in,
-                       struct file *file_out, loff_t pos_out, u64 len)
+ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+                          struct file *file_out, loff_t pos_out,
+                          loff_t len, unsigned int remap_flags)
  {
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);
-       int ret;
+       loff_t ret;
+
+       WARN_ON_ONCE(remap_flags);
  
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
            (file_out->f_flags & O_APPEND))
                return -EBADF;
  
-       if (!file_in->f_op->clone_file_range)
+       if (!file_in->f_op->remap_file_range)
                return -EOPNOTSUPP;
  
-       ret = clone_verify_area(file_in, pos_in, len, false);
+       ret = remap_verify_area(file_in, pos_in, len, false);
        if (ret)
                return ret;
  
-       ret = clone_verify_area(file_out, pos_out, len, true);
+       ret = remap_verify_area(file_out, pos_out, len, true);
        if (ret)
                return ret;
  
-       if (pos_in + len > i_size_read(inode_in))
-               return -EINVAL;
-       ret = file_in->f_op->clone_file_range(file_in, pos_in,
-                       file_out, pos_out, len);
-       if (!ret) {
-               fsnotify_access(file_in);
-               fsnotify_modify(file_out);
-       }
+       ret = file_in->f_op->remap_file_range(file_in, pos_in,
+                       file_out, pos_out, len, remap_flags);
+       if (ret < 0)
+               return ret;
  
+       fsnotify_access(file_in);
+       fsnotify_modify(file_out);
        return ret;
  }
  EXPORT_SYMBOL(do_clone_file_range);
  
- int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-                        struct file *file_out, loff_t pos_out, u64 len)
+ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+                           struct file *file_out, loff_t pos_out,
+                           loff_t len, unsigned int remap_flags)
  {
-       int ret;
+       loff_t ret;
  
        file_start_write(file_out);
-       ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+       ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
+                                 remap_flags);
        file_end_write(file_out);
  
        return ret;
  }
  EXPORT_SYMBOL(vfs_clone_file_range);
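
With the byte-count return convention, a caller that cannot accept a
short clone now checks the count itself.  Roughly (a sketch; off,
destoff, and olen are hypothetical locals):

	loff_t cloned;

	cloned = vfs_clone_file_range(src_file, off, dst_file, destoff,
				      olen, 0);
	if (cloned < 0)
		return cloned;		/* error */
	if (olen && cloned != olen)
		return -EINVAL;		/* short clone not tolerated here */
	return 0;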
  
- /*
-  * Read a page's worth of file data into the page cache.  Return the page
-  * locked.
-  */
- static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
- {
-       struct address_space *mapping;
-       struct page *page;
-       pgoff_t n;
-       n = offset >> PAGE_SHIFT;
-       mapping = inode->i_mapping;
-       page = read_mapping_page(mapping, n, NULL);
-       if (IS_ERR(page))
-               return page;
-       if (!PageUptodate(page)) {
-               put_page(page);
-               return ERR_PTR(-EIO);
-       }
-       lock_page(page);
-       return page;
- }
- /*
-  * Compare extents of two files to see if they are the same.
-  * Caller must have locked both inodes to prevent write races.
-  */
- int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
-                                 struct inode *dest, loff_t destoff,
-                                 loff_t len, bool *is_same)
- {
-       loff_t src_poff;
-       loff_t dest_poff;
-       void *src_addr;
-       void *dest_addr;
-       struct page *src_page;
-       struct page *dest_page;
-       loff_t cmp_len;
-       bool same;
-       int error;
-       error = -EINVAL;
-       same = true;
-       while (len) {
-               src_poff = srcoff & (PAGE_SIZE - 1);
-               dest_poff = destoff & (PAGE_SIZE - 1);
-               cmp_len = min(PAGE_SIZE - src_poff,
-                             PAGE_SIZE - dest_poff);
-               cmp_len = min(cmp_len, len);
-               if (cmp_len <= 0)
-                       goto out_error;
-               src_page = vfs_dedupe_get_page(src, srcoff);
-               if (IS_ERR(src_page)) {
-                       error = PTR_ERR(src_page);
-                       goto out_error;
-               }
-               dest_page = vfs_dedupe_get_page(dest, destoff);
-               if (IS_ERR(dest_page)) {
-                       error = PTR_ERR(dest_page);
-                       unlock_page(src_page);
-                       put_page(src_page);
-                       goto out_error;
-               }
-               src_addr = kmap_atomic(src_page);
-               dest_addr = kmap_atomic(dest_page);
-               flush_dcache_page(src_page);
-               flush_dcache_page(dest_page);
-               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
-                       same = false;
-               kunmap_atomic(dest_addr);
-               kunmap_atomic(src_addr);
-               unlock_page(dest_page);
-               unlock_page(src_page);
-               put_page(dest_page);
-               put_page(src_page);
-               if (!same)
-                       break;
-               srcoff += cmp_len;
-               destoff += cmp_len;
-               len -= cmp_len;
-       }
-       *is_same = same;
-       return 0;
- out_error:
-       return error;
- }
- EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
 +/* Check whether we are allowed to dedupe the destination file */
 +static bool allow_file_dedupe(struct file *file)
 +{
 +      if (capable(CAP_SYS_ADMIN))
 +              return true;
 +      if (file->f_mode & FMODE_WRITE)
 +              return true;
 +      if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
 +              return true;
 +      if (!inode_permission(file_inode(file), MAY_WRITE))
 +              return true;
 +      return false;
 +}
 +
- int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
-                             struct file *dst_file, loff_t dst_pos, u64 len)
+ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+                                struct file *dst_file, loff_t dst_pos,
+                                loff_t len, unsigned int remap_flags)
  {
-       s64 ret;
+       loff_t ret;
+
+       WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+                                    REMAP_FILE_CAN_SHORTEN));
  
        ret = mnt_want_write_file(dst_file);
        if (ret)
                return ret;
  
-       ret = clone_verify_area(dst_file, dst_pos, len, true);
+       ret = remap_verify_area(dst_file, dst_pos, len, true);
        if (ret < 0)
                goto out_drop_write;
  
 -      ret = -EINVAL;
 -      if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
 +      ret = -EPERM;
 +      if (!allow_file_dedupe(dst_file))
                goto out_drop_write;
  
        ret = -EXDEV;
                goto out_drop_write;
  
        ret = -EINVAL;
-       if (!dst_file->f_op->dedupe_file_range)
+       if (!dst_file->f_op->remap_file_range)
                goto out_drop_write;
  
-       ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
-                                               dst_file, dst_pos, len);
+       if (len == 0) {
+               ret = 0;
+               goto out_drop_write;
+       }
+       ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
+                       dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
  out_drop_write:
        mnt_drop_write_file(dst_file);
  
@@@ -2037,7 -2070,7 +2083,7 @@@ int vfs_dedupe_file_range(struct file *
        int i;
        int ret;
        u16 count = same->dest_count;
-       int deduped;
+       loff_t deduped;
  
        if (!(file->f_mode & FMODE_READ))
                return -EINVAL;
        if (!S_ISREG(src->i_mode))
                goto out;
  
-       ret = clone_verify_area(file, off, len, false);
+       ret = remap_verify_area(file, off, len, false);
        if (ret < 0)
                goto out;
        ret = 0;
                }
  
                deduped = vfs_dedupe_file_range_one(file, off, dst_file,
-                                                   info->dest_offset, len);
+                                                   info->dest_offset, len,
+                                                   REMAP_FILE_CAN_SHORTEN);
                if (deduped == -EBADE)
                        info->status = FILE_DEDUPE_RANGE_DIFFERS;
                else if (deduped < 0)
diff --combined fs/xfs/xfs_reflink.c
@@@ -182,7 -182,8 +182,7 @@@ in
  xfs_reflink_trim_around_shared(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *irec,
 -      bool                    *shared,
 -      bool                    *trimmed)
 +      bool                    *shared)
  {
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
        if (error)
                return error;
  
 -      *shared = *trimmed = false;
 +      *shared = false;
        if (fbno == NULLAGBLOCK) {
                /* No shared blocks at all. */
                return 0;
                 */
                irec->br_blockcount = flen;
                *shared = true;
 -              if (flen != aglen)
 -                      *trimmed = true;
                return 0;
        } else {
                /*
                 * start of the shared region.
                 */
                irec->br_blockcount = fbno - agbno;
 -              *trimmed = true;
                return 0;
        }
  }
  /*
   * Trim the passed in imap to the next shared/unshared extent boundary, and
   * if imap->br_startoff points to a shared extent reserve space for it in the
 - * COW fork.  In this case *shared is set to true, else to false.
 + * COW fork.
   *
   * Note that imap will always contain the block numbers for the existing blocks
   * in the data fork, as the upper layers need them for read-modify-write
  int
  xfs_reflink_reserve_cow(
        struct xfs_inode        *ip,
 -      struct xfs_bmbt_irec    *imap,
 -      bool                    *shared)
 +      struct xfs_bmbt_irec    *imap)
  {
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
        struct xfs_bmbt_irec    got;
        int                     error = 0;
 -      bool                    eof = false, trimmed;
 +      bool                    eof = false;
        struct xfs_iext_cursor  icur;
 +      bool                    shared;
  
        /*
         * Search the COW fork extent list first.  This serves two purposes:
        if (!eof && got.br_startoff <= imap->br_startoff) {
                trace_xfs_reflink_cow_found(ip, imap);
                xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
 -
 -              *shared = true;
                return 0;
        }
  
        /* Trim the mapping to the nearest shared extent boundary. */
 -      error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
 +      error = xfs_reflink_trim_around_shared(ip, imap, &shared);
        if (error)
                return error;
  
        /* Not shared?  Just report the (potentially capped) extent. */
 -      if (!*shared)
 +      if (!shared)
                return 0;
  
        /*
@@@ -362,6 -368,7 +362,6 @@@ xfs_find_trim_cow_extent
        xfs_filblks_t           count_fsb = imap->br_blockcount;
        struct xfs_iext_cursor  icur;
        struct xfs_bmbt_irec    got;
 -      bool                    trimmed;
  
        *found = false;
  
         * If we don't find an overlapping extent, trim the range we need to
         * allocate to fit the hole we found.
         */
 -      if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
 -          got.br_startoff > offset_fsb)
 -              return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
 +      if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
 +              got.br_startoff = offset_fsb + count_fsb;
 +      if (got.br_startoff > offset_fsb) {
 +              xfs_trim_extent(imap, imap->br_startoff,
 +                              got.br_startoff - imap->br_startoff);
 +              return xfs_reflink_trim_around_shared(ip, imap, shared);
 +      }
  
        *shared = true;
        if (isnullstartblock(got.br_startblock)) {
@@@ -913,18 -916,18 +913,18 @@@ out_error
  /*
   * Update destination inode size & cowextsize hint, if necessary.
   */
- STATIC int
+ int
  xfs_reflink_update_dest(
        struct xfs_inode        *dest,
        xfs_off_t               newlen,
        xfs_extlen_t            cowextsize,
-       bool                    is_dedupe)
+       unsigned int            remap_flags)
  {
        struct xfs_mount        *mp = dest->i_mount;
        struct xfs_trans        *tp;
        int                     error;
  
-       if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+       if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
                return 0;
  
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
                dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
        }
  
-       if (!is_dedupe) {
-               xfs_trans_ichgtime(tp, dest,
-                                  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       }
        xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
  
        error = xfs_trans_commit(tp);
  /*
   * Iteratively remap one file's extents (and holes) to another's.
   */
- STATIC int
+ int
  xfs_reflink_remap_blocks(
        struct xfs_inode        *src,
-       xfs_fileoff_t           srcoff,
+       loff_t                  pos_in,
        struct xfs_inode        *dest,
-       xfs_fileoff_t           destoff,
-       xfs_filblks_t           len,
-       xfs_off_t               new_isize)
+       loff_t                  pos_out,
+       loff_t                  remap_len,
+       loff_t                  *remapped)
  {
        struct xfs_bmbt_irec    imap;
+       xfs_fileoff_t           srcoff;
+       xfs_fileoff_t           destoff;
+       xfs_filblks_t           len;
+       xfs_filblks_t           range_len;
+       xfs_filblks_t           remapped_len = 0;
+       xfs_off_t               new_isize = pos_out + remap_len;
        int                     nimaps;
        int                     error = 0;
-       xfs_filblks_t           range_len;
+
+       destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
+       srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
+       len = XFS_B_TO_FSB(src->i_mount, remap_len);
  
        /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
        while (len) {
                error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
                xfs_iunlock(src, lock_mode);
                if (error)
-                       goto err;
+                       break;
                ASSERT(nimaps == 1);
  
                trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
                error = xfs_reflink_remap_extent(dest, &imap, destoff,
                                new_isize);
                if (error)
-                       goto err;
+                       break;
  
                if (fatal_signal_pending(current)) {
                        error = -EINTR;
-                       goto err;
+                       break;
                }
  
                /* Advance drange/srange */
                srcoff += range_len;
                destoff += range_len;
                len -= range_len;
+               remapped_len += range_len;
        }
  
-       return 0;
- err:
-       trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+       if (error)
+               trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+       *remapped = min_t(loff_t, remap_len,
+                         XFS_FSB_TO_B(src->i_mount, remapped_len));
        return error;
  }
  
@@@ -1218,7 -1227,7 +1224,7 @@@ retry
  }
  
  /* Unlock both inodes after they've been prepped for a range clone. */
- STATIC void
+ void
  xfs_reflink_remap_unlock(
        struct file             *file_in,
        struct file             *file_out)
@@@ -1286,21 -1295,20 +1292,20 @@@ xfs_reflink_zero_posteof
   * stale data in the destination file. Hence we reject these clone attempts with
   * -EINVAL in this case.
   */
- STATIC int
+ int
  xfs_reflink_remap_prep(
        struct file             *file_in,
        loff_t                  pos_in,
        struct file             *file_out,
        loff_t                  pos_out,
-       u64                     *len,
-       bool                    is_dedupe)
+       loff_t                  *len,
+       unsigned int            remap_flags)
  {
        struct inode            *inode_in = file_inode(file_in);
        struct xfs_inode        *src = XFS_I(inode_in);
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        bool                    same_inode = (inode_in == inode_out);
-       u64                     blkmask = i_blocksize(inode_in) - 1;
        ssize_t                 ret;
  
        /* Lock both files against IO */
        if (IS_DAX(inode_in) || IS_DAX(inode_out))
                goto out_unlock;
  
-       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-                       len, is_dedupe);
-       if (ret <= 0)
+       ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+                       len, remap_flags);
+       if (ret < 0 || *len == 0)
                goto out_unlock;
  
-       /*
-        * If the dedupe data matches, chop off the partial EOF block
-        * from the source file so we don't try to dedupe the partial
-        * EOF block.
-        */
-       if (is_dedupe) {
-               *len &= ~blkmask;
-       } else if (*len & blkmask) {
-               /*
-                * The user is attempting to share a partial EOF block,
-                * if it's inside the destination EOF then reject it.
-                */
-               if (pos_out + *len < i_size_read(inode_out)) {
-                       ret = -EINVAL;
-                       goto out_unlock;
-               }
-       }
        /* Attach dquots to dest inode before changing block map */
        ret = xfs_qm_dqattach(dest);
        if (ret)
                goto out_unlock;
  
        /* Zap any page cache for the destination file's range. */
-       truncate_inode_pages_range(&inode_out->i_data, pos_out,
-                                  PAGE_ALIGN(pos_out + *len) - 1);
-       /* If we're altering the file contents... */
-       if (!is_dedupe) {
-               /*
-                * ...update the timestamps (which will grab the ilock again
-                * from xfs_fs_dirty_inode, so we have to call it before we
-                * take the ilock).
-                */
-               if (!(file_out->f_mode & FMODE_NOCMTIME)) {
-                       ret = file_update_time(file_out);
-                       if (ret)
-                               goto out_unlock;
-               }
-               /*
-                * ...clear the security bits if the process is not being run
-                * by root.  This keeps people from modifying setuid and setgid
-                * binaries.
-                */
-               ret = file_remove_privs(file_out);
-               if (ret)
-                       goto out_unlock;
-       }
+       truncate_inode_pages_range(&inode_out->i_data,
+                       round_down(pos_out, PAGE_SIZE),
+                       round_up(pos_out + *len, PAGE_SIZE) - 1);
  
        return 1;
  out_unlock:
  }
  
  /*
-  * Link a range of blocks from one file to another.
-  */
- int
- xfs_reflink_remap_range(
-       struct file             *file_in,
-       loff_t                  pos_in,
-       struct file             *file_out,
-       loff_t                  pos_out,
-       u64                     len,
-       bool                    is_dedupe)
- {
-       struct inode            *inode_in = file_inode(file_in);
-       struct xfs_inode        *src = XFS_I(inode_in);
-       struct inode            *inode_out = file_inode(file_out);
-       struct xfs_inode        *dest = XFS_I(inode_out);
-       struct xfs_mount        *mp = src->i_mount;
-       xfs_fileoff_t           sfsbno, dfsbno;
-       xfs_filblks_t           fsblen;
-       xfs_extlen_t            cowextsize;
-       ssize_t                 ret;
-       if (!xfs_sb_version_hasreflink(&mp->m_sb))
-               return -EOPNOTSUPP;
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
-       /* Prepare and then clone file data. */
-       ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
-                       &len, is_dedupe);
-       if (ret <= 0)
-               return ret;
-       trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-       dfsbno = XFS_B_TO_FSBT(mp, pos_out);
-       sfsbno = XFS_B_TO_FSBT(mp, pos_in);
-       fsblen = XFS_B_TO_FSB(mp, len);
-       ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
-                       pos_out + len);
-       if (ret)
-               goto out_unlock;
-       /*
-        * Carry the cowextsize hint from src to dest if we're sharing the
-        * entire source file to the entire destination file, the source file
-        * has a cowextsize hint, and the destination file does not.
-        */
-       cowextsize = 0;
-       if (pos_in == 0 && len == i_size_read(inode_in) &&
-           (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
-           pos_out == 0 && len >= i_size_read(inode_out) &&
-           !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
-               cowextsize = src->i_d.di_cowextsize;
-       ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
-                       is_dedupe);
- out_unlock:
-       xfs_reflink_remap_unlock(file_in, file_out);
-       if (ret)
-               trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
-       return ret;
- }
- /*
   * The user wants to preemptively CoW all shared blocks in this file,
   * which enables us to turn off the reflink flag.  Iterate all
   * extents which are not prealloc/delalloc to see which ranges are
diff --combined fs/xfs/xfs_reflink.h
@@@ -10,10 -10,10 +10,10 @@@ extern int xfs_reflink_find_shared(stru
                xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
                xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 -              struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
 +              struct xfs_bmbt_irec *irec, bool *shared);
  
  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
 -              struct xfs_bmbt_irec *imap, bool *shared);
 +              struct xfs_bmbt_irec *imap);
  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
                struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
@@@ -27,13 -27,24 +27,24 @@@ extern int xfs_reflink_cancel_cow_range
  extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
  extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
- extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
-               struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+ extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+               struct file *file_out, loff_t pos_out, loff_t len,
+               unsigned int remap_flags);
  extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
                struct xfs_inode *ip, bool *has_shared);
  extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
                struct xfs_trans **tpp);
  extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t len);
+ extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
+               struct file *file_out, loff_t pos_out, loff_t *len,
+               unsigned int remap_flags);
+ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
+               struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
+               loff_t *remapped);
+ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
+               xfs_extlen_t cowextsize, unsigned int remap_flags);
+ extern void xfs_reflink_remap_unlock(struct file *file_in,
+               struct file *file_out);
  
  #endif /* __XFS_REFLINK_H */
diff --combined include/linux/fs.h
@@@ -403,40 -403,24 +403,40 @@@ int pagecache_write_end(struct file *, 
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);
  
 +/**
 + * struct address_space - Contents of a cacheable, mappable object.
 + * @host: Owner, either the inode or the block_device.
 + * @i_pages: Cached pages.
 + * @gfp_mask: Memory allocation flags to use for allocating pages.
 + * @i_mmap_writable: Number of VM_SHARED mappings.
 + * @i_mmap: Tree of private and shared mappings.
 + * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 + * @nrpages: Number of page entries, protected by the i_pages lock.
 + * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
 + * @writeback_index: Writeback starts here.
 + * @a_ops: Methods.
 + * @flags: Error bits and flags (AS_*).
 + * @wb_err: The most recent error which has occurred.
 + * @private_lock: For use by the owner of the address_space.
 + * @private_list: For use by the owner of the address_space.
 + * @private_data: For use by the owner of the address_space.
 + */
  struct address_space {
 -      struct inode            *host;          /* owner: inode, block_device */
 -      struct radix_tree_root  i_pages;        /* cached pages */
 -      atomic_t                i_mmap_writable;/* count VM_SHARED mappings */
 -      struct rb_root_cached   i_mmap;         /* tree of private and shared mappings */
 -      struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
 -      /* Protected by the i_pages lock */
 -      unsigned long           nrpages;        /* number of total pages */
 -      /* number of shadow or DAX exceptional entries */
 +      struct inode            *host;
 +      struct xarray           i_pages;
 +      gfp_t                   gfp_mask;
 +      atomic_t                i_mmap_writable;
 +      struct rb_root_cached   i_mmap;
 +      struct rw_semaphore     i_mmap_rwsem;
 +      unsigned long           nrpages;
        unsigned long           nrexceptional;
 -      pgoff_t                 writeback_index;/* writeback starts here */
 -      const struct address_space_operations *a_ops;   /* methods */
 -      unsigned long           flags;          /* error bits */
 -      spinlock_t              private_lock;   /* for use by the address_space */
 -      gfp_t                   gfp_mask;       /* implicit gfp mask for allocations */
 -      struct list_head        private_list;   /* for use by the address_space */
 -      void                    *private_data;  /* ditto */
 +      pgoff_t                 writeback_index;
 +      const struct address_space_operations *a_ops;
 +      unsigned long           flags;
        errseq_t                wb_err;
 +      spinlock_t              private_lock;
 +      struct list_head        private_list;
 +      void                    *private_data;
  } __attribute__((aligned(sizeof(long)))) __randomize_layout;
        /*
         * On most architectures that alignment is already the case; but
@@@ -483,18 -467,15 +483,18 @@@ struct block_device 
        struct mutex            bd_fsfreeze_mutex;
  } __randomize_layout;
  
 +/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
 +#define PAGECACHE_TAG_DIRTY   XA_MARK_0
 +#define PAGECACHE_TAG_WRITEBACK       XA_MARK_1
 +#define PAGECACHE_TAG_TOWRITE XA_MARK_2
 +
  /*
 - * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
 - * radix trees
 + * Returns true if any of the pages in the mapping are marked with the tag.
   */
 -#define PAGECACHE_TAG_DIRTY   0
 -#define PAGECACHE_TAG_WRITEBACK       1
 -#define PAGECACHE_TAG_TOWRITE 2
 -
 -int mapping_tagged(struct address_space *mapping, int tag);
 +static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
 +{
 +      return xa_marked(&mapping->i_pages, tag);
 +}
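
With the marks stored in the XArray, the test is a lock-free flag
check.  As a sketch of a typical (hypothetical) caller, a writeback
path can bail out early when a mapping has no dirty pages:

	if (!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;	/* nothing dirty, nothing to write */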
  
  static inline void i_mmap_lock_write(struct address_space *mapping)
  {
@@@ -1412,26 -1393,17 +1412,26 @@@ struct super_block 
  
        struct sb_writers       s_writers;
  
 +      /*
 +       * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
 +       * s_fsnotify_marks together for cache efficiency. They are frequently
 +       * accessed and rarely modified.
 +       */
 +      void                    *s_fs_info;     /* Filesystem private info */
 +
 +      /* Granularity of c/m/atime in ns (cannot be worse than a second) */
 +      u32                     s_time_gran;
 +#ifdef CONFIG_FSNOTIFY
 +      __u32                   s_fsnotify_mask;
 +      struct fsnotify_mark_connector __rcu    *s_fsnotify_marks;
 +#endif
 +
        char                    s_id[32];       /* Informational name */
        uuid_t                  s_uuid;         /* UUID */
  
 -      void                    *s_fs_info;     /* Filesystem private info */
        unsigned int            s_max_links;
        fmode_t                 s_mode;
  
 -      /* Granularity of c/m/atime in ns.
 -         Cannot be worse than a second */
 -      u32                s_time_gran;
 -
        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;
  
 +      /* Pending fsnotify inode refs */
 +      atomic_long_t s_fsnotify_inode_refs;
 +
        /* Being remounted read-only */
        int s_readonly_remount;
  
@@@ -1752,6 -1721,25 +1752,25 @@@ struct block_device_operations
  #define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
  
+ /*
+  * These flags control the behavior of the remap_file_range function pointer.
+  * If it is called with len == 0 that means "remap to the end of the source
+  * file".  See Documentation/filesystems/vfs.txt for more details about this
+  * call.
+  *
+  * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
+  * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
+  */
+ #define REMAP_FILE_DEDUP              (1 << 0)
+ #define REMAP_FILE_CAN_SHORTEN                (1 << 1)
+ /*
+  * These flags signal that the caller is ok with altering various aspects of
+  * the behavior of the remap operation.  The changes must be made by the
+  * implementation; the vfs remap helper functions can take advantage of them.
+  * Flags in this category exist to preserve the quirky behavior of the hoisted
+  * btrfs clone/dedupe ioctls.
+  */
+ #define REMAP_FILE_ADVISORY           (REMAP_FILE_CAN_SHORTEN)
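
An implementation is expected to reject flags it does not understand,
so a converted ->remap_file_range method would typically begin with a
check along these lines (a sketch of the usual pattern):

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;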
  
  struct iov_iter;
  
@@@ -1790,10 -1778,9 +1809,9 @@@ struct file_operations 
  #endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
-       int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
-                       u64);
-       int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
-                       u64);
+       loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+                                  struct file *file_out, loff_t pos_out,
+                                  loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
  } __randomize_layout;
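
Converted filesystems now wire up a single method where two existed
before.  A sketch, with myfs_* as hypothetical names:

	const struct file_operations myfs_file_operations = {
		/* read/write/mmap methods elided */
		.remap_file_range	= myfs_remap_file_range, /* clone + dedupe */
	};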
  
@@@ -1856,21 -1843,21 +1874,21 @@@ extern ssize_t vfs_readv(struct file *
                unsigned long, loff_t *, rwf_t);
  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
- extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-                                     struct inode *inode_out, loff_t pos_out,
-                                     u64 *len, bool is_dedupe);
- extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
-                              struct file *file_out, loff_t pos_out, u64 len);
- extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-                               struct file *file_out, loff_t pos_out, u64 len);
- extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
-                                        struct inode *dest, loff_t destoff,
-                                        loff_t len, bool *is_same);
+ extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+                                        struct file *file_out, loff_t pos_out,
+                                        loff_t *count,
+                                        unsigned int remap_flags);
+ extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+                                 struct file *file_out, loff_t pos_out,
+                                 loff_t len, unsigned int remap_flags);
+ extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+                                  struct file *file_out, loff_t pos_out,
+                                  loff_t len, unsigned int remap_flags);
  extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
- extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
-                                    struct file *dst_file, loff_t dst_pos,
-                                    u64 len);
+ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+                                       struct file *dst_file, loff_t dst_pos,
+                                       loff_t len, unsigned int remap_flags);
  
  
  struct super_operations {
@@@ -2998,6 -2985,9 +3016,9 @@@ extern int sb_min_blocksize(struct supe
  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+ extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
+                               struct file *file_out, loff_t pos_out,
+                               loff_t *count, unsigned int remap_flags);
  extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
  extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
  extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --combined mm/filemap.c
@@@ -36,8 -36,6 +36,8 @@@
  #include <linux/cleancache.h>
  #include <linux/shmem_fs.h>
  #include <linux/rmap.h>
 +#include <linux/delayacct.h>
 +#include <linux/psi.h>
  #include "internal.h"
  
  #define CREATE_TRACE_POINTS
   *   ->tasklist_lock            (memory_failure, collect_procs_ao)
   */
  
 -static int page_cache_tree_insert(struct address_space *mapping,
 -                                struct page *page, void **shadowp)
 -{
 -      struct radix_tree_node *node;
 -      void **slot;
 -      int error;
 -
 -      error = __radix_tree_create(&mapping->i_pages, page->index, 0,
 -                                  &node, &slot);
 -      if (error)
 -              return error;
 -      if (*slot) {
 -              void *p;
 -
 -              p = radix_tree_deref_slot_protected(slot,
 -                                                  &mapping->i_pages.xa_lock);
 -              if (!radix_tree_exceptional_entry(p))
 -                      return -EEXIST;
 -
 -              mapping->nrexceptional--;
 -              if (shadowp)
 -                      *shadowp = p;
 -      }
 -      __radix_tree_replace(&mapping->i_pages, node, slot, page,
 -                           workingset_lookup_update(mapping));
 -      mapping->nrpages++;
 -      return 0;
 -}
 -
 -static void page_cache_tree_delete(struct address_space *mapping,
 +static void page_cache_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
  {
 -      int i, nr;
 +      XA_STATE(xas, &mapping->i_pages, page->index);
 +      unsigned int nr = 1;
 +
 +      mapping_set_update(&xas, mapping);
  
 -      /* hugetlb pages are represented by one entry in the radix tree */
 -      nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
 +      /* hugetlb pages are represented by a single entry in the xarray */
 +      if (!PageHuge(page)) {
 +              xas_set_order(&xas, page->index, compound_order(page));
 +              nr = 1U << compound_order(page);
 +      }
  
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(nr != 1 && shadow, page);
  
 -      for (i = 0; i < nr; i++) {
 -              struct radix_tree_node *node;
 -              void **slot;
 -
 -              __radix_tree_lookup(&mapping->i_pages, page->index + i,
 -                                  &node, &slot);
 -
 -              VM_BUG_ON_PAGE(!node && nr != 1, page);
 -
 -              radix_tree_clear_tags(&mapping->i_pages, node, slot);
 -              __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
 -                              workingset_lookup_update(mapping));
 -      }
 +      xas_store(&xas, shadow);
 +      xas_init_marks(&xas);
  
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
@@@ -231,7 -263,7 +231,7 @@@ void __delete_from_page_cache(struct pa
        trace_mm_filemap_delete_from_page_cache(page);
  
        unaccount_page_cache_page(mapping, page);
 -      page_cache_tree_delete(mapping, page, shadow);
 +      page_cache_delete(mapping, page, shadow);
  }
  
  static void page_cache_free_page(struct address_space *mapping,
@@@ -274,7 -306,7 +274,7 @@@ void delete_from_page_cache(struct pag
  EXPORT_SYMBOL(delete_from_page_cache);
  
  /*
 - * page_cache_tree_delete_batch - delete several pages from page cache
 + * page_cache_delete_batch - delete several pages from page cache
   * @mapping: the mapping to which pages belong
   * @pvec: pagevec with pages to delete
   *
   *
   * The function expects the i_pages lock to be held.
   */
 -static void
 -page_cache_tree_delete_batch(struct address_space *mapping,
 +static void page_cache_delete_batch(struct address_space *mapping,
                             struct pagevec *pvec)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
        int total_pages = 0;
        int i = 0, tail_pages = 0;
        struct page *page;
 -      pgoff_t start;
  
 -      start = pvec->pages[0]->index;
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
 +      mapping_set_update(&xas, mapping);
 +      xas_for_each(&xas, page, ULONG_MAX) {
                if (i >= pagevec_count(pvec) && !tail_pages)
                        break;
 -              page = radix_tree_deref_slot_protected(slot,
 -                                                     &mapping->i_pages.xa_lock);
 -              if (radix_tree_exceptional_entry(page))
 +              if (xa_is_value(page))
                        continue;
                if (!tail_pages) {
                        /*
                         * have our pages locked so they are protected from
                         * being removed.
                         */
 -                      if (page != pvec->pages[i])
 +                      if (page != pvec->pages[i]) {
 +                              VM_BUG_ON_PAGE(page->index >
 +                                              pvec->pages[i]->index, page);
                                continue;
 +                      }
                        WARN_ON_ONCE(!PageLocked(page));
                        if (PageTransHuge(page) && !PageHuge(page))
                                tail_pages = HPAGE_PMD_NR - 1;
                         */
                        i++;
                } else {
 +                      VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
 +                                      != pvec->pages[i]->index, page);
                        tail_pages--;
                }
 -              radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
 -              __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
 -                              workingset_lookup_update(mapping));
 +              xas_store(&xas, NULL);
                total_pages++;
        }
        mapping->nrpages -= total_pages;
@@@ -347,7 -381,7 +347,7 @@@ void delete_from_page_cache_batch(struc
  
                unaccount_page_cache_page(mapping, pvec->pages[i]);
        }
 -      page_cache_tree_delete_batch(mapping, pvec);
 +      page_cache_delete_batch(mapping, pvec);
        xa_unlock_irqrestore(&mapping->i_pages, flags);
  
        for (i = 0; i < pagevec_count(pvec); i++)
@@@ -457,31 -491,20 +457,31 @@@ EXPORT_SYMBOL(filemap_flush)
  bool filemap_range_has_page(struct address_space *mapping,
                           loff_t start_byte, loff_t end_byte)
  {
 -      pgoff_t index = start_byte >> PAGE_SHIFT;
 -      pgoff_t end = end_byte >> PAGE_SHIFT;
        struct page *page;
 +      XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
 +      pgoff_t max = end_byte >> PAGE_SHIFT;
  
        if (end_byte < start_byte)
                return false;
  
 -      if (mapping->nrpages == 0)
 -              return false;
 +      rcu_read_lock();
 +      for (;;) {
 +              page = xas_find(&xas, max);
 +              if (xas_retry(&xas, page))
 +                      continue;
 +              /* Shadow entries don't count */
 +              if (xa_is_value(page))
 +                      continue;
 +              /*
 +               * We don't need to try to pin this page; we're about to
 +               * release the RCU lock anyway.  It is enough to know that
 +               * there was a page here recently.
 +               */
 +              break;
 +      }
 +      rcu_read_unlock();
  
 -      if (!find_get_pages_range(mapping, &index, end, 1, &page))
 -              return false;
 -      put_page(page);
 -      return true;
 +      return page != NULL;
  }
  EXPORT_SYMBOL(filemap_range_has_page);
  
@@@ -752,44 -775,51 +752,44 @@@ EXPORT_SYMBOL(file_write_and_wait_range
   * locked.  This function does not add the new page to the LRU, the
   * caller must do that.
   *
 - * The remove + add is atomic.  The only way this function can fail is
 - * memory allocation failure.
 + * The remove + add is atomic.  This function cannot fail.
   */
  int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
  {
 -      int error;
 +      struct address_space *mapping = old->mapping;
 +      void (*freepage)(struct page *) = mapping->a_ops->freepage;
 +      pgoff_t offset = old->index;
 +      XA_STATE(xas, &mapping->i_pages, offset);
 +      unsigned long flags;
  
        VM_BUG_ON_PAGE(!PageLocked(old), old);
        VM_BUG_ON_PAGE(!PageLocked(new), new);
        VM_BUG_ON_PAGE(new->mapping, new);
  
 -      error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
 -      if (!error) {
 -              struct address_space *mapping = old->mapping;
 -              void (*freepage)(struct page *);
 -              unsigned long flags;
 -
 -              pgoff_t offset = old->index;
 -              freepage = mapping->a_ops->freepage;
 -
 -              get_page(new);
 -              new->mapping = mapping;
 -              new->index = offset;
 +      get_page(new);
 +      new->mapping = mapping;
 +      new->index = offset;
  
 -              xa_lock_irqsave(&mapping->i_pages, flags);
 -              __delete_from_page_cache(old, NULL);
 -              error = page_cache_tree_insert(mapping, new, NULL);
 -              BUG_ON(error);
 +      xas_lock_irqsave(&xas, flags);
 +      xas_store(&xas, new);
  
 -              /*
 -               * hugetlb pages do not participate in page cache accounting.
 -               */
 -              if (!PageHuge(new))
 -                      __inc_node_page_state(new, NR_FILE_PAGES);
 -              if (PageSwapBacked(new))
 -                      __inc_node_page_state(new, NR_SHMEM);
 -              xa_unlock_irqrestore(&mapping->i_pages, flags);
 -              mem_cgroup_migrate(old, new);
 -              radix_tree_preload_end();
 -              if (freepage)
 -                      freepage(old);
 -              put_page(old);
 -      }
 +      old->mapping = NULL;
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!PageHuge(old))
 +              __dec_node_page_state(new, NR_FILE_PAGES);
 +      if (!PageHuge(new))
 +              __inc_node_page_state(new, NR_FILE_PAGES);
 +      if (PageSwapBacked(old))
 +              __dec_node_page_state(new, NR_SHMEM);
 +      if (PageSwapBacked(new))
 +              __inc_node_page_state(new, NR_SHMEM);
 +      xas_unlock_irqrestore(&xas, flags);
 +      mem_cgroup_migrate(old, new);
 +      if (freepage)
 +              freepage(old);
 +      put_page(old);
  
 -      return error;
 +      return 0;
  }
  EXPORT_SYMBOL_GPL(replace_page_cache_page);
  
@@@ -798,15 -828,12 +798,15 @@@ static int __add_to_page_cache_locked(s
                                      pgoff_t offset, gfp_t gfp_mask,
                                      void **shadowp)
  {
 +      XA_STATE(xas, &mapping->i_pages, offset);
        int huge = PageHuge(page);
        struct mem_cgroup *memcg;
        int error;
 +      void *old;
  
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageSwapBacked(page), page);
 +      mapping_set_update(&xas, mapping);
  
        if (!huge) {
                error = mem_cgroup_try_charge(page, current->mm,
                        return error;
        }
  
 -      error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
 -      if (error) {
 -              if (!huge)
 -                      mem_cgroup_cancel_charge(page, memcg, false);
 -              return error;
 -      }
 -
        get_page(page);
        page->mapping = mapping;
        page->index = offset;
  
 -      xa_lock_irq(&mapping->i_pages);
 -      error = page_cache_tree_insert(mapping, page, shadowp);
 -      radix_tree_preload_end();
 -      if (unlikely(error))
 -              goto err_insert;
 +      do {
 +              xas_lock_irq(&xas);
 +              old = xas_load(&xas);
 +              if (old && !xa_is_value(old))
 +                      xas_set_err(&xas, -EEXIST);
 +              xas_store(&xas, page);
 +              if (xas_error(&xas))
 +                      goto unlock;
 +
 +              if (xa_is_value(old)) {
 +                      mapping->nrexceptional--;
 +                      if (shadowp)
 +                              *shadowp = old;
 +              }
 +              mapping->nrpages++;
 +
 +              /* hugetlb pages do not participate in page cache accounting */
 +              if (!huge)
 +                      __inc_node_page_state(page, NR_FILE_PAGES);
 +unlock:
 +              xas_unlock_irq(&xas);
 +      } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
 +
 +      if (xas_error(&xas))
 +              goto error;
  
 -      /* hugetlb pages do not participate in page cache accounting. */
 -      if (!huge)
 -              __inc_node_page_state(page, NR_FILE_PAGES);
 -      xa_unlock_irq(&mapping->i_pages);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false, false);
        trace_mm_filemap_add_to_page_cache(page);
        return 0;
 -err_insert:
 +error:
        page->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
 -      xa_unlock_irq(&mapping->i_pages);
        if (!huge)
                mem_cgroup_cancel_charge(page, memcg, false);
        put_page(page);
 -      return error;
 +      return xas_error(&xas);
  }
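
The store loop above is the standard XArray insertion idiom: attempt
the store under the lock, and if the operation ran out of memory let
xas_nomem() allocate outside the lock and retry.  Stripped to its
skeleton (a sketch; gfp stands for the caller's allocation mask):

	do {
		xas_lock_irq(&xas);
		/* xas_load()/xas_store() work goes here */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));	/* allocates and retries on ENOMEM */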
  
  /**
@@@ -896,9 -915,12 +896,9 @@@ int add_to_page_cache_lru(struct page *
                 * data from the working set, only to cache data that will
                 * get overwritten with something else, is a waste of memory.
                 */
 -              if (!(gfp_mask & __GFP_WRITE) &&
 -                  shadow && workingset_refault(shadow)) {
 -                      SetPageActive(page);
 -                      workingset_activation(page);
 -              } else
 -                      ClearPageActive(page);
 +              WARN_ON_ONCE(PageActive(page));
 +              if (!(gfp_mask & __GFP_WRITE) && shadow)
 +                      workingset_refault(page, shadow);
                lru_cache_add(page);
        }
        return ret;
@@@ -1054,18 -1076,8 +1054,18 @@@ static inline int wait_on_page_bit_comm
  {
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
 +      bool thrashing = false;
 +      unsigned long pflags;
        int ret = 0;
  
 +      if (bit_nr == PG_locked &&
 +          !PageUptodate(page) && PageWorkingset(page)) {
 +              if (!PageSwapBacked(page))
 +                      delayacct_thrashing_start();
 +              psi_memstall_enter(&pflags);
 +              thrashing = true;
 +      }
 +
        init_wait(wait);
        wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
        wait->func = wake_page_function;
  
        finish_wait(q, wait);
  
 +      if (thrashing) {
 +              if (!PageSwapBacked(page))
 +                      delayacct_thrashing_end();
 +              psi_memstall_leave(&pflags);
 +      }
 +
        /*
         * A signal could leave PageWaiters set. Clearing it here if
         * !waitqueue_active would be possible (by open-coding finish_wait),
@@@ -1320,76 -1326,86 +1320,76 @@@ int __lock_page_or_retry(struct page *p
  }
  
  /**
 - * page_cache_next_hole - find the next hole (not-present entry)
 - * @mapping: mapping
 - * @index: index
 - * @max_scan: maximum range to search
 - *
 - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
 - * lowest indexed hole.
 - *
 - * Returns: the index of the hole if found, otherwise returns an index
 - * outside of the set specified (in which case 'return - index >=
 - * max_scan' will be true). In rare cases of index wrap-around, 0 will
 - * be returned.
 - *
 - * page_cache_next_hole may be called under rcu_read_lock. However,
 - * like radix_tree_gang_lookup, this will not atomically search a
 - * snapshot of the tree at a single point in time. For example, if a
 - * hole is created at index 5, then subsequently a hole is created at
 - * index 10, page_cache_next_hole covering both indexes may return 10
 - * if called under rcu_read_lock.
 + * page_cache_next_miss() - Find the next gap in the page cache.
 + * @mapping: Mapping.
 + * @index: Index.
 + * @max_scan: Maximum range to search.
 + *
 + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 + * gap with the lowest index.
 + *
 + * This function may be called under the rcu_read_lock.  However, this will
 + * not atomically search a snapshot of the cache at a single point in time.
 + * For example, if a gap is created at index 5, then subsequently a gap is
 + * created at index 10, page_cache_next_miss covering both indices may
 + * return 10 if called under the rcu_read_lock.
 + *
 + * Return: The index of the gap if found, otherwise an index outside the
 + * range specified (in which case 'return - index >= max_scan' will be true).
 + * In the rare case of index wrap-around, 0 will be returned.
   */
 -pgoff_t page_cache_next_hole(struct address_space *mapping,
 +pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
  {
 -      unsigned long i;
 -
 -      for (i = 0; i < max_scan; i++) {
 -              struct page *page;
 +      XA_STATE(xas, &mapping->i_pages, index);
  
 -              page = radix_tree_lookup(&mapping->i_pages, index);
 -              if (!page || radix_tree_exceptional_entry(page))
 +      while (max_scan--) {
 +              void *entry = xas_next(&xas);
 +              if (!entry || xa_is_value(entry))
                        break;
 -              index++;
 -              if (index == 0)
 +              if (xas.xa_index == 0)
                        break;
        }
  
 -      return index;
 +      return xas.xa_index;
  }
 -EXPORT_SYMBOL(page_cache_next_hole);
 +EXPORT_SYMBOL(page_cache_next_miss);
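
A usage sketch (all names hypothetical): measuring how many consecutive
pages are already cached starting at index:

	pgoff_t gap = page_cache_next_miss(mapping, index, max_scan);
	unsigned long cached = gap - index;	/* >= max_scan if no gap found */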
  
  /**
 - * page_cache_prev_hole - find the prev hole (not-present entry)
 - * @mapping: mapping
 - * @index: index
 - * @max_scan: maximum range to search
 - *
 - * Search backwards in the range [max(index-max_scan+1, 0), index] for
 - * the first hole.
 - *
 - * Returns: the index of the hole if found, otherwise returns an index
 - * outside of the set specified (in which case 'index - return >=
 - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
 - * will be returned.
 - *
 - * page_cache_prev_hole may be called under rcu_read_lock. However,
 - * like radix_tree_gang_lookup, this will not atomically search a
 - * snapshot of the tree at a single point in time. For example, if a
 - * hole is created at index 10, then subsequently a hole is created at
 - * index 5, page_cache_prev_hole covering both indexes may return 5 if
 - * called under rcu_read_lock.
 + * page_cache_prev_miss() - Find the previous gap in the page cache.
 + * @mapping: Mapping.
 + * @index: Index.
 + * @max_scan: Maximum range to search.
 + *
 + * Search the range [max(index - max_scan + 1, 0), index] for the
 + * gap with the highest index.
 + *
 + * This function may be called under the rcu_read_lock.  However, this will
 + * not atomically search a snapshot of the cache at a single point in time.
 + * For example, if a gap is created at index 10, then subsequently a gap is
 + * created at index 5, page_cache_prev_miss() covering both indices may
 + * return 5 if called under the rcu_read_lock.
 + *
 + * Return: The index of the gap if found, otherwise an index outside the
 + * range specified (in which case 'index - return >= max_scan' will be true).
 + * In the rare case of wrap-around, ULONG_MAX will be returned.
   */
 -pgoff_t page_cache_prev_hole(struct address_space *mapping,
 +pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
  {
 -      unsigned long i;
 -
 -      for (i = 0; i < max_scan; i++) {
 -              struct page *page;
 +      XA_STATE(xas, &mapping->i_pages, index);
  
 -              page = radix_tree_lookup(&mapping->i_pages, index);
 -              if (!page || radix_tree_exceptional_entry(page))
 +      while (max_scan--) {
 +              void *entry = xas_prev(&xas);
 +              if (!entry || xa_is_value(entry))
                        break;
 -              index--;
 -              if (index == ULONG_MAX)
 +              if (xas.xa_index == ULONG_MAX)
                        break;
        }
  
 -      return index;
 +      return xas.xa_index;
  }
 -EXPORT_SYMBOL(page_cache_prev_hole);
 +EXPORT_SYMBOL(page_cache_prev_miss);
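
The backwards search obeys the mirrored contract.  A hedged sketch along the
same lines (last_gap() is again illustrative):

	static pgoff_t last_gap(struct address_space *mapping,
				pgoff_t index, unsigned long max_scan)
	{
		pgoff_t gap = page_cache_prev_miss(mapping, index, max_scan);

		/* Mirrored contract: "not found" iff index - gap >= max_scan. */
		if (index - gap >= max_scan)
			return ULONG_MAX;
		return gap;
	}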
  
  /**
   * find_get_entry - find and get a page cache entry
   */
  struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
  {
 -      void **pagep;
 +      XA_STATE(xas, &mapping->i_pages, offset);
        struct page *head, *page;
  
        rcu_read_lock();
  repeat:
 -      page = NULL;
 -      pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
 -      if (pagep) {
 -              page = radix_tree_deref_slot(pagep);
 -              if (unlikely(!page))
 -                      goto out;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page))
 -                              goto repeat;
 -                      /*
 -                       * A shadow entry of a recently evicted page,
 -                       * or a swap entry from shmem/tmpfs.  Return
 -                       * it without attempting to raise page count.
 -                       */
 -                      goto out;
 -              }
 +      xas_reset(&xas);
 +      page = xas_load(&xas);
 +      if (xas_retry(&xas, page))
 +              goto repeat;
 +      /*
 +       * A shadow entry of a recently evicted page, or a swap entry from
 +       * shmem/tmpfs.  Return it without attempting to raise page count.
 +       */
 +      if (!page || xa_is_value(page))
 +              goto out;
  
 -              head = compound_head(page);
 -              if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +      head = compound_head(page);
 +      if (!page_cache_get_speculative(head))
 +              goto repeat;
  
 -              /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +      /* The page was split under us? */
 +      if (compound_head(page) != head) {
 +              put_page(head);
 +              goto repeat;
 +      }
  
 -              /*
 -               * Has the page moved?
 -               * This is part of the lockless pagecache protocol. See
 -               * include/linux/pagemap.h for details.
 -               */
 -              if (unlikely(page != *pagep)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +      /*
 +       * Has the page moved?
 +       * This is part of the lockless pagecache protocol. See
 +       * include/linux/pagemap.h for details.
 +       */
 +      if (unlikely(page != xas_reload(&xas))) {
 +              put_page(head);
 +              goto repeat;
        }
  out:
        rcu_read_unlock();
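
For callers, the visible change in find_get_entry() is that exceptional
entries are now tested with xa_is_value() rather than
radix_tree_exceptional_entry().  A hedged caller-side sketch (the branch
bodies are illustrative):

	struct page *page = find_get_entry(mapping, index);

	if (!page) {
		/* nothing cached at @index */
	} else if (xa_is_value(page)) {
		/* shadow, swap or DAX value entry: no reference was
		 * taken, so there is nothing to put */
	} else {
		/* a real page, returned with an elevated refcount */
		put_page(page);
	}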
@@@ -1470,7 -1493,7 +1470,7 @@@ struct page *find_lock_entry(struct add
  
  repeat:
        page = find_get_entry(mapping, offset);
 -      if (page && !radix_tree_exception(page)) {
 +      if (page && !xa_is_value(page)) {
                lock_page(page);
                /* Has the page been truncated? */
                if (unlikely(page_mapping(page) != mapping)) {
@@@ -1516,7 -1539,7 +1516,7 @@@ struct page *pagecache_get_page(struct 
  
  repeat:
        page = find_get_entry(mapping, offset);
 -      if (radix_tree_exceptional_entry(page))
 +      if (xa_is_value(page))
                page = NULL;
        if (!page)
                goto no_page;
@@@ -1602,48 -1625,53 +1602,48 @@@ unsigned find_get_entries(struct addres
                          pgoff_t start, unsigned int nr_entries,
                          struct page **entries, pgoff_t *indices)
  {
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, start);
 +      struct page *page;
        unsigned int ret = 0;
 -      struct radix_tree_iter iter;
  
        if (!nr_entries)
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
 -              struct page *head, *page;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each(&xas, page, ULONG_MAX) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page, a swap
 -                       * entry from shmem/tmpfs or a DAX entry.  Return it
 -                       * without attempting to raise page count.
 -                       */
 +              /*
 +               * A shadow entry of a recently evicted page, a swap
 +               * entry from shmem/tmpfs or a DAX entry.  Return it
 +               * without attempting to raise page count.
 +               */
 +              if (xa_is_value(page))
                        goto export;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
 +
  export:
 -              indices[ret] = iter.index;
 +              indices[ret] = xas.xa_index;
                entries[ret] = page;
                if (++ret == nr_entries)
                        break;
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
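
A hedged sketch of a batch caller, in the style of the truncate paths that
consume find_get_entries(); PAGEVEC_SIZE is the usual batch size and the
loop body is illustrative:

	struct page *pages[PAGEVEC_SIZE];
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int i, nr;

	nr = find_get_entries(mapping, start, PAGEVEC_SIZE, pages, indices);
	for (i = 0; i < nr; i++) {
		if (xa_is_value(pages[i]))
			continue;	/* value entry: no reference held */
		/* ... operate on pages[i], cached at index indices[i] ... */
		put_page(pages[i]);
	}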
@@@ -1674,50 -1702,64 +1674,50 @@@ unsigned find_get_pages_range(struct ad
                              pgoff_t end, unsigned int nr_pages,
                              struct page **pages)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, *start);
 +      struct page *page;
        unsigned ret = 0;
  
        if (unlikely(!nr_pages))
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
 -              struct page *head, *page;
 -
 -              if (iter.index > end)
 -                      break;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each(&xas, page, end) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page,
 -                       * or a swap entry from shmem/tmpfs.  Skip
 -                       * over it.
 -                       */
 +              /* Skip over shadow, swap and DAX entries */
 +              if (xa_is_value(page))
                        continue;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
  
                pages[ret] = page;
                if (++ret == nr_pages) {
 -                      *start = pages[ret - 1]->index + 1;
 +                      *start = page->index + 1;
                        goto out;
                }
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
  
        /*
         * We come here when there is no page beyond @end. We take care to not
         * overflow the index @start as it confuses some of the callers. This
 -       * breaks the iteration when there is page at index -1 but that is
 +       * breaks the iteration when there is a page at index -1 but that is
         * already broken anyway.
         */
        if (end == (pgoff_t)-1)
  unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                               unsigned int nr_pages, struct page **pages)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, index);
 +      struct page *page;
        unsigned int ret = 0;
  
        if (unlikely(!nr_pages))
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
 -              struct page *head, *page;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              /* The hole, there no reason to continue */
 -              if (unlikely(!page))
 -                      break;
 -
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page,
 -                       * or a swap entry from shmem/tmpfs.  Stop
 -                       * looking for contiguous pages.
 -                       */
 +      for (page = xas_load(&xas); page; page = xas_next(&xas)) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
 +                      continue;
 +              /*
 +               * If the entry has been swapped out, we can stop looking.
 +               * No current caller is looking for DAX entries.
 +               */
 +              if (xa_is_value(page))
                        break;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
  
                /*
                 * must check mapping and index after taking the ref.
                 * otherwise we can get both false positives and false
                 * negatives, which is just confusing to the caller.
                 */
 -              if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
 +              if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
                        put_page(page);
                        break;
                }
                pages[ret] = page;
                if (++ret == nr_pages)
                        break;
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
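
Contiguity is the contract here: the first hole, or now any value entry,
ends the batch.  A hedged usage sketch (batch size illustrative):

	struct page *pages[8];
	unsigned int nr = find_get_pages_contig(mapping, index, 8, pages);

	/* pages[0..nr-1] sit at index, index + 1, ..., index + nr - 1;
	 * a gap at index + k yields nr == k. */
	while (nr--)
		put_page(pages[nr]);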
@@@ -1813,58 -1864,74 +1813,58 @@@ EXPORT_SYMBOL(find_get_pages_contig)
   * @tag.   We update @index to index the next page for the traversal.
   */
  unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
 -                      pgoff_t end, int tag, unsigned int nr_pages,
 +                      pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, *index);
 +      struct page *page;
        unsigned ret = 0;
  
        if (unlikely(!nr_pages))
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
 -              struct page *head, *page;
 -
 -              if (iter.index > end)
 -                      break;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each_marked(&xas, page, end, tag) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page.
 -                       *
 -                       * Those entries should never be tagged, but
 -                       * this tree walk is lockless and the tags are
 -                       * looked up in bulk, one radix tree node at a
 -                       * time, so there is a sizable window for page
 -                       * reclaim to evict a page we saw tagged.
 -                       *
 -                       * Skip over it.
 -                       */
 +              /*
 +               * Shadow entries should never be tagged, but this iteration
 +               * is lockless so there is a window for page reclaim to evict
 +               * a page we saw tagged.  Skip over it.
 +               */
 +              if (xa_is_value(page))
                        continue;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
  
                pages[ret] = page;
                if (++ret == nr_pages) {
 -                      *index = pages[ret - 1]->index + 1;
 +                      *index = page->index + 1;
                        goto out;
                }
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
  
        /*
 -       * We come here when we got at @end. We take care to not overflow the
 +       * We come here when we got to @end. We take care to not overflow the
         * index @index as it confuses some of the callers. This breaks the
 -       * iteration when there is page at index -1 but that is already broken
 -       * anyway.
 +       * iteration when there is a page at index -1 but that is already
 +       * broken anyway.
         */
        if (end == (pgoff_t)-1)
                *index = (pgoff_t)-1;
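
The tag argument is now the typed xa_mark_t.  A hedged writeback-style
caller, assuming PAGECACHE_TAG_DIRTY and an illustrative batch size; note
that the function advances *index for the next call:

	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int i, nr;

	while ((nr = find_get_pages_range_tag(mapping, &index, (pgoff_t)-1,
				PAGECACHE_TAG_DIRTY, 16, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			/* ... write back pages[i] ... */
			put_page(pages[i]);
		}
	}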
@@@ -1890,51 -1957,57 +1890,51 @@@ EXPORT_SYMBOL(find_get_pages_range_tag)
   * @tag.
   */
  unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
 -                      int tag, unsigned int nr_entries,
 +                      xa_mark_t tag, unsigned int nr_entries,
                        struct page **entries, pgoff_t *indices)
  {
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, start);
 +      struct page *page;
        unsigned int ret = 0;
 -      struct radix_tree_iter iter;
  
        if (!nr_entries)
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
 -              struct page *head, *page;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -
 -                      /*
 -                       * A shadow entry of a recently evicted page, a swap
 -                       * entry from shmem/tmpfs or a DAX entry.  Return it
 -                       * without attempting to raise page count.
 -                       */
 +              /*
 +               * A shadow entry of a recently evicted page, a swap
 +               * entry from shmem/tmpfs or a DAX entry.  Return it
 +               * without attempting to raise page count.
 +               */
 +              if (xa_is_value(page))
                        goto export;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
 +
  export:
 -              indices[ret] = iter.index;
 +              indices[ret] = xas.xa_index;
                entries[ret] = page;
                if (++ret == nr_entries)
                        break;
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
@@@ -2049,7 -2122,7 +2049,7 @@@ find_page
                                        !mapping->a_ops->is_partially_uptodate)
                                goto page_not_up_to_date;
                        /* pipes can't handle partially uptodate pages */
 -                      if (unlikely(iter->type & ITER_PIPE))
 +                      if (unlikely(iov_iter_is_pipe(iter)))
                                goto page_not_up_to_date;
                        if (!trylock_page(page))
                                goto page_not_up_to_date;
@@@ -2508,7 -2581,9 +2508,7 @@@ no_cached_page
         * system is low on memory, or a problem occurs while trying
         * to schedule I/O.
         */
 -      if (error == -ENOMEM)
 -              return VM_FAULT_OOM;
 -      return VM_FAULT_SIGBUS;
 +      return vmf_error(error);
  
  page_not_uptodate:
        /*
@@@ -2538,31 -2613,45 +2538,31 @@@ EXPORT_SYMBOL(filemap_fault)
  void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
        unsigned long max_idx;
 +      XA_STATE(xas, &mapping->i_pages, start_pgoff);
        struct page *head, *page;
  
        rcu_read_lock();
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
 -              if (iter.index > end_pgoff)
 -                      break;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 -                      goto next;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 +      xas_for_each(&xas, page, end_pgoff) {
 +              if (xas_retry(&xas, page))
 +                      continue;
 +              if (xa_is_value(page))
                        goto next;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto next;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto skip;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto skip;
  
                if (!PageUptodate(page) ||
                                PageReadahead(page) ||
                if (file->f_ra.mmap_miss > 0)
                        file->f_ra.mmap_miss--;
  
 -              vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
 +              vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                if (vmf->pte)
 -                      vmf->pte += iter.index - last_pgoff;
 -              last_pgoff = iter.index;
 +                      vmf->pte += xas.xa_index - last_pgoff;
 +              last_pgoff = xas.xa_index;
                if (alloc_set_pte(vmf, NULL, page))
                        goto unlock;
                unlock_page(page);
@@@ -2597,6 -2686,8 +2597,6 @@@ next
                /* Huge page is mapped? No need to proceed. */
                if (pmd_trans_huge(*vmf->pmd))
                        break;
 -              if (iter.index == end_pgoff)
 -                      break;
        }
        rcu_read_unlock();
  }
@@@ -2657,9 -2748,9 +2657,9 @@@ int generic_file_readonly_mmap(struct f
        return generic_file_mmap(file, vma);
  }
  #else
 -int filemap_page_mkwrite(struct vm_fault *vmf)
 +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
  {
 -      return -ENOSYS;
 +      return VM_FAULT_SIGBUS;
  }
  int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
  {
@@@ -2706,7 -2797,7 +2706,7 @@@ repeat
                        put_page(page);
                        if (err == -EEXIST)
                                goto repeat;
 -                      /* Presumably ENOMEM for radix tree node */
 +                      /* Presumably ENOMEM for xarray node */
                        return ERR_PTR(err);
                }
  
@@@ -2825,6 -2916,42 +2825,42 @@@ struct page *read_cache_page_gfp(struc
  EXPORT_SYMBOL(read_cache_page_gfp);
  
  /*
+  * Don't operate on ranges the page cache doesn't support, and don't exceed the
+  * LFS limits.  If pos is under the limit it becomes a short access.  If it
+  * exceeds the limit we return -EFBIG.
+  */
+ static int generic_access_check_limits(struct file *file, loff_t pos,
+                                      loff_t *count)
+ {
+       struct inode *inode = file->f_mapping->host;
+       loff_t max_size = inode->i_sb->s_maxbytes;
+
+       if (!(file->f_flags & O_LARGEFILE))
+               max_size = MAX_NON_LFS;
+       if (unlikely(pos >= max_size))
+               return -EFBIG;
+       *count = min(*count, max_size - pos);
+       return 0;
+ }
+
+ static int generic_write_check_limits(struct file *file, loff_t pos,
+                                     loff_t *count)
+ {
+       loff_t limit = rlimit(RLIMIT_FSIZE);
+
+       if (limit != RLIM_INFINITY) {
+               if (pos >= limit) {
+                       send_sig(SIGXFSZ, current, 0);
+                       return -EFBIG;
+               }
+               *count = min(*count, limit - pos);
+       }
+
+       return generic_access_check_limits(file, pos, count);
+ }
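
A hedged worked example of the clamping semantics (the numbers and the open
file are illustrative): with RLIMIT_FSIZE at 1 MiB, a 16 KiB write starting
4 KiB short of the limit is shortened rather than rejected:

	loff_t count = 16384;
	int err = generic_write_check_limits(file, (1 << 20) - 4096, &count);

	/* err == 0 and count == 4096: a short write.  At pos >= 1 MiB the
	 * caller would instead get SIGXFSZ and -EFBIG. */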
+ /*
   * Performs necessary checks before doing a write
   *
   * Can adjust writing position or amount of bytes to write.
@@@ -2835,8 -2962,8 +2871,8 @@@ inline ssize_t generic_write_checks(str
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-       unsigned long limit = rlimit(RLIMIT_FSIZE);
-       loff_t pos;
+       loff_t count;
+       int ret;
  
        if (!iov_iter_count(from))
                return 0;
        if (iocb->ki_flags & IOCB_APPEND)
                iocb->ki_pos = i_size_read(inode);
  
-       pos = iocb->ki_pos;
        if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
                return -EINVAL;
  
-       if (limit != RLIM_INFINITY) {
-               if (iocb->ki_pos >= limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
-               iov_iter_truncate(from, limit - (unsigned long)pos);
-       }
+       count = iov_iter_count(from);
+       ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+       if (ret)
+               return ret;
+
+       iov_iter_truncate(from, count);
+       return iov_iter_count(from);
+ }
+ EXPORT_SYMBOL(generic_write_checks);
+
+ /*
+  * Performs necessary checks before doing a clone.
+  *
+  * Can adjust amount of bytes to clone.
+  * Returns appropriate error code that caller should return or
+  * zero in case the clone should be allowed.
+  */
+ int generic_remap_checks(struct file *file_in, loff_t pos_in,
+                        struct file *file_out, loff_t pos_out,
+                        loff_t *req_count, unsigned int remap_flags)
+ {
+       struct inode *inode_in = file_in->f_mapping->host;
+       struct inode *inode_out = file_out->f_mapping->host;
+       uint64_t count = *req_count;
+       uint64_t bcount;
+       loff_t size_in, size_out;
+       loff_t bs = inode_out->i_sb->s_blocksize;
+       int ret;
+
+       /* The start of both ranges must be aligned to an fs block. */
+       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+               return -EINVAL;
+
+       /* Ensure offsets don't wrap. */
+       if (pos_in + count < pos_in || pos_out + count < pos_out)
+               return -EINVAL;
+
+       size_in = i_size_read(inode_in);
+       size_out = i_size_read(inode_out);
+
+       /* Dedupe requires both ranges to be within EOF. */
+       if ((remap_flags & REMAP_FILE_DEDUP) &&
+           (pos_in >= size_in || pos_in + count > size_in ||
+            pos_out >= size_out || pos_out + count > size_out))
+               return -EINVAL;
+
+       /* Ensure the infile range is within the infile. */
+       if (pos_in >= size_in)
+               return -EINVAL;
+       count = min(count, size_in - (uint64_t)pos_in);
+
+       ret = generic_access_check_limits(file_in, pos_in, &count);
+       if (ret)
+               return ret;
+
+       ret = generic_write_check_limits(file_out, pos_out, &count);
+       if (ret)
+               return ret;
  
        /*
-        * LFS rule
+        * If the user wanted us to link to the infile's EOF, round up to the
+        * next block boundary for this check.
+        *
+        * Otherwise, make sure the count is also block-aligned, having
+        * already confirmed the starting offsets' block alignment.
         */
-       if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
-                               !(file->f_flags & O_LARGEFILE))) {
-               if (pos >= MAX_NON_LFS)
-                       return -EFBIG;
-               iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
+       if (pos_in + count == size_in) {
+               bcount = ALIGN(size_in, bs) - pos_in;
+       } else {
+               if (!IS_ALIGNED(count, bs))
+                       count = ALIGN_DOWN(count, bs);
+               bcount = count;
        }
  
+       /* Don't allow overlapped cloning within the same file. */
+       if (inode_in == inode_out &&
+           pos_out + bcount > pos_in &&
+           pos_out < pos_in + bcount)
+               return -EINVAL;
+
        /*
-        * Are we about to exceed the fs block limit ?
-        *
-        * If we have written data it becomes a short write.  If we have
-        * exceeded without writing data we send a signal and return EFBIG.
-        * Linus frestrict idea will clean these up nicely..
+        * We shortened the request but the caller can't deal with that, so
+        * bounce the request back to userspace.
         */
-       if (unlikely(pos >= inode->i_sb->s_maxbytes))
-               return -EFBIG;
+       if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+               return -EINVAL;
  
-       iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
-       return iov_iter_count(from);
+       *req_count = count;
+       return 0;
  }
- EXPORT_SYMBOL(generic_write_checks);
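
A hedged worked example of the EOF rounding above (block size and offsets
illustrative): cloning the tail of a 10000-byte file with 4096-byte blocks:

	loff_t bs = 4096, size_in = 10000, pos_in = 8192;
	uint64_t count = size_in - pos_in;		/* 1808, runs to EOF */
	uint64_t bcount = ALIGN(size_in, bs) - pos_in;	/* 12288 - 8192 = 4096 */

	/* The overlap check covers the whole partial EOF block (bcount),
	 * while the returned *req_count stays the unaligned 1808. */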
  
  int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
@@@ -2921,7 -3104,7 +3013,7 @@@ generic_file_direct_write(struct kiocb 
        if (iocb->ki_flags & IOCB_NOWAIT) {
                /* If there are pages to writeback, return */
                if (filemap_range_has_page(inode->i_mapping, pos,
 -                                         pos + iov_iter_count(from)))
 +                                         pos + write_len))
                        return -EAGAIN;
        } else {
                written = filemap_write_and_wait_range(mapping, pos,