Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 16:33:08 +0000 (09:33 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Nov 2018 16:33:08 +0000 (09:33 -0700)
Pull vfs dedup fixes from Dave Chinner:
 "This reworks the vfs data cloning infrastructure.

  We discovered many issues with these interfaces late in the 4.19 cycle
  - the worst of them (data corruption, setuid stripping) were fixed for
  XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
  the problems was needed. That rework is the contents of this pull
  request.

  Rework the vfs_clone_file_range and vfs_dedupe_file_range
  infrastructure to use a common .remap_file_range method and supply
  generic bounds and sanity checking functions that are shared with the
  data write path. The current VFS infrastructure has problems with
  rlimit, LFS file sizes, file time stamps, maximum filesystem file
  sizes, stripping setuid bits, etc., all of which are addressed in
  these commits.

  We also introduce the ability for the ->remap_file_range methods to
  return short clones, so that clones issued via vfs_copy_file_range()
  don't get rejected if the entire range can't be cloned. It also
  allows filesystems to silently skip deduplication of partial EOF
  blocks when they cannot handle them, rather than forcing errors out
  to userspace.

  Existing filesystems are converted to use the new ->remap_file_range
  method, and both XFS and ocfs2 are modified to make use of the new
  generic checking infrastructure"
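
For reference, the unified method described above has the following
shape. This is a minimal sketch modelled on the btrfs conversion in the
diff below; example_clone_extents() and example_dedupe_extents() are
hypothetical stand-ins for a filesystem's own remap machinery:

	loff_t example_remap_file_range(struct file *file_in, loff_t pos_in,
					struct file *file_out, loff_t pos_out,
					loff_t len, unsigned int remap_flags)
	{
		int ret;

		/* Reject any remap flags this filesystem doesn't understand. */
		if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
			return -EINVAL;

		if (remap_flags & REMAP_FILE_DEDUP)
			ret = example_dedupe_extents(file_in, pos_in,
						     file_out, pos_out, len);
		else
			ret = example_clone_extents(file_in, pos_in,
						    file_out, pos_out, len);

		/* On success, report how many bytes were actually remapped. */
		return ret < 0 ? ret : len;
	}

Callers that cannot tolerate a short operation now check the returned
byte count themselves, as the FICLONERANGE path in the fs/ioctl.c hunk
below does:

	loff_t cloned;
	int ret;

	cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
				      olen, 0);
	if (cloned < 0)
		ret = cloned;
	else if (olen && cloned != olen)
		ret = -EINVAL;	/* caller cannot accept a short clone */
	else
		ret = 0;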

* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
  xfs: remove [cm]time update from reflink calls
  xfs: remove xfs_reflink_remap_range
  xfs: remove redundant remap partial EOF block checks
  xfs: support returning partial reflink results
  xfs: clean up xfs_reflink_remap_blocks call site
  xfs: fix pagecache truncation prior to reflink
  ocfs2: remove ocfs2_reflink_remap_range
  ocfs2: support partial clone range and dedupe range
  ocfs2: fix pagecache truncation prior to reflink
  ocfs2: truncate page cache for clone destination file before remapping
  vfs: clean up generic_remap_file_range_prep return value
  vfs: hide file range comparison function
  vfs: enable remap callers that can handle short operations
  vfs: plumb remap flags through the vfs dedupe functions
  vfs: plumb remap flags through the vfs clone functions
  vfs: make remap_file_range functions take and return bytes completed
  vfs: remap helper should update destination inode metadata
  vfs: pass remap flags to generic_remap_checks
  vfs: pass remap flags to generic_remap_file_range_prep
  vfs: combine the clone and dedupe into a single remap_file_range
  ...

14 files changed:
Documentation/filesystems/porting
fs/btrfs/ctree.h
fs/btrfs/file.c
fs/btrfs/ioctl.c
fs/cifs/cifsfs.c
fs/ioctl.c
fs/nfsd/vfs.c
fs/ocfs2/refcounttree.c
fs/overlayfs/copy_up.c
fs/read_write.c
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h
include/linux/fs.h
mm/filemap.c

diff --combined Documentation/filesystems/porting
@@@ -623,13 -623,7 +623,18 @@@ in your dentry operations instead
        On success you get a new struct file sharing the mount/dentry with the
        original, on failure - ERR_PTR().
  --
+ [mandatory]
+       ->clone_file_range() and ->dedupe_file_range have been replaced with
+       ->remap_file_range().  See Documentation/filesystems/vfs.txt for more
+       information.
++--
 +[recommended]
 +      ->lookup() instances doing an equivalent of
 +              if (IS_ERR(inode))
 +                      return ERR_CAST(inode);
 +              return d_splice_alias(inode, dentry);
 +      don't need to bother with the check - d_splice_alias() will do the
 +      right thing when given ERR_PTR(...) as inode.  Moreover, passing NULL
 +      inode to d_splice_alias() will also do the right thing (equivalent of
 +      d_add(dentry, NULL); return NULL;), so that kind of special cases
 +      also doesn't need a separate treatment.
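
Following that recommendation, a ->lookup() instance can collapse to
something like the sketch below; foo_lookup() and foo_get_inode() are
hypothetical names standing in for a filesystem's own lookup machinery:

	static struct dentry *foo_lookup(struct inode *dir,
					 struct dentry *dentry,
					 unsigned int flags)
	{
		struct inode *inode;

		/* May return a valid inode, NULL, or ERR_PTR(...). */
		inode = foo_get_inode(dir, dentry);

		/* d_splice_alias() handles NULL and ERR_PTR(inode) itself. */
		return d_splice_alias(inode, dentry);
	}
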
diff --combined fs/btrfs/ctree.h
@@@ -41,6 -41,12 +41,6 @@@ extern struct kmem_cache *btrfs_path_ca
  extern struct kmem_cache *btrfs_free_space_cachep;
  struct btrfs_ordered_sum;
  
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -#define STATIC noinline
 -#else
 -#define STATIC static noinline
 -#endif
 -
  #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
  
  #define BTRFS_MAX_MIRRORS 3
@@@ -361,13 -367,11 +361,13 @@@ struct btrfs_dev_replace 
  
        struct mutex lock_finishing_cancel_unmount;
        rwlock_t lock;
 -      atomic_t read_locks;
        atomic_t blocking_readers;
        wait_queue_head_t read_lock_wq;
  
        struct btrfs_scrub_progress scrub_progress;
 +
 +      struct percpu_counter bio_counter;
 +      wait_queue_head_t replace_wait;
  };
  
  /* For raid type sysfs entries */
@@@ -1090,6 -1094,9 +1090,6 @@@ struct btrfs_fs_info 
        /* device replace state */
        struct btrfs_dev_replace dev_replace;
  
 -      struct percpu_counter bio_counter;
 -      wait_queue_head_t replace_wait;
 -
        struct semaphore uuid_tree_rescan_sem;
  
        /* Used to reclaim the metadata space in the background. */
@@@ -1195,12 -1202,18 +1195,12 @@@ struct btrfs_root 
        int last_log_commit;
        pid_t log_start_pid;
  
 -      u64 objectid;
        u64 last_trans;
  
        u32 type;
  
        u64 highest_objectid;
  
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -      /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
 -      u64 alloc_bytenr;
 -#endif
 -
        u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;
        spinlock_t qgroup_meta_rsv_lock;
        u64 qgroup_meta_rsv_pertrans;
        u64 qgroup_meta_rsv_prealloc;
 +
 +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 +      u64 alloc_bytenr;
 +#endif
  };
  
  struct btrfs_file_private {
@@@ -2598,8 -2607,10 +2598,8 @@@ static inline u64 btrfs_calc_trunc_meta
        return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
  }
  
 -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_fs_info *fs_info);
 -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 -                                     struct btrfs_fs_info *fs_info);
 +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
 +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
  void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
                                         const u64 start);
  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@@ -2760,7 -2771,7 +2760,7 @@@ int btrfs_block_rsv_refill(struct btrfs
                           enum btrfs_reserve_flush_enum flush);
  int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
 -                          int update_size);
 +                          bool update_size);
  int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
                             struct btrfs_block_rsv *dest, u64 num_bytes,
                             int min_factor);
@@@ -2866,6 -2877,8 +2866,6 @@@ void btrfs_release_path(struct btrfs_pa
  struct btrfs_path *btrfs_alloc_path(void);
  void btrfs_free_path(struct btrfs_path *p);
  void btrfs_set_path_blocking(struct btrfs_path *p);
 -void btrfs_clear_path_blocking(struct btrfs_path *p,
 -                             struct extent_buffer *held, int held_rw);
  void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
  
  int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@@ -3008,7 -3021,8 +3008,7 @@@ int btrfs_uuid_tree_iterate(struct btrf
  /* dir-item.c */
  int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
                          const char *name, int name_len);
 -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 -                        struct btrfs_root *root, const char *name,
 +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
                          int name_len, struct btrfs_inode *dir,
                          struct btrfs_key *location, u8 type, u64 index);
  struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
@@@ -3166,8 -3180,8 +3166,8 @@@ void __cold btrfs_destroy_cachep(void)
  struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                         struct btrfs_root *root, int *was_new);
  struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 -              struct page *page, size_t pg_offset,
 -              u64 start, u64 end, int create);
 +                                  struct page *page, size_t pg_offset,
 +                                  u64 start, u64 end, int create);
  int btrfs_update_inode(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct inode *inode);
@@@ -3187,6 -3201,9 +3187,6 @@@ int btrfs_prealloc_file_range_trans(str
                                    u64 start, u64 num_bytes, u64 min_size,
                                    loff_t actual_len, u64 *alloc_hint);
  extern const struct dentry_operations btrfs_dentry_operations;
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -void btrfs_test_inode_set_ops(struct inode *inode);
 -#endif
  
  /* ioctl.c */
  long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@@ -3201,9 -3218,6 +3201,6 @@@ void btrfs_get_block_group_info(struct 
                                struct btrfs_ioctl_space_info *space);
  void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_balance_args *bargs);
- int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-                           struct file *dst_file, loff_t dst_loff,
-                           u64 olen);
  
  /* file.c */
  int __init btrfs_auto_defrag_init(void);
@@@ -3233,8 -3247,9 +3230,9 @@@ int btrfs_dirty_pages(struct inode *ino
                      size_t num_pages, loff_t pos, size_t write_bytes,
                      struct extent_state **cached);
  int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
- int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
-                          struct file *file_out, loff_t pos_out, u64 len);
+ loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
+                             struct file *file_out, loff_t pos_out,
+                             loff_t len, unsigned int remap_flags);
  
  /* tree-defrag.c */
  int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@@ -3699,19 -3714,18 +3697,19 @@@ static inline int btrfs_defrag_cancelle
  
  /* Sanity test specific functions */
  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 +void btrfs_test_inode_set_ops(struct inode *inode);
  void btrfs_test_destroy_inode(struct inode *inode);
 -#endif
  
  static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
  {
 -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 -      if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
 -                            &fs_info->fs_state)))
 -              return 1;
 -#endif
 +      return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
 +}
 +#else
 +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
 +{
        return 0;
  }
 +#endif
  
  static inline void cond_wake_up(struct wait_queue_head *wq)
  {
diff --combined fs/btrfs/file.c
@@@ -531,14 -531,6 +531,14 @@@ int btrfs_dirty_pages(struct inode *ino
  
        end_of_last_block = start_pos + num_bytes - 1;
  
 +      /*
 +       * The pages may have already been dirty, clear out old accounting so
 +       * we can set things up properly
 +       */
 +      clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
 +                       EXTENT_DIRTY | EXTENT_DELALLOC |
 +                       EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
 +
        if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
                if (start_pos >= isize &&
                    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@@ -1508,27 -1500,18 +1508,27 @@@ lock_and_cleanup_extent_if_need(struct 
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
 -              clear_extent_bit(&inode->io_tree, start_pos, last_pos,
 -                               EXTENT_DIRTY | EXTENT_DELALLOC |
 -                               EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 -                               0, 0, cached_state);
 +
                *lockstart = start_pos;
                *lockend = last_pos;
                ret = 1;
        }
  
 +      /*
 +       * It's possible the pages are dirty right now, but we don't want
 +       * to clean them yet because copy_from_user may catch a page fault
 +       * and we might have to fall back to one page at a time.  If that
 +       * happens, we'll unlock these pages and we'd have a window where
 +       * reclaim could sneak in and drop the once-dirty page on the floor
 +       * without writing it.
 +       *
 +       * We have the pages locked and the extent range locked, so there's
 +       * no way someone can start IO on any dirty pages in this range.
 +       *
 +       * We'll call btrfs_dirty_pages() later on, and that will flip around
 +       * delalloc bits and dirty the pages as required.
 +       */
        for (i = 0; i < num_pages; i++) {
 -              if (clear_page_dirty_for_io(pages[i]))
 -                      account_page_redirty(pages[i]);
                set_page_extent_mapped(pages[i]);
                WARN_ON(!PageLocked(pages[i]));
        }
@@@ -2078,14 -2061,6 +2078,14 @@@ int btrfs_sync_file(struct file *file, 
                goto out;
  
        inode_lock(inode);
 +
 +      /*
 +       * We take the dio_sem here because the tree log stuff can race with
 +       * lockless dio writes and get an extent map logged for an extent we
 +       * never waited on.  We need it this high up for lockdep reasons.
 +       */
 +      down_write(&BTRFS_I(inode)->dio_sem);
 +
        atomic_inc(&root->log_batch);
  
        /*
         */
        ret = btrfs_wait_ordered_range(inode, start, len);
        if (ret) {
 +              up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
                 * checked called fsync.
                 */
                ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
 +              up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
 +              up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
 +      up_write(&BTRFS_I(inode)->dio_sem);
        inode_unlock(inode);
  
        /*
@@@ -2573,7 -2544,7 +2573,7 @@@ static int btrfs_punch_hole(struct inod
        }
  
        ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
 -                                    min_size, 0);
 +                                    min_size, false);
        BUG_ON(ret);
        trans->block_rsv = rsv;
  
                }
  
                ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
 -                                            rsv, min_size, 0);
 +                                            rsv, min_size, false);
                BUG_ON(ret);    /* shouldn't happen */
                trans->block_rsv = rsv;
  
@@@ -3298,8 -3269,7 +3298,7 @@@ const struct file_operations btrfs_file
  #ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_compat_ioctl,
  #endif
-       .clone_file_range = btrfs_clone_file_range,
-       .dedupe_file_range = btrfs_dedupe_file_range,
+       .remap_file_range = btrfs_remap_file_range,
  };
  
  void __cold btrfs_auto_defrag_exit(void)
diff --combined fs/btrfs/ioctl.c
@@@ -491,6 -491,7 +491,6 @@@ static noinline int btrfs_ioctl_fitrim(
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
 -      u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret;
  
        if (!capable(CAP_SYS_ADMIN))
                return -EOPNOTSUPP;
        if (copy_from_user(&range, arg, sizeof(range)))
                return -EFAULT;
 -      if (range.start > total_bytes ||
 -          range.len < fs_info->sb->s_blocksize)
 +
 +      /*
 +       * NOTE: Don't truncate the range using super->total_bytes.  Bytenr of
 +       * block group is in the logical address space, which can be any
 +       * sectorsize aligned bytenr in  the range [0, U64_MAX].
 +       */
 +      if (range.len < fs_info->sb->s_blocksize)
                return -EINVAL;
  
 -      range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
        ret = btrfs_trim_fs(fs_info, &range);
        if (ret < 0)
@@@ -689,7 -686,8 +689,7 @@@ static noinline int create_subvol(struc
                goto fail;
        }
  
 -      ret = btrfs_insert_dir_item(trans, root,
 -                                  name, namelen, BTRFS_I(dir), &key,
 +      ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
                                    BTRFS_FT_DIR, index);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
@@@ -1326,7 -1324,7 +1326,7 @@@ again
  
        if (i_done != page_cnt) {
                spin_lock(&BTRFS_I(inode)->lock);
 -              BTRFS_I(inode)->outstanding_extents++;
 +              btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
                spin_unlock(&BTRFS_I(inode)->lock);
                btrfs_delalloc_release_space(inode, data_reserved,
                                start_index << PAGE_SHIFT,
@@@ -3629,26 -3627,6 +3629,6 @@@ out_unlock
        return ret;
  }
  
- int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
-                           struct file *dst_file, loff_t dst_loff,
-                           u64 olen)
- {
-       struct inode *src = file_inode(src_file);
-       struct inode *dst = file_inode(dst_file);
-       u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-       if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
-               /*
-                * Btrfs does not support blocksize < page_size. As a
-                * result, btrfs_cmp_data() won't correctly handle
-                * this situation without an update.
-                */
-               return -EINVAL;
-       }
-       return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
- }
  static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     u64 endoff,
@@@ -4350,10 -4328,34 +4330,34 @@@ out_unlock
        return ret;
  }
  
- int btrfs_clone_file_range(struct file *src_file, loff_t off,
-               struct file *dst_file, loff_t destoff, u64 len)
+ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
+               struct file *dst_file, loff_t destoff, loff_t len,
+               unsigned int remap_flags)
  {
-       return btrfs_clone_files(dst_file, src_file, off, len, destoff);
+       int ret;
+       if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+               return -EINVAL;
+       if (remap_flags & REMAP_FILE_DEDUP) {
+               struct inode *src = file_inode(src_file);
+               struct inode *dst = file_inode(dst_file);
+               u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+               if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
+                       /*
+                        * Btrfs does not support blocksize < page_size. As a
+                        * result, btrfs_cmp_data() won't correctly handle
+                        * this situation without an update.
+                        */
+                       return -EINVAL;
+               }
+               ret = btrfs_extent_same(src, off, len, dst, destoff);
+       } else {
+               ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
+       }
+       return ret < 0 ? ret : len;
  }
  
  static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
                ret = PTR_ERR(new_root);
                goto out;
        }
 -      if (!is_fstree(new_root->objectid)) {
 +      if (!is_fstree(new_root->root_key.objectid)) {
                ret = -ENOENT;
                goto out;
        }
diff --combined fs/cifs/cifsfs.c
@@@ -81,14 -81,6 +81,14 @@@ module_param(cifs_max_pending, uint, 04
  MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
                                   "CIFS/SMB1 dialect (N/A for SMB3) "
                                   "Default: 32767 Range: 2 to 32767.");
 +#ifdef CONFIG_CIFS_STATS2
 +unsigned int slow_rsp_threshold = 1;
 +module_param(slow_rsp_threshold, uint, 0644);
 +MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait "
 +                                 "before logging that a response is delayed. "
 +                                 "Default: 1 (if set to 0 disables msg).");
 +#endif /* STATS2 */
 +
  module_param(enable_oplocks, bool, 0644);
  MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
  
@@@ -500,8 -492,6 +500,8 @@@ cifs_show_options(struct seq_file *s, s
                seq_puts(s, ",unix");
        else
                seq_puts(s, ",nounix");
 +      if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
 +              seq_puts(s, ",nodfs");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
                seq_puts(s, ",posixpaths");
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@@ -717,14 -707,7 +717,14 @@@ cifs_smb3_do_mount(struct file_system_t
        struct cifs_mnt_data mnt_data;
        struct dentry *root;
  
 -      cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
 +      /*
 +       * Prints in Kernel / CIFS log the attempted mount operation
 +       *      If CIFS_DEBUG && cifs_FYI
 +       */
 +      if (cifsFYI)
 +              cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags);
 +      else
 +              cifs_info("Attempting to mount %s\n", dev_name);
  
        volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3);
        if (IS_ERR(volume_info))
@@@ -992,8 -975,9 +992,9 @@@ const struct inode_operations cifs_syml
        .listxattr = cifs_listxattr,
  };
  
- static int cifs_clone_file_range(struct file *src_file, loff_t off,
-               struct file *dst_file, loff_t destoff, u64 len)
+ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
+               struct file *dst_file, loff_t destoff, loff_t len,
+               unsigned int remap_flags)
  {
        struct inode *src_inode = file_inode(src_file);
        struct inode *target_inode = file_inode(dst_file);
        unsigned int xid;
        int rc;
  
+       if (remap_flags & ~REMAP_FILE_ADVISORY)
+               return -EINVAL;
        cifs_dbg(FYI, "clone range\n");
  
        xid = get_xid();
        unlock_two_nondirectories(src_inode, target_inode);
  out:
        free_xid(xid);
-       return rc;
+       return rc < 0 ? rc : len;
  }
  
  ssize_t cifs_file_copychunk_range(unsigned int xid,
@@@ -1151,7 -1138,7 +1155,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1170,7 -1157,7 +1174,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1189,7 -1176,7 +1193,7 @@@ const struct file_operations cifs_file_
        .splice_write = iter_file_splice_write,
        .unlocked_ioctl  = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .llseek = cifs_llseek,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@@ -1208,7 -1195,7 +1212,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1226,7 -1213,7 +1230,7 @@@ const struct file_operations cifs_file_
        .llseek = cifs_llseek,
        .unlocked_ioctl = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
  };
@@@ -1244,7 -1231,7 +1248,7 @@@ const struct file_operations cifs_file_
        .splice_write = iter_file_splice_write,
        .unlocked_ioctl  = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .llseek = cifs_llseek,
        .setlease = cifs_setlease,
        .fallocate = cifs_fallocate,
@@@ -1256,7 -1243,7 +1260,7 @@@ const struct file_operations cifs_dir_o
        .read    = generic_read_dir,
        .unlocked_ioctl  = cifs_ioctl,
        .copy_file_range = cifs_copy_file_range,
-       .clone_file_range = cifs_clone_file_range,
+       .remap_file_range = cifs_remap_file_range,
        .llseek = generic_file_llseek,
        .fsync = cifs_dir_fsync,
  };
@@@ -1435,11 -1422,6 +1439,11 @@@ init_cifs(void
  #ifdef CONFIG_CIFS_STATS2
        atomic_set(&totBufAllocCount, 0);
        atomic_set(&totSmBufAllocCount, 0);
 +      if (slow_rsp_threshold < 1)
 +              cifs_dbg(FYI, "slow_response_threshold msgs disabled\n");
 +      else if (slow_rsp_threshold > 32767)
 +              cifs_dbg(VFS,
 +                     "slow response threshold set higher than recommended (0 to 32767)\n");
  #endif /* CONFIG_CIFS_STATS2 */
  
        atomic_set(&midCount, 0);
@@@ -1560,11 -1542,11 +1564,11 @@@ exit_cifs(void
        cifs_proc_clean();
  }
  
 -MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
 +MODULE_AUTHOR("Steve French");
  MODULE_LICENSE("GPL");        /* combination of LGPL + GPL source behaves as GPL */
  MODULE_DESCRIPTION
 -    ("VFS to access servers complying with the SNIA CIFS Specification "
 -     "e.g. Samba and Windows");
 +      ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
 +      "also older servers complying with the SNIA CIFS Specification)");
  MODULE_VERSION(CIFS_VERSION);
  MODULE_SOFTDEP("pre: arc4");
  MODULE_SOFTDEP("pre: des");
diff --combined fs/ioctl.c
@@@ -223,6 -223,7 +223,7 @@@ static long ioctl_file_clone(struct fil
                             u64 off, u64 olen, u64 destoff)
  {
        struct fd src_file = fdget(srcfd);
+       loff_t cloned;
        int ret;
  
        if (!src_file.file)
        ret = -EXDEV;
        if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
                goto fdput;
-       ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+       cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
+                                     olen, 0);
+       if (cloned < 0)
+               ret = cloned;
+       else if (olen && cloned != olen)
+               ret = -EINVAL;
+       else
+               ret = 0;
  fdput:
        fdput(src_file);
        return ret;
@@@ -669,9 -677,6 +677,9 @@@ int do_vfs_ioctl(struct file *filp, uns
                return ioctl_fiemap(filp, arg);
  
        case FIGETBSZ:
 +              /* anon_bdev filesystems may not have a block size */
 +              if (!inode->i_sb->s_blocksize)
 +                      return -EINVAL;
                return put_user(inode->i_sb->s_blocksize, argp);
  
        case FICLONE:
diff --combined fs/nfsd/vfs.c
@@@ -541,8 -541,12 +541,12 @@@ __be32 nfsd4_set_nfs4_label(struct svc_
  __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
                u64 dst_pos, u64 count)
  {
-       return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
-                                            count));
+       loff_t cloned;
+       cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
+       if (count && cloned != count)
+               cloned = -EINVAL;
+       return nfserrno(cloned < 0 ? cloned : 0);
  }
  
  ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
@@@ -923,7 -927,7 +927,7 @@@ __be32 nfsd_readv(struct svc_rqst *rqst
        int host_err;
  
        trace_nfsd_read_vector(rqstp, fhp, offset, *count);
 -      iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count);
 +      iov_iter_kvec(&iter, READ, vec, vlen, *count);
        host_err = vfs_iter_read(file, &iter, &offset, 0);
        return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
  }
@@@ -999,7 -1003,7 +1003,7 @@@ nfsd_vfs_write(struct svc_rqst *rqstp, 
        if (stable && !use_wgather)
                flags |= RWF_SYNC;
  
 -      iov_iter_kvec(&iter, WRITE | ITER_KVEC, vec, vlen, *cnt);
 +      iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
        host_err = vfs_iter_write(file, &iter, &pos, flags);
        if (host_err < 0)
                goto out_nfserr;
@@@ -1276,6 -1280,7 +1280,6 @@@ nfsd_create(struct svc_rqst *rqstp, str
                int type, dev_t rdev, struct svc_fh *resfhp)
  {
        struct dentry   *dentry, *dchild = NULL;
 -      struct inode    *dirp;
        __be32          err;
        int             host_err;
  
                return err;
  
        dentry = fhp->fh_dentry;
 -      dirp = d_inode(dentry);
  
        host_err = fh_want_write(fhp);
        if (host_err)
@@@ -1407,7 -1413,6 +1411,7 @@@ do_nfsd_create(struct svc_rqst *rqstp, 
                                        *created = 1;
                                break;
                        }
 +                      /* fall through */
                case NFS4_CREATE_EXCLUSIVE4_1:
                        if (   d_inode(dchild)->i_mtime.tv_sec == v_mtime
                            && d_inode(dchild)->i_atime.tv_sec == v_atime
                                        *created = 1;
                                goto set_attr;
                        }
 -                       /* fallthru */
 +                      /* fall through */
                case NFS3_CREATE_GUARDED:
                        err = nfserr_exist;
                }
diff --combined fs/ocfs2/refcounttree.c
@@@ -4135,6 -4135,7 +4135,6 @@@ static int ocfs2_create_reflink_node(st
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
 -      struct ocfs2_refcount_block *rb;
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
        struct ocfs2_refcount_tree *ref_tree;
  
                mlog_errno(ret);
                goto out;
        }
 -      rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
  
        ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
                                          &ref_tree->rf_ci, ref_root_bh,
@@@ -4466,9 -4468,9 +4466,9 @@@ out
  }
  
  /* Update destination inode size, if necessary. */
- static int ocfs2_reflink_update_dest(struct inode *dest,
-                                    struct buffer_head *d_bh,
-                                    loff_t newlen)
+ int ocfs2_reflink_update_dest(struct inode *dest,
+                             struct buffer_head *d_bh,
+                             loff_t newlen)
  {
        handle_t *handle;
        int ret;
@@@ -4505,14 -4507,14 +4505,14 @@@ out_commit
  }
  
  /* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
- static int ocfs2_reflink_remap_extent(struct inode *s_inode,
-                                     struct buffer_head *s_bh,
-                                     loff_t pos_in,
-                                     struct inode *t_inode,
-                                     struct buffer_head *t_bh,
-                                     loff_t pos_out,
-                                     loff_t len,
-                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
+ static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
+                                        struct buffer_head *s_bh,
+                                        loff_t pos_in,
+                                        struct inode *t_inode,
+                                        struct buffer_head *t_bh,
+                                        loff_t pos_out,
+                                        loff_t len,
+                                        struct ocfs2_cached_dealloc_ctxt *dealloc)
  {
        struct ocfs2_extent_tree s_et;
        struct ocfs2_extent_tree t_et;
        struct buffer_head *ref_root_bh = NULL;
        struct ocfs2_refcount_tree *ref_tree;
        struct ocfs2_super *osb;
+       loff_t remapped_bytes = 0;
        loff_t pstart, plen;
-       u32 p_cluster, num_clusters, slast, spos, tpos;
+       u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
        unsigned int ext_flags;
        int ret = 0;
  
  next_loop:
                spos += num_clusters;
                tpos += num_clusters;
+               remapped_clus += num_clusters;
        }
  
- out:
-       return ret;
+       goto out;
  out_unlock_refcount:
        ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
        brelse(ref_root_bh);
-       return ret;
+ out:
+       remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
+       remapped_bytes = min_t(loff_t, len, remapped_bytes);
+       return remapped_bytes > 0 ? remapped_bytes : ret;
  }
  
  /* Set up refcount tree and remap s_inode to t_inode. */
- static int ocfs2_reflink_remap_blocks(struct inode *s_inode,
-                                     struct buffer_head *s_bh,
-                                     loff_t pos_in,
-                                     struct inode *t_inode,
-                                     struct buffer_head *t_bh,
-                                     loff_t pos_out,
-                                     loff_t len)
+ loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
+                                 struct buffer_head *s_bh,
+                                 loff_t pos_in,
+                                 struct inode *t_inode,
+                                 struct buffer_head *t_bh,
+                                 loff_t pos_out,
+                                 loff_t len)
  {
        struct ocfs2_cached_dealloc_ctxt dealloc;
        struct ocfs2_super *osb;
        struct ocfs2_dinode *dis;
        struct ocfs2_dinode *dit;
-       int ret;
+       loff_t ret;
  
        osb = OCFS2_SB(s_inode->i_sb);
        dis = (struct ocfs2_dinode *)s_bh->b_data;
        /* Actually remap extents now. */
        ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
                                         pos_out, len, &dealloc);
-       if (ret) {
+       if (ret < 0) {
                mlog_errno(ret);
                goto out;
        }
  }
  
  /* Lock an inode and grab a bh pointing to the inode. */
- static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
-                                    struct buffer_head **bh1,
-                                    struct inode *t_inode,
-                                    struct buffer_head **bh2)
+ int ocfs2_reflink_inodes_lock(struct inode *s_inode,
+                             struct buffer_head **bh1,
+                             struct inode *t_inode,
+                             struct buffer_head **bh2)
  {
        struct inode *inode1;
        struct inode *inode2;
@@@ -4801,10 -4808,10 +4806,10 @@@ out_i1
  }
  
  /* Unlock both inodes and release buffers. */
- static void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
-                                       struct buffer_head *s_bh,
-                                       struct inode *t_inode,
-                                       struct buffer_head *t_bh)
+ void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
+                                struct buffer_head *s_bh,
+                                struct inode *t_inode,
+                                struct buffer_head *t_bh)
  {
        ocfs2_inode_unlock(s_inode, 1);
        ocfs2_rw_unlock(s_inode, 1);
        }
        unlock_two_nondirectories(s_inode, t_inode);
  }
- /* Link a range of blocks from one file to another. */
- int ocfs2_reflink_remap_range(struct file *file_in,
-                             loff_t pos_in,
-                             struct file *file_out,
-                             loff_t pos_out,
-                             u64 len,
-                             bool is_dedupe)
- {
-       struct inode *inode_in = file_inode(file_in);
-       struct inode *inode_out = file_inode(file_out);
-       struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
-       struct buffer_head *in_bh = NULL, *out_bh = NULL;
-       bool same_inode = (inode_in == inode_out);
-       ssize_t ret;
-       if (!ocfs2_refcount_tree(osb))
-               return -EOPNOTSUPP;
-       if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
-               return -EROFS;
-       /* Lock both files against IO */
-       ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
-       if (ret)
-               return ret;
-       /* Check file eligibility and prepare for block sharing. */
-       ret = -EINVAL;
-       if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
-           (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
-               goto out_unlock;
-       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-                       &len, is_dedupe);
-       if (ret <= 0)
-               goto out_unlock;
-       /* Lock out changes to the allocation maps and remap. */
-       down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
-       if (!same_inode)
-               down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
-                                 SINGLE_DEPTH_NESTING);
-       ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out,
-                                        out_bh, pos_out, len);
-       /* Zap any page cache for the destination file's range. */
-       if (!ret)
-               truncate_inode_pages_range(&inode_out->i_data, pos_out,
-                                          PAGE_ALIGN(pos_out + len) - 1);
-       up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
-       if (!same_inode)
-               up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_unlock;
-       }
-       /*
-        * Empty the extent map so that we may get the right extent
-        * record from the disk.
-        */
-       ocfs2_extent_map_trunc(inode_in, 0);
-       ocfs2_extent_map_trunc(inode_out, 0);
-       ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
-       if (ret) {
-               mlog_errno(ret);
-               goto out_unlock;
-       }
-       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
-       return 0;
- out_unlock:
-       ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
-       return ret;
- }
diff --combined fs/overlayfs/copy_up.c
@@@ -125,6 -125,7 +125,7 @@@ static int ovl_copy_up_data(struct pat
        struct file *new_file;
        loff_t old_pos = 0;
        loff_t new_pos = 0;
+       loff_t cloned;
        int error = 0;
  
        if (len == 0)
        }
  
        /* Try to use clone_file_range to clone up within the same fs */
-       error = do_clone_file_range(old_file, 0, new_file, 0, len);
-       if (!error)
+       cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
+       if (cloned == len)
                goto out;
        /* Couldn't clone, so now we try to copy the data */
-       error = 0;
  
        /* FIXME: copy up sparse files efficiently */
        while (len) {
@@@ -395,6 -395,7 +395,6 @@@ struct ovl_copy_up_ctx 
        struct dentry *destdir;
        struct qstr destname;
        struct dentry *workdir;
 -      bool tmpfile;
        bool origin;
        bool indexed;
        bool metacopy;
@@@ -439,6 -440,63 +439,6 @@@ static int ovl_link_up(struct ovl_copy_
        return err;
  }
  
 -static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
 -                          struct dentry **newdentry)
 -{
 -      int err;
 -      struct dentry *upper;
 -      struct inode *udir = d_inode(c->destdir);
 -
 -      upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
 -      if (IS_ERR(upper))
 -              return PTR_ERR(upper);
 -
 -      if (c->tmpfile)
 -              err = ovl_do_link(temp, udir, upper);
 -      else
 -              err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
 -
 -      if (!err)
 -              *newdentry = dget(c->tmpfile ? upper : temp);
 -      dput(upper);
 -
 -      return err;
 -}
 -
 -static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
 -{
 -      int err;
 -      struct dentry *temp;
 -      const struct cred *old_creds = NULL;
 -      struct cred *new_creds = NULL;
 -      struct ovl_cattr cattr = {
 -              /* Can't properly set mode on creation because of the umask */
 -              .mode = c->stat.mode & S_IFMT,
 -              .rdev = c->stat.rdev,
 -              .link = c->link
 -      };
 -
 -      err = security_inode_copy_up(c->dentry, &new_creds);
 -      temp = ERR_PTR(err);
 -      if (err < 0)
 -              goto out;
 -
 -      if (new_creds)
 -              old_creds = override_creds(new_creds);
 -
 -      if (c->tmpfile)
 -              temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
 -      else
 -              temp = ovl_create_temp(c->workdir, &cattr);
 -out:
 -      if (new_creds) {
 -              revert_creds(old_creds);
 -              put_cred(new_creds);
 -      }
 -
 -      return temp;
 -}
 -
  static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
  {
        int err;
        return err;
  }
  
 -static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
 +struct ovl_cu_creds {
 +      const struct cred *old;
 +      struct cred *new;
 +};
 +
 +static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc)
 +{
 +      int err;
 +
 +      cc->old = cc->new = NULL;
 +      err = security_inode_copy_up(dentry, &cc->new);
 +      if (err < 0)
 +              return err;
 +
 +      if (cc->new)
 +              cc->old = override_creds(cc->new);
 +
 +      return 0;
 +}
 +
 +static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
 +{
 +      if (cc->new) {
 +              revert_creds(cc->old);
 +              put_cred(cc->new);
 +      }
 +}
 +
 +/*
 + * Copyup using workdir to prepare temp file.  Used when copying up directories,
 + * special files or when upper fs doesn't support O_TMPFILE.
 + */
 +static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
  {
 -      struct inode *udir = c->destdir->d_inode;
        struct inode *inode;
 -      struct dentry *newdentry = NULL;
 -      struct dentry *temp;
 +      struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
 +      struct dentry *temp, *upper;
 +      struct ovl_cu_creds cc;
        int err;
 +      struct ovl_cattr cattr = {
 +              /* Can't properly set mode on creation because of the umask */
 +              .mode = c->stat.mode & S_IFMT,
 +              .rdev = c->stat.rdev,
 +              .link = c->link
 +      };
  
 -      temp = ovl_get_tmpfile(c);
 +      err = ovl_lock_rename_workdir(c->workdir, c->destdir);
 +      if (err)
 +              return err;
 +
 +      err = ovl_prep_cu_creds(c->dentry, &cc);
 +      if (err)
 +              goto unlock;
 +
 +      temp = ovl_create_temp(c->workdir, &cattr);
 +      ovl_revert_cu_creds(&cc);
 +
 +      err = PTR_ERR(temp);
        if (IS_ERR(temp))
 -              return PTR_ERR(temp);
 +              goto unlock;
  
        err = ovl_copy_up_inode(c, temp);
        if (err)
 -              goto out;
 +              goto cleanup;
  
        if (S_ISDIR(c->stat.mode) && c->indexed) {
                err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
                if (err)
 -                      goto out;
 +                      goto cleanup;
        }
  
 -      if (c->tmpfile) {
 -              inode_lock_nested(udir, I_MUTEX_PARENT);
 -              err = ovl_install_temp(c, temp, &newdentry);
 -              inode_unlock(udir);
 -      } else {
 -              err = ovl_install_temp(c, temp, &newdentry);
 -      }
 +      upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
 +      err = PTR_ERR(upper);
 +      if (IS_ERR(upper))
 +              goto cleanup;
 +
 +      err = ovl_do_rename(wdir, temp, udir, upper, 0);
 +      dput(upper);
        if (err)
 -              goto out;
 +              goto cleanup;
  
        if (!c->metacopy)
                ovl_set_upperdata(d_inode(c->dentry));
        inode = d_inode(c->dentry);
 -      ovl_inode_update(inode, newdentry);
 +      ovl_inode_update(inode, temp);
        if (S_ISDIR(inode->i_mode))
                ovl_set_flag(OVL_WHITEOUTS, inode);
 +unlock:
 +      unlock_rename(c->workdir, c->destdir);
  
 -out:
 -      if (err && !c->tmpfile)
 -              ovl_cleanup(d_inode(c->workdir), temp);
 -      dput(temp);
        return err;
  
 +cleanup:
 +      ovl_cleanup(wdir, temp);
 +      dput(temp);
 +      goto unlock;
 +}
 +
 +/* Copyup using O_TMPFILE which does not require cross dir locking */
 +static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
 +{
 +      struct inode *udir = d_inode(c->destdir);
 +      struct dentry *temp, *upper;
 +      struct ovl_cu_creds cc;
 +      int err;
 +
 +      err = ovl_prep_cu_creds(c->dentry, &cc);
 +      if (err)
 +              return err;
 +
 +      temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
 +      ovl_revert_cu_creds(&cc);
 +
 +      if (IS_ERR(temp))
 +              return PTR_ERR(temp);
 +
 +      err = ovl_copy_up_inode(c, temp);
 +      if (err)
 +              goto out_dput;
 +
 +      inode_lock_nested(udir, I_MUTEX_PARENT);
 +
 +      upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
 +      err = PTR_ERR(upper);
 +      if (!IS_ERR(upper)) {
 +              err = ovl_do_link(temp, udir, upper);
 +              dput(upper);
 +      }
 +      inode_unlock(udir);
 +
 +      if (err)
 +              goto out_dput;
 +
 +      if (!c->metacopy)
 +              ovl_set_upperdata(d_inode(c->dentry));
 +      ovl_inode_update(d_inode(c->dentry), temp);
 +
 +      return 0;
 +
 +out_dput:
 +      dput(temp);
 +      return err;
  }
  
  /*
@@@ -685,10 -646,18 +685,10 @@@ static int ovl_do_copy_up(struct ovl_co
        }
  
        /* Should we copyup with O_TMPFILE or with workdir? */
 -      if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
 -              c->tmpfile = true;
 -              err = ovl_copy_up_locked(c);
 -      } else {
 -              err = ovl_lock_rename_workdir(c->workdir, c->destdir);
 -              if (!err) {
 -                      err = ovl_copy_up_locked(c);
 -                      unlock_rename(c->workdir, c->destdir);
 -              }
 -      }
 -
 -
 +      if (S_ISREG(c->stat.mode) && ofs->tmpfile)
 +              err = ovl_copy_up_tmpfile(c);
 +      else
 +              err = ovl_copy_up_workdir(c);
        if (err)
                goto out;
  
diff --combined fs/read_write.c
@@@ -331,7 -331,7 +331,7 @@@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned 
  }
  #endif
  
 -#ifdef __ARCH_WANT_SYS_LLSEEK
 +#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT)
  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                unsigned long, offset_low, loff_t __user *, result,
                unsigned int, whence)
@@@ -1407,6 -1407,7 +1407,6 @@@ static ssize_t do_sendfile(int out_fd, 
                goto fput_in;
        if (!(out.file->f_mode & FMODE_WRITE))
                goto fput_out;
 -      retval = -EINVAL;
        in_inode = file_inode(in.file);
        out_inode = file_inode(out.file);
        out_pos = out.file->f_pos;
@@@ -1587,11 -1588,15 +1587,15 @@@ ssize_t vfs_copy_file_range(struct fil
         * Try cloning first, this is supported by more file systems, and
         * more efficient if both clone and copy are supported (e.g. NFS).
         */
-       if (file_in->f_op->clone_file_range) {
-               ret = file_in->f_op->clone_file_range(file_in, pos_in,
-                               file_out, pos_out, len);
-               if (ret == 0) {
-                       ret = len;
+       if (file_in->f_op->remap_file_range) {
+               loff_t cloned;
+               cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+                               file_out, pos_out,
+                               min_t(loff_t, MAX_RW_COUNT, len),
+                               REMAP_FILE_CAN_SHORTEN);
+               if (cloned > 0) {
+                       ret = cloned;
                        goto done;
                }
        }
@@@ -1685,11 -1690,12 +1689,12 @@@ out2
        return ret;
  }
  
- static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
+                            bool write)
  {
        struct inode *inode = file_inode(file);
  
-       if (unlikely(pos < 0))
+       if (unlikely(pos < 0 || len < 0))
                return -EINVAL;
  
         if (unlikely((loff_t) (pos + len) < 0))
  
        return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
  }
+ /*
+  * Ensure that we don't remap a partial EOF block in the middle of something
+  * else.  Assume that the offsets have already been checked for block
+  * alignment.
+  *
+  * For deduplication we always scale down to the previous block because we
+  * can't meaningfully compare post-EOF contents.
+  *
+  * For clone we only link a partial EOF block above the destination file's EOF.
+  *
+  * Shorten the request if possible.
+  */
+ static int generic_remap_check_len(struct inode *inode_in,
+                                  struct inode *inode_out,
+                                  loff_t pos_out,
+                                  loff_t *len,
+                                  unsigned int remap_flags)
+ {
+       u64 blkmask = i_blocksize(inode_in) - 1;
+       loff_t new_len = *len;
+       if ((*len & blkmask) == 0)
+               return 0;
+       if ((remap_flags & REMAP_FILE_DEDUP) ||
+           pos_out + *len < i_size_read(inode_out))
+               new_len &= ~blkmask;
+       if (new_len == *len)
+               return 0;
+       if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
+               *len = new_len;
+               return 0;
+       }
+       return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
+ }
+ /*
+  * Read a page's worth of file data into the page cache.  Return the page
+  * locked.
+  */
+ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
+ {
+       struct page *page;
+       page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
+       if (IS_ERR(page))
+               return page;
+       if (!PageUptodate(page)) {
+               put_page(page);
+               return ERR_PTR(-EIO);
+       }
+       lock_page(page);
+       return page;
+ }
+ /*
+  * Compare extents of two files to see if they are the same.
+  * Caller must have locked both inodes to prevent write races.
+  */
+ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
+                                        struct inode *dest, loff_t destoff,
+                                        loff_t len, bool *is_same)
+ {
+       loff_t src_poff;
+       loff_t dest_poff;
+       void *src_addr;
+       void *dest_addr;
+       struct page *src_page;
+       struct page *dest_page;
+       loff_t cmp_len;
+       bool same;
+       int error;
+       error = -EINVAL;
+       same = true;
+       while (len) {
+               src_poff = srcoff & (PAGE_SIZE - 1);
+               dest_poff = destoff & (PAGE_SIZE - 1);
+               cmp_len = min(PAGE_SIZE - src_poff,
+                             PAGE_SIZE - dest_poff);
+               cmp_len = min(cmp_len, len);
+               if (cmp_len <= 0)
+                       goto out_error;
+               src_page = vfs_dedupe_get_page(src, srcoff);
+               if (IS_ERR(src_page)) {
+                       error = PTR_ERR(src_page);
+                       goto out_error;
+               }
+               dest_page = vfs_dedupe_get_page(dest, destoff);
+               if (IS_ERR(dest_page)) {
+                       error = PTR_ERR(dest_page);
+                       unlock_page(src_page);
+                       put_page(src_page);
+                       goto out_error;
+               }
+               src_addr = kmap_atomic(src_page);
+               dest_addr = kmap_atomic(dest_page);
+               flush_dcache_page(src_page);
+               flush_dcache_page(dest_page);
+               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+                       same = false;
+               kunmap_atomic(dest_addr);
+               kunmap_atomic(src_addr);
+               unlock_page(dest_page);
+               unlock_page(src_page);
+               put_page(dest_page);
+               put_page(src_page);
+               if (!same)
+                       break;
+               srcoff += cmp_len;
+               destoff += cmp_len;
+               len -= cmp_len;
+       }
+       *is_same = same;
+       return 0;
+ out_error:
+       return error;
+ }
  
  /*
   * Check that the two inodes are eligible for cloning, the ranges make
   * sense, and then flush all dirty data.  Caller must ensure that the
   * inodes have been locked against any other modifications.
   *
-  * Returns: 0 for "nothing to clone", 1 for "something to clone", or
-  * the usual negative error code.
+  * If there's an error, then the usual negative error code is returned.
+  * Otherwise returns 0 with *len set to the request length.
   */
- int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-                              struct inode *inode_out, loff_t pos_out,
-                              u64 *len, bool is_dedupe)
+ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+                                 struct file *file_out, loff_t pos_out,
+                                 loff_t *len, unsigned int remap_flags)
  {
-       loff_t bs = inode_out->i_sb->s_blocksize;
-       loff_t blen;
-       loff_t isize;
+       struct inode *inode_in = file_inode(file_in);
+       struct inode *inode_out = file_inode(file_out);
        bool same_inode = (inode_in == inode_out);
        int ret;
  
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;
  
-       /* Are we going all the way to the end? */
-       isize = i_size_read(inode_in);
-       if (isize == 0)
-               return 0;
        /* Zero length dedupe exits immediately; reflink goes to EOF. */
        if (*len == 0) {
-               if (is_dedupe || pos_in == isize)
+               loff_t isize = i_size_read(inode_in);
+               if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
                        return 0;
                if (pos_in > isize)
                        return -EINVAL;
                *len = isize - pos_in;
+               if (*len == 0)
+                       return 0;
        }
  
-       /* Ensure offsets don't wrap and the input is inside i_size */
-       if (pos_in + *len < pos_in || pos_out + *len < pos_out ||
-           pos_in + *len > isize)
-               return -EINVAL;
-       /* Don't allow dedupe past EOF in the dest file */
-       if (is_dedupe) {
-               loff_t  disize;
-               disize = i_size_read(inode_out);
-               if (pos_out >= disize || pos_out + *len > disize)
-                       return -EINVAL;
-       }
-       /* If we're linking to EOF, continue to the block boundary. */
-       if (pos_in + *len == isize)
-               blen = ALIGN(isize, bs) - pos_in;
-       else
-               blen = *len;
-       /* Only reflink if we're aligned to block boundaries */
-       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
-           !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
-               return -EINVAL;
-       /* Don't allow overlapped reflink within the same file */
-       if (same_inode) {
-               if (pos_out + blen > pos_in && pos_out < pos_in + blen)
-                       return -EINVAL;
-       }
+       /* Check that we don't violate system file offset limits. */
+       ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
+                       remap_flags);
+       if (ret)
+               return ret;
  
        /* Wait for the completion of any pending IOs on both files */
        inode_dio_wait(inode_in);
        /*
         * Check that the extents are the same.
         */
-       if (is_dedupe) {
+       if (remap_flags & REMAP_FILE_DEDUP) {
                bool            is_same = false;
  
                ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
                        return -EBADE;
        }
  
-       return 1;
+       ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
+                       remap_flags);
+       if (ret)
+               return ret;
+
+       /* A dedupe can't alter the file contents, so we're done. */
+       if (!(remap_flags & REMAP_FILE_DEDUP)) {
+               /* Update the timestamps, since we can alter file contents. */
+               if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+                       ret = file_update_time(file_out);
+                       if (ret)
+                               return ret;
+               }
+               /*
+                * Clear the security bits if the process is not being run by
+                * root.  This keeps people from modifying setuid and setgid
+                * binaries.
+                */
+               ret = file_remove_privs(file_out);
+               if (ret)
+                       return ret;
+       }
+       return 0;
  }
- EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
+ EXPORT_SYMBOL(generic_remap_file_range_prep);
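
A minimal sketch of how a filesystem's ->remap_file_range method might
drive this helper under the new contract (negative errno on error,
otherwise 0 with *len possibly shortened).  The myfs_* names are
hypothetical and the required inode locking is elided:

	static loff_t myfs_remap_file_range(struct file *file_in, loff_t pos_in,
					    struct file *file_out, loff_t pos_out,
					    loff_t len, unsigned int remap_flags)
	{
		loff_t ret;

		/* Validate the request and flush dirty data; may trim len. */
		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
						    pos_out, &len, remap_flags);
		if (ret < 0 || len == 0)
			return ret;

		/* myfs_remap_extents() stands in for the fs-specific remap. */
		ret = myfs_remap_extents(file_in, pos_in, file_out, pos_out, len);
		if (ret < 0)
			return ret;

		return len;	/* bytes remapped, per the byte-count contract */
	}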
  
- int do_clone_file_range(struct file *file_in, loff_t pos_in,
-                       struct file *file_out, loff_t pos_out, u64 len)
+ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+                          struct file *file_out, loff_t pos_out,
+                          loff_t len, unsigned int remap_flags)
  {
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);
-       int ret;
+       loff_t ret;
+
+       WARN_ON_ONCE(remap_flags);
  
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
            (file_out->f_flags & O_APPEND))
                return -EBADF;
  
-       if (!file_in->f_op->clone_file_range)
+       if (!file_in->f_op->remap_file_range)
                return -EOPNOTSUPP;
  
-       ret = clone_verify_area(file_in, pos_in, len, false);
+       ret = remap_verify_area(file_in, pos_in, len, false);
        if (ret)
                return ret;
  
-       ret = clone_verify_area(file_out, pos_out, len, true);
+       ret = remap_verify_area(file_out, pos_out, len, true);
        if (ret)
                return ret;
  
-       if (pos_in + len > i_size_read(inode_in))
-               return -EINVAL;
-       ret = file_in->f_op->clone_file_range(file_in, pos_in,
-                       file_out, pos_out, len);
-       if (!ret) {
-               fsnotify_access(file_in);
-               fsnotify_modify(file_out);
-       }
+       ret = file_in->f_op->remap_file_range(file_in, pos_in,
+                       file_out, pos_out, len, remap_flags);
+       if (ret < 0)
+               return ret;
  
+       fsnotify_access(file_in);
+       fsnotify_modify(file_out);
        return ret;
  }
  EXPORT_SYMBOL(do_clone_file_range);
  
- int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-                        struct file *file_out, loff_t pos_out, u64 len)
+ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+                           struct file *file_out, loff_t pos_out,
+                           loff_t len, unsigned int remap_flags)
  {
-       int ret;
+       loff_t ret;
  
        file_start_write(file_out);
-       ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+       ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
+                                 remap_flags);
        file_end_write(file_out);
  
        return ret;
  }
  EXPORT_SYMBOL(vfs_clone_file_range);
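
With the byte-count return convention, a caller that cannot accept a
short clone now checks the count itself.  Roughly (a sketch; off,
destoff, and olen are hypothetical locals):

	loff_t cloned;

	cloned = vfs_clone_file_range(src_file, off, dst_file, destoff,
				      olen, 0);
	if (cloned < 0)
		return cloned;		/* error */
	if (olen && cloned != olen)
		return -EINVAL;		/* short clone not tolerated here */
	return 0;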
  
- /*
-  * Read a page's worth of file data into the page cache.  Return the page
-  * locked.
-  */
- static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
- {
-       struct address_space *mapping;
-       struct page *page;
-       pgoff_t n;
-       n = offset >> PAGE_SHIFT;
-       mapping = inode->i_mapping;
-       page = read_mapping_page(mapping, n, NULL);
-       if (IS_ERR(page))
-               return page;
-       if (!PageUptodate(page)) {
-               put_page(page);
-               return ERR_PTR(-EIO);
-       }
-       lock_page(page);
-       return page;
- }
- /*
-  * Compare extents of two files to see if they are the same.
-  * Caller must have locked both inodes to prevent write races.
-  */
- int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
-                                 struct inode *dest, loff_t destoff,
-                                 loff_t len, bool *is_same)
- {
-       loff_t src_poff;
-       loff_t dest_poff;
-       void *src_addr;
-       void *dest_addr;
-       struct page *src_page;
-       struct page *dest_page;
-       loff_t cmp_len;
-       bool same;
-       int error;
-       error = -EINVAL;
-       same = true;
-       while (len) {
-               src_poff = srcoff & (PAGE_SIZE - 1);
-               dest_poff = destoff & (PAGE_SIZE - 1);
-               cmp_len = min(PAGE_SIZE - src_poff,
-                             PAGE_SIZE - dest_poff);
-               cmp_len = min(cmp_len, len);
-               if (cmp_len <= 0)
-                       goto out_error;
-               src_page = vfs_dedupe_get_page(src, srcoff);
-               if (IS_ERR(src_page)) {
-                       error = PTR_ERR(src_page);
-                       goto out_error;
-               }
-               dest_page = vfs_dedupe_get_page(dest, destoff);
-               if (IS_ERR(dest_page)) {
-                       error = PTR_ERR(dest_page);
-                       unlock_page(src_page);
-                       put_page(src_page);
-                       goto out_error;
-               }
-               src_addr = kmap_atomic(src_page);
-               dest_addr = kmap_atomic(dest_page);
-               flush_dcache_page(src_page);
-               flush_dcache_page(dest_page);
-               if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
-                       same = false;
-               kunmap_atomic(dest_addr);
-               kunmap_atomic(src_addr);
-               unlock_page(dest_page);
-               unlock_page(src_page);
-               put_page(dest_page);
-               put_page(src_page);
-               if (!same)
-                       break;
-               srcoff += cmp_len;
-               destoff += cmp_len;
-               len -= cmp_len;
-       }
-       *is_same = same;
-       return 0;
- out_error:
-       return error;
- }
- EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
 +/* Check whether we are allowed to dedupe the destination file */
 +static bool allow_file_dedupe(struct file *file)
 +{
 +      if (capable(CAP_SYS_ADMIN))
 +              return true;
 +      if (file->f_mode & FMODE_WRITE)
 +              return true;
 +      if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
 +              return true;
 +      if (!inode_permission(file_inode(file), MAY_WRITE))
 +              return true;
 +      return false;
 +}
 +
- int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
-                             struct file *dst_file, loff_t dst_pos, u64 len)
+ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+                                struct file *dst_file, loff_t dst_pos,
+                                loff_t len, unsigned int remap_flags)
  {
-       s64 ret;
+       loff_t ret;
+
+       WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+                                    REMAP_FILE_CAN_SHORTEN));
  
        ret = mnt_want_write_file(dst_file);
        if (ret)
                return ret;
  
-       ret = clone_verify_area(dst_file, dst_pos, len, true);
+       ret = remap_verify_area(dst_file, dst_pos, len, true);
        if (ret < 0)
                goto out_drop_write;
  
 -      ret = -EINVAL;
 -      if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
 +      ret = -EPERM;
 +      if (!allow_file_dedupe(dst_file))
                goto out_drop_write;
  
        ret = -EXDEV;
                goto out_drop_write;
  
        ret = -EINVAL;
-       if (!dst_file->f_op->dedupe_file_range)
+       if (!dst_file->f_op->remap_file_range)
                goto out_drop_write;
  
-       ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
-                                               dst_file, dst_pos, len);
+       if (len == 0) {
+               ret = 0;
+               goto out_drop_write;
+       }
+       ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
+                       dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
  out_drop_write:
        mnt_drop_write_file(dst_file);
  
@@@ -2037,7 -2070,7 +2083,7 @@@ int vfs_dedupe_file_range(struct file *
        int i;
        int ret;
        u16 count = same->dest_count;
-       int deduped;
+       loff_t deduped;
  
        if (!(file->f_mode & FMODE_READ))
                return -EINVAL;
        if (!S_ISREG(src->i_mode))
                goto out;
  
-       ret = clone_verify_area(file, off, len, false);
+       ret = remap_verify_area(file, off, len, false);
        if (ret < 0)
                goto out;
        ret = 0;
                }
  
                deduped = vfs_dedupe_file_range_one(file, off, dst_file,
-                                                   info->dest_offset, len);
+                                                   info->dest_offset, len,
+                                                   REMAP_FILE_CAN_SHORTEN);
                if (deduped == -EBADE)
                        info->status = FILE_DEDUPE_RANGE_DIFFERS;
                else if (deduped < 0)
diff --combined fs/xfs/xfs_reflink.c
@@@ -182,7 -182,8 +182,7 @@@ in
  xfs_reflink_trim_around_shared(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *irec,
 -      bool                    *shared,
 -      bool                    *trimmed)
 +      bool                    *shared)
  {
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
        if (error)
                return error;
  
 -      *shared = *trimmed = false;
 +      *shared = false;
        if (fbno == NULLAGBLOCK) {
                /* No shared blocks at all. */
                return 0;
                 */
                irec->br_blockcount = flen;
                *shared = true;
 -              if (flen != aglen)
 -                      *trimmed = true;
                return 0;
        } else {
                /*
                 * start of the shared region.
                 */
                irec->br_blockcount = fbno - agbno;
 -              *trimmed = true;
                return 0;
        }
  }
  /*
   * Trim the passed in imap to the next shared/unshared extent boundary, and
   * if imap->br_startoff points to a shared extent reserve space for it in the
 - * COW fork.  In this case *shared is set to true, else to false.
 + * COW fork.
   *
   * Note that imap will always contain the block numbers for the existing blocks
   * in the data fork, as the upper layers need them for read-modify-write
  int
  xfs_reflink_reserve_cow(
        struct xfs_inode        *ip,
 -      struct xfs_bmbt_irec    *imap,
 -      bool                    *shared)
 +      struct xfs_bmbt_irec    *imap)
  {
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
        struct xfs_bmbt_irec    got;
        int                     error = 0;
 -      bool                    eof = false, trimmed;
 +      bool                    eof = false;
        struct xfs_iext_cursor  icur;
 +      bool                    shared;
  
        /*
         * Search the COW fork extent list first.  This serves two purposes:
        if (!eof && got.br_startoff <= imap->br_startoff) {
                trace_xfs_reflink_cow_found(ip, imap);
                xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
 -
 -              *shared = true;
                return 0;
        }
  
        /* Trim the mapping to the nearest shared extent boundary. */
 -      error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
 +      error = xfs_reflink_trim_around_shared(ip, imap, &shared);
        if (error)
                return error;
  
        /* Not shared?  Just report the (potentially capped) extent. */
 -      if (!*shared)
 +      if (!shared)
                return 0;
  
        /*
@@@ -362,6 -368,7 +362,6 @@@ xfs_find_trim_cow_extent
        xfs_filblks_t           count_fsb = imap->br_blockcount;
        struct xfs_iext_cursor  icur;
        struct xfs_bmbt_irec    got;
 -      bool                    trimmed;
  
        *found = false;
  
         * If we don't find an overlapping extent, trim the range we need to
         * allocate to fit the hole we found.
         */
 -      if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
 -          got.br_startoff > offset_fsb)
 -              return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
 +      if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
 +              got.br_startoff = offset_fsb + count_fsb;
 +      if (got.br_startoff > offset_fsb) {
 +              xfs_trim_extent(imap, imap->br_startoff,
 +                              got.br_startoff - imap->br_startoff);
 +              return xfs_reflink_trim_around_shared(ip, imap, shared);
 +      }
  
        *shared = true;
        if (isnullstartblock(got.br_startblock)) {
@@@ -913,18 -916,18 +913,18 @@@ out_error
  /*
   * Update destination inode size & cowextsize hint, if necessary.
   */
- STATIC int
+ int
  xfs_reflink_update_dest(
        struct xfs_inode        *dest,
        xfs_off_t               newlen,
        xfs_extlen_t            cowextsize,
-       bool                    is_dedupe)
+       unsigned int            remap_flags)
  {
        struct xfs_mount        *mp = dest->i_mount;
        struct xfs_trans        *tp;
        int                     error;
  
-       if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+       if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
                return 0;
  
        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
                dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
        }
  
-       if (!is_dedupe) {
-               xfs_trans_ichgtime(tp, dest,
-                                  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       }
        xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
  
        error = xfs_trans_commit(tp);
  /*
   * Iteratively remap one file's extents (and holes) to another's.
   */
- STATIC int
+ int
  xfs_reflink_remap_blocks(
        struct xfs_inode        *src,
-       xfs_fileoff_t           srcoff,
+       loff_t                  pos_in,
        struct xfs_inode        *dest,
-       xfs_fileoff_t           destoff,
-       xfs_filblks_t           len,
-       xfs_off_t               new_isize)
+       loff_t                  pos_out,
+       loff_t                  remap_len,
+       loff_t                  *remapped)
  {
        struct xfs_bmbt_irec    imap;
+       xfs_fileoff_t           srcoff;
+       xfs_fileoff_t           destoff;
+       xfs_filblks_t           len;
+       xfs_filblks_t           range_len;
+       xfs_filblks_t           remapped_len = 0;
+       xfs_off_t               new_isize = pos_out + remap_len;
        int                     nimaps;
        int                     error = 0;
-       xfs_filblks_t           range_len;
+
+       destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
+       srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
+       len = XFS_B_TO_FSB(src->i_mount, remap_len);
  
        /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
        while (len) {
                error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
                xfs_iunlock(src, lock_mode);
                if (error)
-                       goto err;
+                       break;
                ASSERT(nimaps == 1);
  
                trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
                error = xfs_reflink_remap_extent(dest, &imap, destoff,
                                new_isize);
                if (error)
-                       goto err;
+                       break;
  
                if (fatal_signal_pending(current)) {
                        error = -EINTR;
-                       goto err;
+                       break;
                }
  
                /* Advance drange/srange */
                srcoff += range_len;
                destoff += range_len;
                len -= range_len;
+               remapped_len += range_len;
        }
  
-       return 0;
- err:
-       trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+       if (error)
+               trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+       *remapped = min_t(loff_t, remap_len,
+                         XFS_FSB_TO_B(src->i_mount, remapped_len));
        return error;
  }
  
@@@ -1218,7 -1227,7 +1224,7 @@@ retry
  }
  
  /* Unlock both inodes after they've been prepped for a range clone. */
- STATIC void
+ void
  xfs_reflink_remap_unlock(
        struct file             *file_in,
        struct file             *file_out)
@@@ -1286,21 -1295,20 +1292,20 @@@ xfs_reflink_zero_posteof
   * stale data in the destination file. Hence we reject these clone attempts with
   * -EINVAL in this case.
   */
- STATIC int
+ int
  xfs_reflink_remap_prep(
        struct file             *file_in,
        loff_t                  pos_in,
        struct file             *file_out,
        loff_t                  pos_out,
-       u64                     *len,
-       bool                    is_dedupe)
+       loff_t                  *len,
+       unsigned int            remap_flags)
  {
        struct inode            *inode_in = file_inode(file_in);
        struct xfs_inode        *src = XFS_I(inode_in);
        struct inode            *inode_out = file_inode(file_out);
        struct xfs_inode        *dest = XFS_I(inode_out);
        bool                    same_inode = (inode_in == inode_out);
-       u64                     blkmask = i_blocksize(inode_in) - 1;
        ssize_t                 ret;
  
        /* Lock both files against IO */
        if (IS_DAX(inode_in) || IS_DAX(inode_out))
                goto out_unlock;
  
-       ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
-                       len, is_dedupe);
-       if (ret <= 0)
+       ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+                       len, remap_flags);
+       if (ret < 0 || *len == 0)
                goto out_unlock;
  
-       /*
-        * If the dedupe data matches, chop off the partial EOF block
-        * from the source file so we don't try to dedupe the partial
-        * EOF block.
-        */
-       if (is_dedupe) {
-               *len &= ~blkmask;
-       } else if (*len & blkmask) {
-               /*
-                * The user is attempting to share a partial EOF block,
-                * if it's inside the destination EOF then reject it.
-                */
-               if (pos_out + *len < i_size_read(inode_out)) {
-                       ret = -EINVAL;
-                       goto out_unlock;
-               }
-       }
        /* Attach dquots to dest inode before changing block map */
        ret = xfs_qm_dqattach(dest);
        if (ret)
                goto out_unlock;
  
        /* Zap any page cache for the destination file's range. */
-       truncate_inode_pages_range(&inode_out->i_data, pos_out,
-                                  PAGE_ALIGN(pos_out + *len) - 1);
-       /* If we're altering the file contents... */
-       if (!is_dedupe) {
-               /*
-                * ...update the timestamps (which will grab the ilock again
-                * from xfs_fs_dirty_inode, so we have to call it before we
-                * take the ilock).
-                */
-               if (!(file_out->f_mode & FMODE_NOCMTIME)) {
-                       ret = file_update_time(file_out);
-                       if (ret)
-                               goto out_unlock;
-               }
-               /*
-                * ...clear the security bits if the process is not being run
-                * by root.  This keeps people from modifying setuid and setgid
-                * binaries.
-                */
-               ret = file_remove_privs(file_out);
-               if (ret)
-                       goto out_unlock;
-       }
+       truncate_inode_pages_range(&inode_out->i_data,
+                       round_down(pos_out, PAGE_SIZE),
+                       round_up(pos_out + *len, PAGE_SIZE) - 1);
  
        return 1;
  out_unlock:
  }
  
  /*
-  * Link a range of blocks from one file to another.
-  */
- int
- xfs_reflink_remap_range(
-       struct file             *file_in,
-       loff_t                  pos_in,
-       struct file             *file_out,
-       loff_t                  pos_out,
-       u64                     len,
-       bool                    is_dedupe)
- {
-       struct inode            *inode_in = file_inode(file_in);
-       struct xfs_inode        *src = XFS_I(inode_in);
-       struct inode            *inode_out = file_inode(file_out);
-       struct xfs_inode        *dest = XFS_I(inode_out);
-       struct xfs_mount        *mp = src->i_mount;
-       xfs_fileoff_t           sfsbno, dfsbno;
-       xfs_filblks_t           fsblen;
-       xfs_extlen_t            cowextsize;
-       ssize_t                 ret;
-       if (!xfs_sb_version_hasreflink(&mp->m_sb))
-               return -EOPNOTSUPP;
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
-       /* Prepare and then clone file data. */
-       ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
-                       &len, is_dedupe);
-       if (ret <= 0)
-               return ret;
-       trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-       dfsbno = XFS_B_TO_FSBT(mp, pos_out);
-       sfsbno = XFS_B_TO_FSBT(mp, pos_in);
-       fsblen = XFS_B_TO_FSB(mp, len);
-       ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
-                       pos_out + len);
-       if (ret)
-               goto out_unlock;
-       /*
-        * Carry the cowextsize hint from src to dest if we're sharing the
-        * entire source file to the entire destination file, the source file
-        * has a cowextsize hint, and the destination file does not.
-        */
-       cowextsize = 0;
-       if (pos_in == 0 && len == i_size_read(inode_in) &&
-           (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
-           pos_out == 0 && len >= i_size_read(inode_out) &&
-           !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
-               cowextsize = src->i_d.di_cowextsize;
-       ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
-                       is_dedupe);
- out_unlock:
-       xfs_reflink_remap_unlock(file_in, file_out);
-       if (ret)
-               trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
-       return ret;
- }
- /*
   * The user wants to preemptively CoW all shared blocks in this file,
   * which enables us to turn off the reflink flag.  Iterate all
   * extents which are not prealloc/delalloc to see which ranges are
diff --combined fs/xfs/xfs_reflink.h
@@@ -10,10 -10,10 +10,10 @@@ extern int xfs_reflink_find_shared(stru
                xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
                xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 -              struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
 +              struct xfs_bmbt_irec *irec, bool *shared);
  
  extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
 -              struct xfs_bmbt_irec *imap, bool *shared);
 +              struct xfs_bmbt_irec *imap);
  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
                struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
  extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
@@@ -27,13 -27,24 +27,24 @@@ extern int xfs_reflink_cancel_cow_range
  extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
  extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
- extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
-               struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+ extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+               struct file *file_out, loff_t pos_out, loff_t len,
+               unsigned int remap_flags);
  extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
                struct xfs_inode *ip, bool *has_shared);
  extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
                struct xfs_trans **tpp);
  extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t len);
+ extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
+               struct file *file_out, loff_t pos_out, loff_t *len,
+               unsigned int remap_flags);
+ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
+               struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
+               loff_t *remapped);
+ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
+               xfs_extlen_t cowextsize, unsigned int remap_flags);
+ extern void xfs_reflink_remap_unlock(struct file *file_in,
+               struct file *file_out);
  
  #endif /* __XFS_REFLINK_H */
diff --combined include/linux/fs.h
@@@ -403,40 -403,24 +403,40 @@@ int pagecache_write_end(struct file *, 
                                loff_t pos, unsigned len, unsigned copied,
                                struct page *page, void *fsdata);
  
 +/**
 + * struct address_space - Contents of a cacheable, mappable object.
 + * @host: Owner, either the inode or the block_device.
 + * @i_pages: Cached pages.
 + * @gfp_mask: Memory allocation flags to use for allocating pages.
 + * @i_mmap_writable: Number of VM_SHARED mappings.
 + * @i_mmap: Tree of private and shared mappings.
 + * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 + * @nrpages: Number of page entries, protected by the i_pages lock.
 + * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock.
 + * @writeback_index: Writeback starts here.
 + * @a_ops: Methods.
 + * @flags: Error bits and flags (AS_*).
 + * @wb_err: The most recent error which has occurred.
 + * @private_lock: For use by the owner of the address_space.
 + * @private_list: For use by the owner of the address_space.
 + * @private_data: For use by the owner of the address_space.
 + */
  struct address_space {
 -      struct inode            *host;          /* owner: inode, block_device */
 -      struct radix_tree_root  i_pages;        /* cached pages */
 -      atomic_t                i_mmap_writable;/* count VM_SHARED mappings */
 -      struct rb_root_cached   i_mmap;         /* tree of private and shared mappings */
 -      struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
 -      /* Protected by the i_pages lock */
 -      unsigned long           nrpages;        /* number of total pages */
 -      /* number of shadow or DAX exceptional entries */
 +      struct inode            *host;
 +      struct xarray           i_pages;
 +      gfp_t                   gfp_mask;
 +      atomic_t                i_mmap_writable;
 +      struct rb_root_cached   i_mmap;
 +      struct rw_semaphore     i_mmap_rwsem;
 +      unsigned long           nrpages;
        unsigned long           nrexceptional;
 -      pgoff_t                 writeback_index;/* writeback starts here */
 -      const struct address_space_operations *a_ops;   /* methods */
 -      unsigned long           flags;          /* error bits */
 -      spinlock_t              private_lock;   /* for use by the address_space */
 -      gfp_t                   gfp_mask;       /* implicit gfp mask for allocations */
 -      struct list_head        private_list;   /* for use by the address_space */
 -      void                    *private_data;  /* ditto */
 +      pgoff_t                 writeback_index;
 +      const struct address_space_operations *a_ops;
 +      unsigned long           flags;
        errseq_t                wb_err;
 +      spinlock_t              private_lock;
 +      struct list_head        private_list;
 +      void                    *private_data;
  } __attribute__((aligned(sizeof(long)))) __randomize_layout;
        /*
         * On most architectures that alignment is already the case; but
@@@ -483,18 -467,15 +483,18 @@@ struct block_device 
        struct mutex            bd_fsfreeze_mutex;
  } __randomize_layout;
  
 +/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
 +#define PAGECACHE_TAG_DIRTY   XA_MARK_0
 +#define PAGECACHE_TAG_WRITEBACK       XA_MARK_1
 +#define PAGECACHE_TAG_TOWRITE XA_MARK_2
 +
  /*
 - * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
 - * radix trees
 + * Returns true if any of the pages in the mapping are marked with the tag.
   */
 -#define PAGECACHE_TAG_DIRTY   0
 -#define PAGECACHE_TAG_WRITEBACK       1
 -#define PAGECACHE_TAG_TOWRITE 2
 -
 -int mapping_tagged(struct address_space *mapping, int tag);
 +static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
 +{
 +      return xa_marked(&mapping->i_pages, tag);
 +}
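
With the marks stored in the XArray, the test is a lock-free flag
check.  As a sketch of a typical (hypothetical) caller, a writeback
path can bail out early when a mapping has no dirty pages:

	if (!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;	/* nothing dirty, nothing to write */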
  
  static inline void i_mmap_lock_write(struct address_space *mapping)
  {
@@@ -1412,26 -1393,17 +1412,26 @@@ struct super_block 
  
        struct sb_writers       s_writers;
  
 +      /*
 +       * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
 +       * s_fsnotify_marks together for cache efficiency. They are frequently
 +       * accessed and rarely modified.
 +       */
 +      void                    *s_fs_info;     /* Filesystem private info */
 +
 +      /* Granularity of c/m/atime in ns (cannot be worse than a second) */
 +      u32                     s_time_gran;
 +#ifdef CONFIG_FSNOTIFY
 +      __u32                   s_fsnotify_mask;
 +      struct fsnotify_mark_connector __rcu    *s_fsnotify_marks;
 +#endif
 +
        char                    s_id[32];       /* Informational name */
        uuid_t                  s_uuid;         /* UUID */
  
 -      void                    *s_fs_info;     /* Filesystem private info */
        unsigned int            s_max_links;
        fmode_t                 s_mode;
  
 -      /* Granularity of c/m/atime in ns.
 -         Cannot be worse than a second */
 -      u32                s_time_gran;
 -
        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;
  
 +      /* Pending fsnotify inode refs */
 +      atomic_long_t s_fsnotify_inode_refs;
 +
        /* Being remounted read-only */
        int s_readonly_remount;
  
@@@ -1752,6 -1721,25 +1752,25 @@@ struct block_device_operations
  #define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)
  
+ /*
+  * These flags control the behavior of the remap_file_range function pointer.
+  * If it is called with len == 0 that means "remap to the end of the source
+  * file".  See Documentation/filesystems/vfs.txt for more details about this
+  * call.
+  *
+  * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
+  * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
+  */
+ #define REMAP_FILE_DEDUP              (1 << 0)
+ #define REMAP_FILE_CAN_SHORTEN                (1 << 1)
+ /*
+  * These flags signal that the caller is ok with altering various aspects of
+  * the behavior of the remap operation.  The changes must be made by the
+  * implementation; the vfs remap helper functions can take advantage of them.
+  * Flags in this category exist to preserve the quirky behavior of the hoisted
+  * btrfs clone/dedupe ioctls.
+  */
+ #define REMAP_FILE_ADVISORY           (REMAP_FILE_CAN_SHORTEN)
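
An implementation is expected to reject flags it does not understand,
so a converted ->remap_file_range method would typically begin with a
check along these lines (a sketch of the usual pattern):

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;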
  
  struct iov_iter;
  
@@@ -1790,10 -1778,9 +1809,9 @@@ struct file_operations 
  #endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
-       int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
-                       u64);
-       int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
-                       u64);
+       loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
+                                  struct file *file_out, loff_t pos_out,
+                                  loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
  } __randomize_layout;
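
Converted filesystems now wire up a single method where two existed
before.  A sketch, with myfs_* as hypothetical names:

	const struct file_operations myfs_file_operations = {
		/* read/write/mmap methods elided */
		.remap_file_range	= myfs_remap_file_range, /* clone + dedupe */
	};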
  
@@@ -1856,21 -1843,21 +1874,21 @@@ extern ssize_t vfs_readv(struct file *
                unsigned long, loff_t *, rwf_t);
  extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
- extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
-                                     struct inode *inode_out, loff_t pos_out,
-                                     u64 *len, bool is_dedupe);
- extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
-                              struct file *file_out, loff_t pos_out, u64 len);
- extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-                               struct file *file_out, loff_t pos_out, u64 len);
- extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
-                                        struct inode *dest, loff_t destoff,
-                                        loff_t len, bool *is_same);
+ extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
+                                        struct file *file_out, loff_t pos_out,
+                                        loff_t *count,
+                                        unsigned int remap_flags);
+ extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
+                                 struct file *file_out, loff_t pos_out,
+                                 loff_t len, unsigned int remap_flags);
+ extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+                                  struct file *file_out, loff_t pos_out,
+                                  loff_t len, unsigned int remap_flags);
  extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
- extern int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
-                                    struct file *dst_file, loff_t dst_pos,
-                                    u64 len);
+ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+                                       struct file *dst_file, loff_t dst_pos,
+                                       loff_t len, unsigned int remap_flags);
  
  
  struct super_operations {
@@@ -2998,6 -2985,9 +3016,9 @@@ extern int sb_min_blocksize(struct supe
  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
+ extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
+                               struct file *file_out, loff_t pos_out,
+                               loff_t *count, unsigned int remap_flags);
  extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
  extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
  extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --combined mm/filemap.c
@@@ -36,8 -36,6 +36,8 @@@
  #include <linux/cleancache.h>
  #include <linux/shmem_fs.h>
  #include <linux/rmap.h>
 +#include <linux/delayacct.h>
 +#include <linux/psi.h>
  #include "internal.h"
  
  #define CREATE_TRACE_POINTS
   *   ->tasklist_lock            (memory_failure, collect_procs_ao)
   */
  
 -static int page_cache_tree_insert(struct address_space *mapping,
 -                                struct page *page, void **shadowp)
 -{
 -      struct radix_tree_node *node;
 -      void **slot;
 -      int error;
 -
 -      error = __radix_tree_create(&mapping->i_pages, page->index, 0,
 -                                  &node, &slot);
 -      if (error)
 -              return error;
 -      if (*slot) {
 -              void *p;
 -
 -              p = radix_tree_deref_slot_protected(slot,
 -                                                  &mapping->i_pages.xa_lock);
 -              if (!radix_tree_exceptional_entry(p))
 -                      return -EEXIST;
 -
 -              mapping->nrexceptional--;
 -              if (shadowp)
 -                      *shadowp = p;
 -      }
 -      __radix_tree_replace(&mapping->i_pages, node, slot, page,
 -                           workingset_lookup_update(mapping));
 -      mapping->nrpages++;
 -      return 0;
 -}
 -
 -static void page_cache_tree_delete(struct address_space *mapping,
 +static void page_cache_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
  {
 -      int i, nr;
 +      XA_STATE(xas, &mapping->i_pages, page->index);
 +      unsigned int nr = 1;
 +
 +      mapping_set_update(&xas, mapping);
  
 -      /* hugetlb pages are represented by one entry in the radix tree */
 -      nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
 +      /* hugetlb pages are represented by a single entry in the xarray */
 +      if (!PageHuge(page)) {
 +              xas_set_order(&xas, page->index, compound_order(page));
 +              nr = 1U << compound_order(page);
 +      }
  
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(nr != 1 && shadow, page);
  
 -      for (i = 0; i < nr; i++) {
 -              struct radix_tree_node *node;
 -              void **slot;
 -
 -              __radix_tree_lookup(&mapping->i_pages, page->index + i,
 -                                  &node, &slot);
 -
 -              VM_BUG_ON_PAGE(!node && nr != 1, page);
 -
 -              radix_tree_clear_tags(&mapping->i_pages, node, slot);
 -              __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
 -                              workingset_lookup_update(mapping));
 -      }
 +      xas_store(&xas, shadow);
 +      xas_init_marks(&xas);
  
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
@@@ -231,7 -263,7 +231,7 @@@ void __delete_from_page_cache(struct pa
        trace_mm_filemap_delete_from_page_cache(page);
  
        unaccount_page_cache_page(mapping, page);
 -      page_cache_tree_delete(mapping, page, shadow);
 +      page_cache_delete(mapping, page, shadow);
  }
  
  static void page_cache_free_page(struct address_space *mapping,
@@@ -274,7 -306,7 +274,7 @@@ void delete_from_page_cache(struct pag
  EXPORT_SYMBOL(delete_from_page_cache);
  
  /*
 - * page_cache_tree_delete_batch - delete several pages from page cache
 + * page_cache_delete_batch - delete several pages from page cache
   * @mapping: the mapping to which pages belong
   * @pvec: pagevec with pages to delete
   *
   *
   * The function expects the i_pages lock to be held.
   */
 -static void
 -page_cache_tree_delete_batch(struct address_space *mapping,
 +static void page_cache_delete_batch(struct address_space *mapping,
                             struct pagevec *pvec)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
        int total_pages = 0;
        int i = 0, tail_pages = 0;
        struct page *page;
 -      pgoff_t start;
  
 -      start = pvec->pages[0]->index;
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
 +      mapping_set_update(&xas, mapping);
 +      xas_for_each(&xas, page, ULONG_MAX) {
                if (i >= pagevec_count(pvec) && !tail_pages)
                        break;
 -              page = radix_tree_deref_slot_protected(slot,
 -                                                     &mapping->i_pages.xa_lock);
 -              if (radix_tree_exceptional_entry(page))
 +              if (xa_is_value(page))
                        continue;
                if (!tail_pages) {
                        /*
                         * have our pages locked so they are protected from
                         * being removed.
                         */
 -                      if (page != pvec->pages[i])
 +                      if (page != pvec->pages[i]) {
 +                              VM_BUG_ON_PAGE(page->index >
 +                                              pvec->pages[i]->index, page);
                                continue;
 +                      }
                        WARN_ON_ONCE(!PageLocked(page));
                        if (PageTransHuge(page) && !PageHuge(page))
                                tail_pages = HPAGE_PMD_NR - 1;
                         */
                        i++;
                } else {
 +                      VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
 +                                      != pvec->pages[i]->index, page);
                        tail_pages--;
                }
 -              radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
 -              __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
 -                              workingset_lookup_update(mapping));
 +              xas_store(&xas, NULL);
                total_pages++;
        }
        mapping->nrpages -= total_pages;
@@@ -347,7 -381,7 +347,7 @@@ void delete_from_page_cache_batch(struc
  
                unaccount_page_cache_page(mapping, pvec->pages[i]);
        }
 -      page_cache_tree_delete_batch(mapping, pvec);
 +      page_cache_delete_batch(mapping, pvec);
        xa_unlock_irqrestore(&mapping->i_pages, flags);
  
        for (i = 0; i < pagevec_count(pvec); i++)
@@@ -457,31 -491,20 +457,31 @@@ EXPORT_SYMBOL(filemap_flush)
  bool filemap_range_has_page(struct address_space *mapping,
                           loff_t start_byte, loff_t end_byte)
  {
 -      pgoff_t index = start_byte >> PAGE_SHIFT;
 -      pgoff_t end = end_byte >> PAGE_SHIFT;
        struct page *page;
 +      XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
 +      pgoff_t max = end_byte >> PAGE_SHIFT;
  
        if (end_byte < start_byte)
                return false;
  
 -      if (mapping->nrpages == 0)
 -              return false;
 +      rcu_read_lock();
 +      for (;;) {
 +              page = xas_find(&xas, max);
 +              if (xas_retry(&xas, page))
 +                      continue;
 +              /* Shadow entries don't count */
 +              if (xa_is_value(page))
 +                      continue;
 +              /*
 +               * We don't need to try to pin this page; we're about to
 +               * release the RCU lock anyway.  It is enough to know that
 +               * there was a page here recently.
 +               */
 +              break;
 +      }
 +      rcu_read_unlock();
  
 -      if (!find_get_pages_range(mapping, &index, end, 1, &page))
 -              return false;
 -      put_page(page);
 -      return true;
 +      return page != NULL;
  }
  EXPORT_SYMBOL(filemap_range_has_page);
  
@@@ -752,44 -775,51 +752,44 @@@ EXPORT_SYMBOL(file_write_and_wait_range
   * locked.  This function does not add the new page to the LRU, the
   * caller must do that.
   *
 - * The remove + add is atomic.  The only way this function can fail is
 - * memory allocation failure.
 + * The remove + add is atomic.  This function cannot fail.
   */
  int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
  {
 -      int error;
 +      struct address_space *mapping = old->mapping;
 +      void (*freepage)(struct page *) = mapping->a_ops->freepage;
 +      pgoff_t offset = old->index;
 +      XA_STATE(xas, &mapping->i_pages, offset);
 +      unsigned long flags;
  
        VM_BUG_ON_PAGE(!PageLocked(old), old);
        VM_BUG_ON_PAGE(!PageLocked(new), new);
        VM_BUG_ON_PAGE(new->mapping, new);
  
 -      error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
 -      if (!error) {
 -              struct address_space *mapping = old->mapping;
 -              void (*freepage)(struct page *);
 -              unsigned long flags;
 -
 -              pgoff_t offset = old->index;
 -              freepage = mapping->a_ops->freepage;
 -
 -              get_page(new);
 -              new->mapping = mapping;
 -              new->index = offset;
 +      get_page(new);
 +      new->mapping = mapping;
 +      new->index = offset;
  
 -              xa_lock_irqsave(&mapping->i_pages, flags);
 -              __delete_from_page_cache(old, NULL);
 -              error = page_cache_tree_insert(mapping, new, NULL);
 -              BUG_ON(error);
 +      xas_lock_irqsave(&xas, flags);
 +      xas_store(&xas, new);
  
 -              /*
 -               * hugetlb pages do not participate in page cache accounting.
 -               */
 -              if (!PageHuge(new))
 -                      __inc_node_page_state(new, NR_FILE_PAGES);
 -              if (PageSwapBacked(new))
 -                      __inc_node_page_state(new, NR_SHMEM);
 -              xa_unlock_irqrestore(&mapping->i_pages, flags);
 -              mem_cgroup_migrate(old, new);
 -              radix_tree_preload_end();
 -              if (freepage)
 -                      freepage(old);
 -              put_page(old);
 -      }
 +      old->mapping = NULL;
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!PageHuge(old))
 +              __dec_node_page_state(new, NR_FILE_PAGES);
 +      if (!PageHuge(new))
 +              __inc_node_page_state(new, NR_FILE_PAGES);
 +      if (PageSwapBacked(old))
 +              __dec_node_page_state(new, NR_SHMEM);
 +      if (PageSwapBacked(new))
 +              __inc_node_page_state(new, NR_SHMEM);
 +      xas_unlock_irqrestore(&xas, flags);
 +      mem_cgroup_migrate(old, new);
 +      if (freepage)
 +              freepage(old);
 +      put_page(old);
  
 -      return error;
 +      return 0;
  }
  EXPORT_SYMBOL_GPL(replace_page_cache_page);
  
@@@ -798,15 -828,12 +798,15 @@@ static int __add_to_page_cache_locked(s
                                      pgoff_t offset, gfp_t gfp_mask,
                                      void **shadowp)
  {
 +      XA_STATE(xas, &mapping->i_pages, offset);
        int huge = PageHuge(page);
        struct mem_cgroup *memcg;
        int error;
 +      void *old;
  
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageSwapBacked(page), page);
 +      mapping_set_update(&xas, mapping);
  
        if (!huge) {
                error = mem_cgroup_try_charge(page, current->mm,
                        return error;
        }
  
 -      error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
 -      if (error) {
 -              if (!huge)
 -                      mem_cgroup_cancel_charge(page, memcg, false);
 -              return error;
 -      }
 -
        get_page(page);
        page->mapping = mapping;
        page->index = offset;
  
 -      xa_lock_irq(&mapping->i_pages);
 -      error = page_cache_tree_insert(mapping, page, shadowp);
 -      radix_tree_preload_end();
 -      if (unlikely(error))
 -              goto err_insert;
 +      do {
 +              xas_lock_irq(&xas);
 +              old = xas_load(&xas);
 +              if (old && !xa_is_value(old))
 +                      xas_set_err(&xas, -EEXIST);
 +              xas_store(&xas, page);
 +              if (xas_error(&xas))
 +                      goto unlock;
 +
 +              if (xa_is_value(old)) {
 +                      mapping->nrexceptional--;
 +                      if (shadowp)
 +                              *shadowp = old;
 +              }
 +              mapping->nrpages++;
 +
 +              /* hugetlb pages do not participate in page cache accounting */
 +              if (!huge)
 +                      __inc_node_page_state(page, NR_FILE_PAGES);
 +unlock:
 +              xas_unlock_irq(&xas);
 +      } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
 +
 +      if (xas_error(&xas))
 +              goto error;
  
 -      /* hugetlb pages do not participate in page cache accounting. */
 -      if (!huge)
 -              __inc_node_page_state(page, NR_FILE_PAGES);
 -      xa_unlock_irq(&mapping->i_pages);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false, false);
        trace_mm_filemap_add_to_page_cache(page);
        return 0;
 -err_insert:
 +error:
        page->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
 -      xa_unlock_irq(&mapping->i_pages);
        if (!huge)
                mem_cgroup_cancel_charge(page, memcg, false);
        put_page(page);
 -      return error;
 +      return xas_error(&xas);
  }
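
The store loop above is the standard XArray insertion idiom: attempt
the store under the lock, and if the operation ran out of memory let
xas_nomem() allocate outside the lock and retry.  Stripped to its
skeleton (a sketch; gfp stands for the caller's allocation mask):

	do {
		xas_lock_irq(&xas);
		/* xas_load()/xas_store() work goes here */
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));	/* allocates and retries on ENOMEM */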
  
  /**
@@@ -896,9 -915,12 +896,9 @@@ int add_to_page_cache_lru(struct page *
                 * data from the working set, only to cache data that will
                 * get overwritten with something else, is a waste of memory.
                 */
 -              if (!(gfp_mask & __GFP_WRITE) &&
 -                  shadow && workingset_refault(shadow)) {
 -                      SetPageActive(page);
 -                      workingset_activation(page);
 -              } else
 -                      ClearPageActive(page);
 +              WARN_ON_ONCE(PageActive(page));
 +              if (!(gfp_mask & __GFP_WRITE) && shadow)
 +                      workingset_refault(page, shadow);
                lru_cache_add(page);
        }
        return ret;
@@@ -1054,18 -1076,8 +1054,18 @@@ static inline int wait_on_page_bit_comm
  {
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
 +      bool thrashing = false;
 +      unsigned long pflags;
        int ret = 0;
  
 +      if (bit_nr == PG_locked &&
 +          !PageUptodate(page) && PageWorkingset(page)) {
 +              if (!PageSwapBacked(page))
 +                      delayacct_thrashing_start();
 +              psi_memstall_enter(&pflags);
 +              thrashing = true;
 +      }
 +
        init_wait(wait);
        wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
        wait->func = wake_page_function;
  
        finish_wait(q, wait);
  
 +      if (thrashing) {
 +              if (!PageSwapBacked(page))
 +                      delayacct_thrashing_end();
 +              psi_memstall_leave(&pflags);
 +      }
 +
        /*
         * A signal could leave PageWaiters set. Clearing it here if
         * !waitqueue_active would be possible (by open-coding finish_wait),
@@@ -1320,76 -1326,86 +1320,76 @@@ int __lock_page_or_retry(struct page *p
  }
  
  /**
 - * page_cache_next_hole - find the next hole (not-present entry)
 - * @mapping: mapping
 - * @index: index
 - * @max_scan: maximum range to search
 - *
 - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
 - * lowest indexed hole.
 - *
 - * Returns: the index of the hole if found, otherwise returns an index
 - * outside of the set specified (in which case 'return - index >=
 - * max_scan' will be true). In rare cases of index wrap-around, 0 will
 - * be returned.
 - *
 - * page_cache_next_hole may be called under rcu_read_lock. However,
 - * like radix_tree_gang_lookup, this will not atomically search a
 - * snapshot of the tree at a single point in time. For example, if a
 - * hole is created at index 5, then subsequently a hole is created at
 - * index 10, page_cache_next_hole covering both indexes may return 10
 - * if called under rcu_read_lock.
 + * page_cache_next_miss() - Find the next gap in the page cache.
 + * @mapping: Mapping.
 + * @index: Index.
 + * @max_scan: Maximum range to search.
 + *
 + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 + * gap with the lowest index.
 + *
 + * This function may be called under the rcu_read_lock.  However, this will
 + * not atomically search a snapshot of the cache at a single point in time.
 + * For example, if a gap is created at index 5, then subsequently a gap is
 + * created at index 10, page_cache_next_miss covering both indices may
 + * return 10 if called under the rcu_read_lock.
 + *
 + * Return: The index of the gap if found, otherwise an index outside the
 + * range specified (in which case 'return - index >= max_scan' will be true).
 + * In the rare case of index wrap-around, 0 will be returned.
   */
 -pgoff_t page_cache_next_hole(struct address_space *mapping,
 +pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
  {
 -      unsigned long i;
 -
 -      for (i = 0; i < max_scan; i++) {
 -              struct page *page;
 +      XA_STATE(xas, &mapping->i_pages, index);
  
 -              page = radix_tree_lookup(&mapping->i_pages, index);
 -              if (!page || radix_tree_exceptional_entry(page))
 +      while (max_scan--) {
 +              void *entry = xas_next(&xas);
 +              if (!entry || xa_is_value(entry))
                        break;
 -              index++;
 -              if (index == 0)
 +              if (xas.xa_index == 0)
                        break;
        }
  
 -      return index;
 +      return xas.xa_index;
  }
 -EXPORT_SYMBOL(page_cache_next_hole);
 +EXPORT_SYMBOL(page_cache_next_miss);
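
A usage sketch (all names hypothetical): measuring how many consecutive
pages are already cached starting at index:

	pgoff_t gap = page_cache_next_miss(mapping, index, max_scan);
	unsigned long cached = gap - index;	/* >= max_scan if no gap found */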
  
  /**
 - * page_cache_prev_hole - find the prev hole (not-present entry)
 - * @mapping: mapping
 - * @index: index
 - * @max_scan: maximum range to search
 - *
 - * Search backwards in the range [max(index-max_scan+1, 0), index] for
 - * the first hole.
 - *
 - * Returns: the index of the hole if found, otherwise returns an index
 - * outside of the set specified (in which case 'index - return >=
 - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
 - * will be returned.
 - *
 - * page_cache_prev_hole may be called under rcu_read_lock. However,
 - * like radix_tree_gang_lookup, this will not atomically search a
 - * snapshot of the tree at a single point in time. For example, if a
 - * hole is created at index 10, then subsequently a hole is created at
 - * index 5, page_cache_prev_hole covering both indexes may return 5 if
 - * called under rcu_read_lock.
 + * page_cache_prev_miss() - Find the previous gap in the page cache.
 + * @mapping: Mapping.
 + * @index: Index.
 + * @max_scan: Maximum range to search.
 + *
 + * Search the range [max(index - max_scan + 1, 0), index] for the
 + * gap with the highest index.
 + *
 + * This function may be called under the rcu_read_lock.  However, this will
 + * not atomically search a snapshot of the cache at a single point in time.
 + * For example, if a gap is created at index 10, then subsequently a gap is
 + * created at index 5, page_cache_prev_miss() covering both indices may
 + * return 5 if called under the rcu_read_lock.
 + *
 + * Return: The index of the gap if found, otherwise an index outside the
 + * range specified (in which case 'index - return >= max_scan' will be true).
 + * In the rare case of wrap-around, ULONG_MAX will be returned.
   */
 -pgoff_t page_cache_prev_hole(struct address_space *mapping,
 +pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
  {
 -      unsigned long i;
 -
 -      for (i = 0; i < max_scan; i++) {
 -              struct page *page;
 +      XA_STATE(xas, &mapping->i_pages, index);
  
 -              page = radix_tree_lookup(&mapping->i_pages, index);
 -              if (!page || radix_tree_exceptional_entry(page))
 +      while (max_scan--) {
 +              void *entry = xas_prev(&xas);
 +              if (!entry || xa_is_value(entry))
                        break;
 -              index--;
 -              if (index == ULONG_MAX)
 +              if (xas.xa_index == ULONG_MAX)
                        break;
        }
  
 -      return index;
 +      return xas.xa_index;
  }
 -EXPORT_SYMBOL(page_cache_prev_hole);
 +EXPORT_SYMBOL(page_cache_prev_miss);
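
The backwards search obeys the mirrored contract.  A hedged sketch along the
same lines (last_gap() is again illustrative):

	static pgoff_t last_gap(struct address_space *mapping,
				pgoff_t index, unsigned long max_scan)
	{
		pgoff_t gap = page_cache_prev_miss(mapping, index, max_scan);

		/* Mirrored contract: "not found" iff index - gap >= max_scan. */
		if (index - gap >= max_scan)
			return ULONG_MAX;
		return gap;
	}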
  
  /**
   * find_get_entry - find and get a page cache entry
   */
  struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
  {
 -      void **pagep;
 +      XA_STATE(xas, &mapping->i_pages, offset);
        struct page *head, *page;
  
        rcu_read_lock();
  repeat:
 -      page = NULL;
 -      pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
 -      if (pagep) {
 -              page = radix_tree_deref_slot(pagep);
 -              if (unlikely(!page))
 -                      goto out;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page))
 -                              goto repeat;
 -                      /*
 -                       * A shadow entry of a recently evicted page,
 -                       * or a swap entry from shmem/tmpfs.  Return
 -                       * it without attempting to raise page count.
 -                       */
 -                      goto out;
 -              }
 +      xas_reset(&xas);
 +      page = xas_load(&xas);
 +      if (xas_retry(&xas, page))
 +              goto repeat;
 +      /*
 +       * A shadow entry of a recently evicted page, or a swap entry from
 +       * shmem/tmpfs.  Return it without attempting to raise page count.
 +       */
 +      if (!page || xa_is_value(page))
 +              goto out;
  
 -              head = compound_head(page);
 -              if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +      head = compound_head(page);
 +      if (!page_cache_get_speculative(head))
 +              goto repeat;
  
 -              /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +      /* The page was split under us? */
 +      if (compound_head(page) != head) {
 +              put_page(head);
 +              goto repeat;
 +      }
  
 -              /*
 -               * Has the page moved?
 -               * This is part of the lockless pagecache protocol. See
 -               * include/linux/pagemap.h for details.
 -               */
 -              if (unlikely(page != *pagep)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +      /*
 +       * Has the page moved?
 +       * This is part of the lockless pagecache protocol. See
 +       * include/linux/pagemap.h for details.
 +       */
 +      if (unlikely(page != xas_reload(&xas))) {
 +              put_page(head);
 +              goto repeat;
        }
  out:
        rcu_read_unlock();
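
For callers, the visible change in find_get_entry() is that exceptional
entries are now tested with xa_is_value() rather than
radix_tree_exceptional_entry().  A hedged caller-side sketch (the branch
bodies are illustrative):

	struct page *page = find_get_entry(mapping, index);

	if (!page) {
		/* nothing cached at @index */
	} else if (xa_is_value(page)) {
		/* shadow, swap or DAX value entry: no reference was
		 * taken, so there is nothing to put */
	} else {
		/* a real page, returned with an elevated refcount */
		put_page(page);
	}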
@@@ -1470,7 -1493,7 +1470,7 @@@ struct page *find_lock_entry(struct add
  
  repeat:
        page = find_get_entry(mapping, offset);
 -      if (page && !radix_tree_exception(page)) {
 +      if (page && !xa_is_value(page)) {
                lock_page(page);
                /* Has the page been truncated? */
                if (unlikely(page_mapping(page) != mapping)) {
@@@ -1516,7 -1539,7 +1516,7 @@@ struct page *pagecache_get_page(struct 
  
  repeat:
        page = find_get_entry(mapping, offset);
 -      if (radix_tree_exceptional_entry(page))
 +      if (xa_is_value(page))
                page = NULL;
        if (!page)
                goto no_page;
@@@ -1602,48 -1625,53 +1602,48 @@@ unsigned find_get_entries(struct addres
                          pgoff_t start, unsigned int nr_entries,
                          struct page **entries, pgoff_t *indices)
  {
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, start);
 +      struct page *page;
        unsigned int ret = 0;
 -      struct radix_tree_iter iter;
  
        if (!nr_entries)
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
 -              struct page *head, *page;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each(&xas, page, ULONG_MAX) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page, a swap
 -                       * entry from shmem/tmpfs or a DAX entry.  Return it
 -                       * without attempting to raise page count.
 -                       */
 +              /*
 +               * A shadow entry of a recently evicted page, a swap
 +               * entry from shmem/tmpfs or a DAX entry.  Return it
 +               * without attempting to raise page count.
 +               */
 +              if (xa_is_value(page))
                        goto export;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
 +
  export:
 -              indices[ret] = iter.index;
 +              indices[ret] = xas.xa_index;
                entries[ret] = page;
                if (++ret == nr_entries)
                        break;
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
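
A hedged sketch of a batch caller, in the style of the truncate paths that
consume find_get_entries(); PAGEVEC_SIZE is the usual batch size and the
loop body is illustrative:

	struct page *pages[PAGEVEC_SIZE];
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int i, nr;

	nr = find_get_entries(mapping, start, PAGEVEC_SIZE, pages, indices);
	for (i = 0; i < nr; i++) {
		if (xa_is_value(pages[i]))
			continue;	/* value entry: no reference held */
		/* ... operate on pages[i], cached at index indices[i] ... */
		put_page(pages[i]);
	}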
@@@ -1674,50 -1702,64 +1674,50 @@@ unsigned find_get_pages_range(struct ad
                              pgoff_t end, unsigned int nr_pages,
                              struct page **pages)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, *start);
 +      struct page *page;
        unsigned ret = 0;
  
        if (unlikely(!nr_pages))
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
 -              struct page *head, *page;
 -
 -              if (iter.index > end)
 -                      break;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each(&xas, page, end) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page,
 -                       * or a swap entry from shmem/tmpfs.  Skip
 -                       * over it.
 -                       */
 +              /* Skip over shadow, swap and DAX entries */
 +              if (xa_is_value(page))
                        continue;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
  
                pages[ret] = page;
                if (++ret == nr_pages) {
 -                      *start = pages[ret - 1]->index + 1;
 +                      *start = page->index + 1;
                        goto out;
                }
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
  
        /*
         * We come here when there is no page beyond @end. We take care to not
         * overflow the index @start as it confuses some of the callers. This
 -       * breaks the iteration when there is page at index -1 but that is
 +       * breaks the iteration when there is a page at index -1 but that is
         * already broken anyway.
         */
        if (end == (pgoff_t)-1)
  unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                               unsigned int nr_pages, struct page **pages)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, index);
 +      struct page *page;
        unsigned int ret = 0;
  
        if (unlikely(!nr_pages))
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
 -              struct page *head, *page;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              /* The hole, there no reason to continue */
 -              if (unlikely(!page))
 -                      break;
 -
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page,
 -                       * or a swap entry from shmem/tmpfs.  Stop
 -                       * looking for contiguous pages.
 -                       */
 +      for (page = xas_load(&xas); page; page = xas_next(&xas)) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
 +                      continue;
 +              /*
 +               * If the entry has been swapped out, we can stop looking.
 +               * No current caller is looking for DAX entries.
 +               */
 +              if (xa_is_value(page))
                        break;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
  
                /*
                 * must check mapping and index after taking the ref.
                 * otherwise we can get both false positives and false
                 * negatives, which is just confusing to the caller.
                 */
 -              if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
 +              if (!page->mapping || page_to_pgoff(page) != xas.xa_index) {
                        put_page(page);
                        break;
                }
                pages[ret] = page;
                if (++ret == nr_pages)
                        break;
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
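
Contiguity is the contract here: the first hole, or now any value entry,
ends the batch.  A hedged usage sketch (batch size illustrative):

	struct page *pages[8];
	unsigned int nr = find_get_pages_contig(mapping, index, 8, pages);

	/* pages[0..nr-1] sit at index, index + 1, ..., index + nr - 1;
	 * a gap at index + k yields nr == k. */
	while (nr--)
		put_page(pages[nr]);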
@@@ -1813,58 -1864,74 +1813,58 @@@ EXPORT_SYMBOL(find_get_pages_contig)
   * @tag.   We update @index to index the next page for the traversal.
   */
  unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
 -                      pgoff_t end, int tag, unsigned int nr_pages,
 +                      pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
                        struct page **pages)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, *index);
 +      struct page *page;
        unsigned ret = 0;
  
        if (unlikely(!nr_pages))
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
 -              struct page *head, *page;
 -
 -              if (iter.index > end)
 -                      break;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each_marked(&xas, page, end, tag) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -                      /*
 -                       * A shadow entry of a recently evicted page.
 -                       *
 -                       * Those entries should never be tagged, but
 -                       * this tree walk is lockless and the tags are
 -                       * looked up in bulk, one radix tree node at a
 -                       * time, so there is a sizable window for page
 -                       * reclaim to evict a page we saw tagged.
 -                       *
 -                       * Skip over it.
 -                       */
 +              /*
 +               * Shadow entries should never be tagged, but this iteration
 +               * is lockless so there is a window for page reclaim to evict
 +               * a page we saw tagged.  Skip over it.
 +               */
 +              if (xa_is_value(page))
                        continue;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
  
                pages[ret] = page;
                if (++ret == nr_pages) {
 -                      *index = pages[ret - 1]->index + 1;
 +                      *index = page->index + 1;
                        goto out;
                }
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
  
        /*
 -       * We come here when we got at @end. We take care to not overflow the
 +       * We come here when we got to @end. We take care to not overflow the
         * index @index as it confuses some of the callers. This breaks the
 -       * iteration when there is page at index -1 but that is already broken
 -       * anyway.
 +       * iteration when there is a page at index -1 but that is already
 +       * broken anyway.
         */
        if (end == (pgoff_t)-1)
                *index = (pgoff_t)-1;
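
The tag argument is now the typed xa_mark_t.  A hedged writeback-style
caller, assuming PAGECACHE_TAG_DIRTY and an illustrative batch size; note
that the function advances *index for the next call:

	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int i, nr;

	while ((nr = find_get_pages_range_tag(mapping, &index, (pgoff_t)-1,
				PAGECACHE_TAG_DIRTY, 16, pages)) != 0) {
		for (i = 0; i < nr; i++) {
			/* ... write back pages[i] ... */
			put_page(pages[i]);
		}
	}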
@@@ -1890,51 -1957,57 +1890,51 @@@ EXPORT_SYMBOL(find_get_pages_range_tag)
   * @tag.
   */
  unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
 -                      int tag, unsigned int nr_entries,
 +                      xa_mark_t tag, unsigned int nr_entries,
                        struct page **entries, pgoff_t *indices)
  {
 -      void **slot;
 +      XA_STATE(xas, &mapping->i_pages, start);
 +      struct page *page;
        unsigned int ret = 0;
 -      struct radix_tree_iter iter;
  
        if (!nr_entries)
                return 0;
  
        rcu_read_lock();
 -      radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
 -              struct page *head, *page;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 +      xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
 +              struct page *head;
 +              if (xas_retry(&xas, page))
                        continue;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 -
 -                      /*
 -                       * A shadow entry of a recently evicted page, a swap
 -                       * entry from shmem/tmpfs or a DAX entry.  Return it
 -                       * without attempting to raise page count.
 -                       */
 +              /*
 +               * A shadow entry of a recently evicted page, a swap
 +               * entry from shmem/tmpfs or a DAX entry.  Return it
 +               * without attempting to raise page count.
 +               */
 +              if (xa_is_value(page))
                        goto export;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto retry;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto put_page;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto put_page;
 +
  export:
 -              indices[ret] = iter.index;
 +              indices[ret] = xas.xa_index;
                entries[ret] = page;
                if (++ret == nr_entries)
                        break;
 +              continue;
 +put_page:
 +              put_page(head);
 +retry:
 +              xas_reset(&xas);
        }
        rcu_read_unlock();
        return ret;
@@@ -2049,7 -2122,7 +2049,7 @@@ find_page
                                        !mapping->a_ops->is_partially_uptodate)
                                goto page_not_up_to_date;
                        /* pipes can't handle partially uptodate pages */
 -                      if (unlikely(iter->type & ITER_PIPE))
 +                      if (unlikely(iov_iter_is_pipe(iter)))
                                goto page_not_up_to_date;
                        if (!trylock_page(page))
                                goto page_not_up_to_date;
@@@ -2508,7 -2581,9 +2508,7 @@@ no_cached_page
         * system is low on memory, or a problem occurs while trying
         * to schedule I/O.
         */
 -      if (error == -ENOMEM)
 -              return VM_FAULT_OOM;
 -      return VM_FAULT_SIGBUS;
 +      return vmf_error(error);
  
  page_not_uptodate:
        /*
@@@ -2538,31 -2613,45 +2538,31 @@@ EXPORT_SYMBOL(filemap_fault)
  void filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff)
  {
 -      struct radix_tree_iter iter;
 -      void **slot;
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
        unsigned long max_idx;
 +      XA_STATE(xas, &mapping->i_pages, start_pgoff);
        struct page *head, *page;
  
        rcu_read_lock();
 -      radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
 -              if (iter.index > end_pgoff)
 -                      break;
 -repeat:
 -              page = radix_tree_deref_slot(slot);
 -              if (unlikely(!page))
 -                      goto next;
 -              if (radix_tree_exception(page)) {
 -                      if (radix_tree_deref_retry(page)) {
 -                              slot = radix_tree_iter_retry(&iter);
 -                              continue;
 -                      }
 +      xas_for_each(&xas, page, end_pgoff) {
 +              if (xas_retry(&xas, page))
 +                      continue;
 +              if (xa_is_value(page))
                        goto next;
 -              }
  
                head = compound_head(page);
                if (!page_cache_get_speculative(head))
 -                      goto repeat;
 +                      goto next;
  
                /* The page was split under us? */
 -              if (compound_head(page) != head) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (compound_head(page) != head)
 +                      goto skip;
  
                /* Has the page moved? */
 -              if (unlikely(page != *slot)) {
 -                      put_page(head);
 -                      goto repeat;
 -              }
 +              if (unlikely(page != xas_reload(&xas)))
 +                      goto skip;
  
                if (!PageUptodate(page) ||
                                PageReadahead(page) ||
                if (file->f_ra.mmap_miss > 0)
                        file->f_ra.mmap_miss--;
  
 -              vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
 +              vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                if (vmf->pte)
 -                      vmf->pte += iter.index - last_pgoff;
 -              last_pgoff = iter.index;
 +                      vmf->pte += xas.xa_index - last_pgoff;
 +              last_pgoff = xas.xa_index;
                if (alloc_set_pte(vmf, NULL, page))
                        goto unlock;
                unlock_page(page);
@@@ -2597,6 -2686,8 +2597,6 @@@ next
                /* Huge page is mapped? No need to proceed. */
                if (pmd_trans_huge(*vmf->pmd))
                        break;
 -              if (iter.index == end_pgoff)
 -                      break;
        }
        rcu_read_unlock();
  }
@@@ -2657,9 -2748,9 +2657,9 @@@ int generic_file_readonly_mmap(struct f
        return generic_file_mmap(file, vma);
  }
  #else
 -int filemap_page_mkwrite(struct vm_fault *vmf)
 +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
  {
 -      return -ENOSYS;
 +      return VM_FAULT_SIGBUS;
  }
  int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
  {
@@@ -2706,7 -2797,7 +2706,7 @@@ repeat
                        put_page(page);
                        if (err == -EEXIST)
                                goto repeat;
 -                      /* Presumably ENOMEM for radix tree node */
 +                      /* Presumably ENOMEM for xarray node */
                        return ERR_PTR(err);
                }
  
@@@ -2825,6 -2916,42 +2825,42 @@@ struct page *read_cache_page_gfp(struc
  EXPORT_SYMBOL(read_cache_page_gfp);
  
  /*
+  * Don't operate on ranges the page cache doesn't support, and don't exceed the
+  * LFS limits.  If pos is under the limit it becomes a short access.  If it
+  * exceeds the limit we return -EFBIG.
+  */
+ static int generic_access_check_limits(struct file *file, loff_t pos,
+                                      loff_t *count)
+ {
+       struct inode *inode = file->f_mapping->host;
+       loff_t max_size = inode->i_sb->s_maxbytes;
+
+       if (!(file->f_flags & O_LARGEFILE))
+               max_size = MAX_NON_LFS;
+       if (unlikely(pos >= max_size))
+               return -EFBIG;
+       *count = min(*count, max_size - pos);
+       return 0;
+ }
+
+ static int generic_write_check_limits(struct file *file, loff_t pos,
+                                     loff_t *count)
+ {
+       loff_t limit = rlimit(RLIMIT_FSIZE);
+
+       if (limit != RLIM_INFINITY) {
+               if (pos >= limit) {
+                       send_sig(SIGXFSZ, current, 0);
+                       return -EFBIG;
+               }
+               *count = min(*count, limit - pos);
+       }
+
+       return generic_access_check_limits(file, pos, count);
+ }
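
A hedged worked example of the clamping semantics (the numbers and the open
file are illustrative): with RLIMIT_FSIZE at 1 MiB, a 16 KiB write starting
4 KiB short of the limit is shortened rather than rejected:

	loff_t count = 16384;
	int err = generic_write_check_limits(file, (1 << 20) - 4096, &count);

	/* err == 0 and count == 4096: a short write.  At pos >= 1 MiB the
	 * caller would instead get SIGXFSZ and -EFBIG. */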
+ /*
   * Performs necessary checks before doing a write
   *
   * Can adjust writing position or amount of bytes to write.
@@@ -2835,8 -2962,8 +2871,8 @@@ inline ssize_t generic_write_checks(str
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-       unsigned long limit = rlimit(RLIMIT_FSIZE);
-       loff_t pos;
+       loff_t count;
+       int ret;
  
        if (!iov_iter_count(from))
                return 0;
        if (iocb->ki_flags & IOCB_APPEND)
                iocb->ki_pos = i_size_read(inode);
  
-       pos = iocb->ki_pos;
        if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
                return -EINVAL;
  
-       if (limit != RLIM_INFINITY) {
-               if (iocb->ki_pos >= limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
-               iov_iter_truncate(from, limit - (unsigned long)pos);
-       }
+       count = iov_iter_count(from);
+       ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+       if (ret)
+               return ret;
+
+       iov_iter_truncate(from, count);
+       return iov_iter_count(from);
+ }
+ EXPORT_SYMBOL(generic_write_checks);
+
+ /*
+  * Performs necessary checks before doing a clone.
+  *
+  * Can adjust amount of bytes to clone.
+  * Returns appropriate error code that caller should return or
+  * zero in case the clone should be allowed.
+  */
+ int generic_remap_checks(struct file *file_in, loff_t pos_in,
+                        struct file *file_out, loff_t pos_out,
+                        loff_t *req_count, unsigned int remap_flags)
+ {
+       struct inode *inode_in = file_in->f_mapping->host;
+       struct inode *inode_out = file_out->f_mapping->host;
+       uint64_t count = *req_count;
+       uint64_t bcount;
+       loff_t size_in, size_out;
+       loff_t bs = inode_out->i_sb->s_blocksize;
+       int ret;
+
+       /* The start of both ranges must be aligned to an fs block. */
+       if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+               return -EINVAL;
+
+       /* Ensure offsets don't wrap. */
+       if (pos_in + count < pos_in || pos_out + count < pos_out)
+               return -EINVAL;
+
+       size_in = i_size_read(inode_in);
+       size_out = i_size_read(inode_out);
+
+       /* Dedupe requires both ranges to be within EOF. */
+       if ((remap_flags & REMAP_FILE_DEDUP) &&
+           (pos_in >= size_in || pos_in + count > size_in ||
+            pos_out >= size_out || pos_out + count > size_out))
+               return -EINVAL;
+
+       /* Ensure the infile range is within the infile. */
+       if (pos_in >= size_in)
+               return -EINVAL;
+       count = min(count, size_in - (uint64_t)pos_in);
+
+       ret = generic_access_check_limits(file_in, pos_in, &count);
+       if (ret)
+               return ret;
+
+       ret = generic_write_check_limits(file_out, pos_out, &count);
+       if (ret)
+               return ret;
  
        /*
-        * LFS rule
+        * If the user wanted us to link to the infile's EOF, round up to the
+        * next block boundary for this check.
+        *
+        * Otherwise, make sure the count is also block-aligned, having
+        * already confirmed the starting offsets' block alignment.
         */
-       if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
-                               !(file->f_flags & O_LARGEFILE))) {
-               if (pos >= MAX_NON_LFS)
-                       return -EFBIG;
-               iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
+       if (pos_in + count == size_in) {
+               bcount = ALIGN(size_in, bs) - pos_in;
+       } else {
+               if (!IS_ALIGNED(count, bs))
+                       count = ALIGN_DOWN(count, bs);
+               bcount = count;
        }
  
+       /* Don't allow overlapped cloning within the same file. */
+       if (inode_in == inode_out &&
+           pos_out + bcount > pos_in &&
+           pos_out < pos_in + bcount)
+               return -EINVAL;
+
        /*
-        * Are we about to exceed the fs block limit ?
-        *
-        * If we have written data it becomes a short write.  If we have
-        * exceeded without writing data we send a signal and return EFBIG.
-        * Linus frestrict idea will clean these up nicely..
+        * We shortened the request but the caller can't deal with that, so
+        * bounce the request back to userspace.
         */
-       if (unlikely(pos >= inode->i_sb->s_maxbytes))
-               return -EFBIG;
+       if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+               return -EINVAL;
  
-       iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
-       return iov_iter_count(from);
+       *req_count = count;
+       return 0;
  }
- EXPORT_SYMBOL(generic_write_checks);
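
A hedged worked example of the EOF rounding above (block size and offsets
illustrative): cloning the tail of a 10000-byte file with 4096-byte blocks:

	loff_t bs = 4096, size_in = 10000, pos_in = 8192;
	uint64_t count = size_in - pos_in;		/* 1808, runs to EOF */
	uint64_t bcount = ALIGN(size_in, bs) - pos_in;	/* 12288 - 8192 = 4096 */

	/* The overlap check covers the whole partial EOF block (bcount),
	 * while the returned *req_count stays the unaligned 1808. */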
  
  int pagecache_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
@@@ -2921,7 -3104,7 +3013,7 @@@ generic_file_direct_write(struct kiocb 
        if (iocb->ki_flags & IOCB_NOWAIT) {
                /* If there are pages to writeback, return */
                if (filemap_range_has_page(inode->i_mapping, pos,
 -                                         pos + iov_iter_count(from)))
 +                                         pos + write_len))
                        return -EAGAIN;
        } else {
                written = filemap_write_and_wait_range(mapping, pos,