// SPDX-License-Identifier: GPL-2.0
+#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
+ kfree(cache->physical_map);
kfree(cache);
}
}
spin_unlock(&cluster->refill_lock);
btrfs_clear_treelog_bg(block_group);
+ btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path();
if (!path) {
spin_unlock(&fs_info->unused_bgs_lock);
}
+/*
+ * We want block groups with a low number of used bytes to be at the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+ const struct list_head *b)
+{
+ const struct btrfs_block_group *bg1, *bg2;
+
+ bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+ bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+ return bg1->used > bg2->used;
+}
+
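
As context for the comparator above: list_sort() only checks whether the
comparator returns a value greater than zero (meaning the first element must
sort after the second), so the boolean comparison is enough to produce an
ascending order by ->used. A minimal sketch of the same contract, using
illustrative struct and field names that are not part of btrfs:

	#include <linux/list.h>
	#include <linux/list_sort.h>

	struct demo_item {
		struct list_head list;
		u64 used;
	};

	/* Same contract as reclaim_bgs_cmp(): return > 0 iff @a sorts after @b. */
	static int demo_cmp(void *priv, const struct list_head *a,
			    const struct list_head *b)
	{
		const struct demo_item *ia = list_entry(a, struct demo_item, list);
		const struct demo_item *ib = list_entry(b, struct demo_item, list);

		return ia->used > ib->used;
	}

	/* Sort @head so that the least used items come first. */
	static void demo_sort_by_used(struct list_head *head)
	{
		list_sort(NULL, head, demo_cmp);
	}
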
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
}
spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * Sort happens under lock because we can't simply splice it and sort.
+ * The block groups might still be in use and reachable via bg_list,
+ * and their presence in the reclaim_bgs list must be preserved.
+ */
+ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
u64 zone_unusable;
int ret = 0;
INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
+ INIT_LIST_HEAD(&cache->active_bg_list);
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
*/
if (btrfs_is_zoned(info)) {
btrfs_calc_zone_unusable(cache);
+ /* Should not have any excluded extents. Just in case, though. */
+ btrfs_free_excluded_extents(cache);
} else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
link_block_group(cache);
set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->start)) {
+ if (btrfs_chunk_writeable(info, cache->start)) {
+ if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
+ }
+ } else {
inc_block_group_ro(cache, 1);
- } else if (cache->used == 0) {
- ASSERT(list_empty(&cache->bg_list));
- if (btrfs_test_opt(info, DISCARD_ASYNC))
- btrfs_discard_queue_work(&info->discard_ctl, cache);
- else
- btrfs_mark_bg_unused(cache);
}
+
return 0;
error:
btrfs_put_block_group(cache);
return ERR_PTR(ret);
}
+ /*
+ * New block group is likely to be used soon. Try to activate it now.
+ * Failure is OK for now.
+ */
+ btrfs_zone_activate(cache);
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, 0, &cache->space_info);
+ cache->bytes_super, cache->zone_unusable,
+ &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
if (!--cache->ro) {
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes back */
- cache->zone_unusable = cache->alloc_offset - cache->used;
+ cache->zone_unusable =
+ (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
sinfo->bytes_zone_unusable += cache->zone_unusable;
sinfo->bytes_readonly -= cache->zone_unusable;
}
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc)
+ u64 bytenr, u64 num_bytes, bool alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_block_group *cache = NULL;
*/
check_system_chunk(trans, flags);
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
}
- /*
- * If this is a system chunk allocation then stop right here and do not
- * add the chunk item to the chunk btree. This is to prevent a deadlock
- * because this system chunk allocation can be triggered while COWing
- * some extent buffer of the chunk btree and while holding a lock on a
- * parent extent buffer, in which case attempting to insert the chunk
- * item (or update the device item) would result in a deadlock on that
- * parent extent buffer. In this case defer the chunk btree updates to
- * the second phase of chunk allocation and keep our reservation until
- * the second phase completes.
- *
- * This is a rare case and can only be triggered by the very few cases
- * we have where we need to touch the chunk btree outside chunk allocation
- * and chunk removal. These cases are basically adding a device, removing
- * a device or resizing a device.
- */
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- return 0;
-
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
/*
* Normally we are not expected to fail with -ENOSPC here, since we have
* previously reserved space in the system space_info and allocated one
- * new system chunk if necessary. However there are two exceptions:
+ * new system chunk if necessary. However there are three exceptions:
*
* 1) We may have enough free space in the system space_info but all the
* existing system block groups have a profile which can not be used
* with enough free space got turned into RO mode by a running scrub,
* and in this case we have to allocate a new one and retry. We only
* need do this allocate and retry once, since we have a transaction
- * handle and scrub uses the commit root to search for block groups.
+ * handle and scrub uses the commit root to search for block groups;
+ *
+ * 3) We had one system block group with enough free space when we called
+ * check_system_chunk(), but after that, right before we tried to
+ * allocate the last extent buffer we needed, a discard operation came
+ * in and it temporarily removed the last free space entry from the
+ * block group (discard removes a free space entry, discards it, and
+ * then adds back the entry to the block group cache).
*/
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
* properly, either intentionally or as a bug. One example where this is
* done intentionally is fsync, as it does not reserve any transaction units
* and ends up allocating a variable number of metadata extents for log
- * tree extent buffers.
+ * tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ * before it tries to allocate the last extent buffer it needs, a discard
+ * operation comes in and, temporarily, removes the last free space entry from
+ * the only metadata block group that had free space (discard starts by
+ * removing a free space entry from a block group, then does the discard
+ * operation and, once it's done, it adds back the free space entry to the
+ * block group).
*
* We also need this 2 phases setup when adding a device to a filesystem with
* a seed device - we must create new metadata and system chunks without adding
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
* the system chunk array due to concurrent allocations") provides more details.
*
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the latter is used before doing
+ * a modification to the chunk btree - use cases for the latter are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
*
* The reservation of system space, done through check_system_chunk(), as well
* as all the updates and insertions into the chunk btree must be done while
if (trans->allocating_chunk)
return -ENOSPC;
/*
- * If we are removing a chunk, don't re-enter or we would deadlock.
- * System space reservation and system chunk allocation is done by the
- * chunk remove operation (btrfs_remove_chunk()).
+ * Allocation of system chunks can not happen through this path, as we
+ * could end up in a deadlock if we are allocating a data or metadata
+ * chunk and there is another task modifying the chunk btree.
+ *
+ * This is because while we are holding the chunk mutex, we will attempt
+ * to add the new chunk item to the chunk btree or update an existing
+ * device item in the chunk btree, while the other task that is modifying
+ * the chunk btree is attempting to COW an extent buffer while holding a
+ * lock on it and on its parent - if the COW operation triggers a system
+ * chunk allocation, then we can deadlock because we are holding the
+ * chunk mutex and we may need to access that extent buffer or its parent
+ * in order to add the chunk item or update a device item.
+ *
+ * Tasks that want to modify the chunk tree should reserve system space
+ * before updating the chunk btree, by calling either
+ * btrfs_reserve_chunk_metadata() or check_system_chunk().
+ * It's possible that after a task reserves the space, it still ends up
+ * here - this happens in the cases described above at do_chunk_alloc().
+ * The task will have to either retry or fail.
*/
- if (trans->removing_chunk)
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
space_info = btrfs_find_space_info(fs_info, flags);
return num_dev;
}
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ u64 bytes,
+ u64 type)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
- u64 thresh;
int ret = 0;
- u64 num_devs;
/*
* Needed because we can end up allocating a system chunk and for an
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
- num_devs = get_profile_num_devs(fs_info, type);
-
- /* num_devs device items to update and 1 chunk item to add or remove */
- thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
- btrfs_calc_insert_metadata_size(fs_info, 1);
-
- if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+ if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
- left, thresh, type);
+ left, bytes, type);
btrfs_dump_space_info(fs_info, info, 0, 0);
}
- if (left < thresh) {
+ if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
* needing it, as we might not need to COW all nodes/leafs from
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
- *
- * Also, if our caller is allocating a system chunk, do not
- * attempt to insert the chunk item in the chunk btree, as we
- * could deadlock on an extent buffer since our caller may be
- * COWing an extent buffer from the chunk btree.
*/
- bg = btrfs_alloc_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
- } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+ } else {
/*
* If we fail to add the chunk item here, we end up
* trying again at phase 2 of chunk allocation, at
* btrfs_create_pending_block_groups(). So ignore
- * any error here.
+ * any error here. An ENOSPC here could happen, due to
+ * the cases described at do_chunk_alloc() - the system
+ * block group we just created was just turned into RO
+ * mode by a scrub for example, or a running discard
+ * temporarily removed its free space entries, etc.
*/
btrfs_chunk_alloc_add_chunk_item(trans, bg);
}
if (!ret) {
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
- thresh, BTRFS_RESERVE_NO_FLUSH);
+ bytes, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
- trans->chunk_bytes_reserved += thresh;
+ trans->chunk_bytes_reserved += bytes;
}
}
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ const u64 num_devs = get_profile_num_devs(fs_info, type);
+ u64 bytes;
+
+ /* num_devs device items to update and 1 chunk item to add or remove. */
+ bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+ btrfs_calc_insert_metadata_size(fs_info, 1);
+
+ reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans: A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ * in the chunk btree or if it's for the deletion or update
+ * of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u64 bytes;
+
+ if (is_item_insertion)
+ bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+ else
+ bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+ mutex_lock(&fs_info->chunk_mutex);
+ reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+ mutex_unlock(&fs_info->chunk_mutex);
+}
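
To illustrate the calling convention documented above, here is a minimal
sketch of how a chunk btree modification outside of chunk allocation and
removal is expected to be wrapped. The helper in the middle is a hypothetical
placeholder; only the pairing of btrfs_reserve_chunk_metadata() with
btrfs_trans_release_chunk_metadata() is taken from the comment above:

	/* Sketch only: do_chunk_btree_update() stands in for the actual change. */
	static int example_modify_chunk_btree(struct btrfs_trans_handle *trans,
					      bool inserting_item)
	{
		int ret;

		/* Reserve system space before touching the chunk btree. */
		btrfs_reserve_chunk_metadata(trans, inserting_item);

		ret = do_chunk_btree_update(trans);

		/* Release the reservation once the chunk btree update is done. */
		btrfs_trans_release_chunk_metadata(trans);

		return ret;
	}
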
+
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
struct btrfs_block_group *block_group;
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->zone_active_bgs_lock);
+ while (!list_empty(&info->zone_active_bgs)) {
+ block_group = list_first_entry(&info->zone_active_bgs,
+ struct btrfs_block_group,
+ active_bg_list);
+ list_del_init(&block_group->active_bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->zone_active_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
+ unsigned int zone_is_active:1;
int disk_cache_state;
*/
u64 alloc_offset;
u64 zone_unusable;
+ u64 zone_capacity;
u64 meta_write_pointer;
+ struct map_lookup *physical_map;
+ struct list_head active_bg_list;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, int alloc);
+ u64 bytenr, u64 num_bytes, bool alloc);
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+ bool is_item_insertion);
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
/* a local copy of root's last_log_commit */
int last_log_commit;
- /* total number of bytes pending delalloc, used by stat to calc the
- * real block usage of the file
- */
- u64 delalloc_bytes;
-
- /*
- * Total number of bytes pending delalloc that fall within a file
- * range that is either a hole or beyond EOF (and no prealloc extent
- * exists in the range). This is always <= delalloc_bytes.
- */
- u64 new_delalloc_bytes;
+ union {
+ /*
+ * Total number of bytes pending delalloc, used by stat to
+ * calculate the real block usage of the file. This is used
+ * only for files.
+ */
+ u64 delalloc_bytes;
+ /*
+ * The offset of the last dir item key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_item_offset;
+ };
+
+ union {
+ /*
+ * Total number of bytes pending delalloc that fall within a file
+ * range that is either a hole or beyond EOF (and no prealloc extent
+ * exists in the range). This is always <= delalloc_bytes and this
+ * is used only for files.
+ */
+ u64 new_delalloc_bytes;
+ /*
+ * The offset of the last dir index key that was logged.
+ * This is used only for directories.
+ */
+ u64 last_dir_index_offset;
+ };
/*
* total number of bytes pending defrag, used by stat to check whether
struct btrfs_dio_private {
struct inode *inode;
- u64 logical_offset;
+
+ /*
+ * Since DIO can use anonymous pages, we cannot use page_offset() to
+ * grab the file offset, thus we need a dedicated member for the file
+ * offset.
+ */
+ u64 file_offset;
u64 disk_bytenr;
/* Used for bio::bi_size */
u32 bytes;
struct list_head collision_resolving_node; /* list node */
struct btrfsic_block dummy_block_for_bio_bh_flush;
u64 last_flush_gen;
- char name[BDEVNAME_SIZE];
};
struct btrfsic_block_hashtable {
ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
ds->bdev = NULL;
ds->state = NULL;
- ds->name[0] = '\0';
INIT_LIST_HEAD(&ds->collision_resolving_node);
ds->last_flush_gen = 0;
btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
btrfs_info_in_rcu(fs_info,
- "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
+ "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
- dev_state->name, dev_bytenr,
+ dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,
&state->all_blocks_list);
if (disk_item_offset + sizeof(struct btrfs_item) >
sf->block_ctx->len) {
leaf_item_out_of_bounce_error:
- pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(sf->block_ctx,
(uintptr_t)nodehdr;
if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
sf->block_ctx->len) {
- pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n",
+ pr_info(
+ "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
sf->block_ctx->start,
- sf->block_ctx->dev->name);
+ sf->block_ctx->dev->bdev);
goto one_stack_frame_backwards;
}
btrfsic_read_from_block_data(
if (next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr))
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block),
next_block->logical_bytenr);
else
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
+ pr_info(
+ "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ next_bytenr, next_block_ctx->dev->bdev,
next_block_ctx->dev_bytenr, *mirror_nump,
btrfsic_get_block_type(state,
next_block));
if (file_extent_item_offset +
offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
block_ctx->len) {
- pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+ block_ctx->start, block_ctx->dev->bdev);
return -1;
}
btrfsic_read_from_block_data(block_ctx, &file_extent_item,
next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
- pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n",
+ pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
next_bytenr,
- next_block_ctx.dev->name,
+ next_block_ctx.dev->bdev,
next_block_ctx.dev_bytenr,
mirror_num,
next_block->logical_bytenr);
struct btrfs_fs_info *fs_info = state->fs_info;
int ret;
u64 length;
- struct btrfs_bio *multi = NULL;
+ struct btrfs_io_context *multi = NULL;
struct btrfs_device *device;
length = len;
struct bio *bio;
unsigned int j;
- bio = btrfs_io_bio_alloc(num_pages - i);
+ bio = btrfs_bio_alloc(num_pages - i);
bio_set_dev(bio, block_ctx->dev->bdev);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio->bi_opf = REQ_OP_READ;
return -1;
}
if (submit_bio_wait(bio)) {
- pr_info("btrfsic: read error at logical %llu dev %s!\n",
- block_ctx->start, block_ctx->dev->name);
+ pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+ block_ctx->start, block_ctx->dev->bdev);
bio_put(bio);
return -1;
}
list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
const struct btrfsic_block_link *l;
- pr_info("%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("%c-block @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
- pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
- pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
}
if (block->logical_bytenr != bytenr &&
!(!block->is_metadata &&
block->logical_bytenr == 0))
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
- bytenr, dev_state->name,
+ pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+ bytenr, dev_state->bdev,
dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state,
block),
block->logical_bytenr);
else
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev,
dev_bytenr, block->mirror_num,
btrfsic_get_block_type(state,
block));
processed_len = state->datablock_size;
bytenr = block->logical_bytenr;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
+ pr_info(
+ "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+ bytenr, dev_state->bdev, dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state, block));
}
list_empty(&block->ref_to_list) ? ' ' : '!',
list_empty(&block->ref_from_list) ? ' ' : '!');
if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_disk_key_objectid(&block->disk_key),
block->disk_key.type,
}
if (!block->is_iodone && !block->never_written) {
- pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
btrfsic_get_block_type(state, block), bytenr,
- dev_state->name, dev_bytenr, block->mirror_num,
+ dev_state->bdev, dev_bytenr, block->mirror_num,
block->generation,
btrfs_stack_header_generation(
(struct btrfs_header *)
if (!is_metadata) {
processed_len = state->datablock_size;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block (%s/%llu/?) !found in hash table, D.\n",
- dev_state->name, dev_bytenr);
+ pr_info(
+ "written block (%pg/%llu/?) !found in hash table, D\n",
+ dev_state->bdev, dev_bytenr);
if (!state->include_extent_data) {
/* ignore that written D block */
goto continue_loop;
btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
dev_bytenr);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+ "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+ bytenr, dev_state->bdev, dev_bytenr);
}
block_ctx.dev = dev_state;
block->next_in_same_bio = NULL;
}
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New written %c-block @%llu (%s/%llu/%d)\n",
+ pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
is_metadata ? 'M' : 'D',
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
bp->bi_status,
btrfsic_get_block_type(dev_state->state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, block->mirror_num);
next_block = block->next_in_same_bio;
block->iodone_w_error = iodone_w_error;
dev_state->last_flush_gen++;
if ((dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
- pr_info("bio_end_io() new %s flush_gen=%llu\n",
- dev_state->name,
+ pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+ dev_state->bdev,
dev_state->last_flush_gen);
}
if (block->submit_bio_bh_rw & REQ_FUA)
if (!(superblock->generation > state->max_superblock_generation ||
0 == state->max_superblock_generation)) {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n",
+ pr_info(
+ "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
} else {
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n",
+ pr_info(
+ "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
superblock->logical_bytenr,
- superblock->dev_state->name,
+ superblock->dev_state->bdev,
superblock->dev_bytenr, superblock->mirror_num,
btrfs_super_generation(super_hdr),
state->max_superblock_generation);
*/
list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
if (l->block_ref_to->never_written) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (!l->block_ref_to->is_iodone) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
} else if (l->block_ref_to->iodone_w_error) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
ret = -1;
l->parent_generation &&
BTRFSIC_GENERATION_UNKNOWN !=
l->block_ref_to->generation) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num,
l->block_ref_to->generation,
ret = -1;
} else if (l->block_ref_to->flush_gen >
l->block_ref_to->dev_state->last_flush_gen) {
- pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+ pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name,
+ l->block_ref_to->dev_state->bdev,
l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num, block->flush_gen,
l->block_ref_to->dev_state->last_flush_gen);
*/
list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ pr_info(
+ "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
recursion_level,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num,
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr,
l->block_ref_from->mirror_num);
if (l->block_ref_from->is_superblock &&
static void btrfsic_print_add_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
static void btrfsic_print_rem_link(const struct btrfsic_state *state,
const struct btrfsic_block_link *l)
{
- pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+ pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
l->ref_cnt,
btrfsic_get_block_type(state, l->block_ref_from),
l->block_ref_from->logical_bytenr,
- l->block_ref_from->dev_state->name,
+ l->block_ref_from->dev_state->bdev,
l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
btrfsic_get_block_type(state, l->block_ref_to),
l->block_ref_to->logical_bytenr,
- l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+ l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
l->block_ref_to->mirror_num);
}
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
+ indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
btrfsic_get_block_type(state, block),
- block->logical_bytenr, block->dev_state->name,
+ block->logical_bytenr, block->dev_state->bdev,
block->dev_bytenr, block->mirror_num);
if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
printk("[...]\n");
block->never_written = never_written;
block->mirror_num = mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- pr_info("New %s%c-block @%llu (%s/%llu/%d)\n",
+ pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
additional_string,
btrfsic_get_block_type(state, block),
- block->logical_bytenr, dev_state->name,
+ block->logical_bytenr, dev_state->bdev,
block->dev_bytenr, mirror_num);
list_add(&block->all_blocks_node, &state->all_blocks_list);
btrfsic_block_hashtable_add(block, &state->block_hashtable);
}
if (WARN_ON(!match)) {
- pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n",
- bytenr, dev_state->name, dev_bytenr);
+ pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+ bytenr, dev_state->bdev, dev_bytenr);
for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
ret = btrfsic_map_block(state, bytenr,
state->metablock_size,
if (ret)
continue;
- pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n",
- bytenr, block_ctx.dev->name,
+ pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+ bytenr, block_ctx.dev->bdev,
block_ctx.dev_bytenr, mirror_num);
}
}
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->name);
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
} else {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
list_for_each_entry(device, dev_head, dev_list) {
struct btrfsic_dev_state *ds;
- const char *p;
if (!device->bdev || !device->name)
continue;
}
ds->bdev = device->bdev;
ds->state = state;
- bdevname(ds->bdev, ds->name);
- ds->name[BDEVNAME_SIZE - 1] = '\0';
- p = kbasename(ds->name);
- strlcpy(ds->name, p, sizeof(ds->name));
btrfsic_dev_state_hashtable_add(ds,
&btrfsic_dev_state_hashtable);
}
if (b_all->is_iodone || b_all->never_written)
btrfsic_block_free(b_all);
else
- pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n",
+ pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
btrfsic_get_block_type(state, b_all),
- b_all->logical_bytenr, b_all->dev_state->name,
+ b_all->logical_bytenr, b_all->dev_state->bdev,
b_all->dev_bytenr, b_all->mirror_num);
}
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "subpage.h"
#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
if (memcmp(&csum, cb_sum, csum_size) != 0) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
- if (btrfs_io_bio(bio)->device)
+ if (btrfs_bio(bio)->device)
btrfs_dev_stat_inc_and_print(
- btrfs_io_bio(bio)->device,
+ btrfs_bio(bio)->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
return -EIO;
}
return 0;
}
+/*
+ * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+ *
+ * Return true if there is no pending bio nor io.
+ * Return false otherwise.
+ */
+static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ unsigned int bi_size = 0;
+ bool last_io = false;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+ * Thus here we have to iterate through all segments to grab the
+ * correct bio size.
+ */
+ bio_for_each_segment_all(bvec, bio, iter_all)
+ bi_size += bvec->bv_len;
+
+ if (bio->bi_status)
+ cb->errors = 1;
+
+ ASSERT(bi_size && bi_size <= cb->compressed_len);
+ last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+ &cb->pending_sectors);
+ /*
+ * Here we must wake up the possible error handler after all other
+ * operations on @cb have finished, or we can race with
+ * finish_compressed_bio_*() which may free @cb.
+ */
+ wake_up_var(cb);
+
+ return last_io;
+}
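
The wake_up_var() above pairs with the wait_var_event() used later in the
error path of btrfs_submit_compressed_write(), which sleeps until the already
submitted bios have dropped their share of pending_sectors before finishing
the compressed_bio by hand. A generic sketch of that wake/wait contract, with
illustrative names that are not part of btrfs:

	#include <linux/refcount.h>
	#include <linux/wait_bit.h>

	struct demo_ctx {
		refcount_t pending;
	};

	/* Completion side: drop @nr units and wake any waiter keyed on @ctx. */
	static bool demo_complete(struct demo_ctx *ctx, unsigned int nr)
	{
		bool last = refcount_sub_and_test(nr, &ctx->pending);

		/* Wake after all other accesses to @ctx, as the waiter may free it. */
		wake_up_var(ctx);
		return last;
	}

	/* Waiter side: sleep until only @remaining units are still accounted for. */
	static void demo_wait_for_remaining(struct demo_ctx *ctx, unsigned int remaining)
	{
		wait_var_event(ctx, refcount_read(&ctx->pending) == remaining);
	}
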
+
+static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+{
+ unsigned int index;
+ struct page *page;
+
+ /* Release the compressed pages */
+ for (index = 0; index < cb->nr_pages; index++) {
+ page = cb->compressed_pages[index];
+ page->mapping = NULL;
+ put_page(page);
+ }
+
+ /* Do io completion on the original bio */
+ if (cb->errors) {
+ bio_io_error(cb->orig_bio);
+ } else {
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(bio);
+ ASSERT(!bio->bi_status);
+ /*
+ * We have verified the checksum already, set page checked so
+ * the end_io handlers know about it
+ */
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+ u64 bvec_start = page_offset(bvec->bv_page) +
+ bvec->bv_offset;
+
+ btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+ bvec->bv_page, bvec_start,
+ bvec->bv_len);
+ }
+
+ bio_endio(cb->orig_bio);
+ }
+
+ /* Finally free the cb struct */
+ kfree(cb->compressed_pages);
+ kfree(cb);
+}
+
/* when we finish reading compressed pages from the disk, we
* decompress them and then run the bio end_io routines on the
* decompressed pages (in the inode address space).
{
struct compressed_bio *cb = bio->bi_private;
struct inode *inode;
- struct page *page;
- unsigned int index;
- unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+ unsigned int mirror = btrfs_bio(bio)->mirror_num;
int ret = 0;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
+ if (!dec_and_test_compressed_bio(cb, bio))
goto out;
/*
* Record the correct mirror_num in cb->orig_bio so that
* read-repair can work properly.
*/
- btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+ btrfs_bio(cb->orig_bio)->mirror_num = mirror;
cb->mirror_num = mirror;
/*
csum_failed:
if (ret)
cb->errors = 1;
-
- /* release the compressed pages */
- index = 0;
- for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
- page->mapping = NULL;
- put_page(page);
- }
-
- /* do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * we have verified the checksum already, set page
- * checked so the end_io handlers know about it
- */
- ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
- SetPageChecked(bvec->bv_page);
-
- bio_endio(cb->orig_bio);
- }
-
- /* finally free the cb struct */
- kfree(cb->compressed_pages);
- kfree(cb);
+ finish_compressed_bio_read(cb, bio);
out:
bio_put(bio);
}
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long index = cb->start >> PAGE_SHIFT;
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
for (i = 0; i < ret; i++) {
if (cb->errors)
SetPageError(pages[i]);
- end_page_writeback(pages[i]);
+ btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+ cb->start, cb->len);
put_page(pages[i]);
}
nr_pages -= ret;
/* the inode may be gone now */
}
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
{
- struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- struct page *page;
+ struct inode *inode = cb->inode;
unsigned int index;
- if (bio->bi_status)
- cb->errors = 1;
-
- /* if there are more bios still pending for this compressed
- * extent, just exit
- */
- if (!refcount_dec_and_test(&cb->pending_bios))
- goto out;
-
- /* ok, we're the last bio for this extent, step one is to
- * call back into the FS and do all the end_io operations
+ /*
+ * Ok, we're the last bio for this extent, step one is to call back
+ * into the FS and do all the end_io operations.
*/
- inode = cb->inode;
- btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
!cb->errors);
end_compressed_writeback(inode, cb);
- /* note, our inode could be gone now */
+ /* Note, our inode could be gone now */
/*
- * release the compressed pages, these came from alloc_page and
+ * Release the compressed pages, these came from alloc_page and
* are not attached to the inode at all
*/
- index = 0;
for (index = 0; index < cb->nr_pages; index++) {
- page = cb->compressed_pages[index];
+ struct page *page = cb->compressed_pages[index];
+
page->mapping = NULL;
put_page(page);
}
- /* finally free the cb struct */
+ /* Finally free the cb struct */
kfree(cb->compressed_pages);
kfree(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk. This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio)
+{
+ struct compressed_bio *cb = bio->bi_private;
+
+ if (!dec_and_test_compressed_bio(cb, bio))
+ goto out;
+
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+
+ finish_compressed_bio_write(cb);
out:
bio_put(bio);
}
+static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+ struct compressed_bio *cb,
+ struct bio *bio, int mirror_num)
+{
+ blk_status_t ret;
+
+ ASSERT(bio->bi_iter.bi_size);
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret)
+ return ret;
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ return ret;
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) data.
+ *
+ * @cb: The compressed_bio structure, which records all the needed
+ * information to bind the compressed data to the uncompressed
+ * page cache.
+ * @disk_bytenr: The logical bytenr where the compressed data will be read
+ * from or written to.
+ * @endio_func: The endio function to call after the IO for compressed data
+ * is finished.
+ * @next_stripe_start: Output parameter, set to the logical bytenr where the
+ *  next stripe starts, so the caller knows to fill the bio
+ *  only up to the stripe boundary.
+ */
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+ unsigned int opf, bio_end_io_t endio_func,
+ u64 *next_stripe_start)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+ struct btrfs_io_geometry geom;
+ struct extent_map *em;
+ struct bio *bio;
+ int ret;
+
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+ bio->bi_opf = opf;
+ bio->bi_private = cb;
+ bio->bi_end_io = endio_func;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+ if (IS_ERR(em)) {
+ bio_put(bio);
+ return ERR_CAST(em);
+ }
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+ ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+ free_extent_map(em);
+ if (ret < 0) {
+ bio_put(bio);
+ return ERR_PTR(ret);
+ }
+ *next_stripe_start = disk_bytenr + geom.len;
+
+ return bio;
+}
+
/*
* worker function to build and submit bios for previously compressed pages.
* The corresponding pages in the inode should be marked for writeback
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
struct compressed_bio *cb;
- unsigned long bytes_left;
- int pg_index = 0;
- struct page *page;
- u64 first_byte = disk_start;
+ u64 cur_disk_bytenr = disk_start;
+ u64 next_stripe_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
const bool use_append = btrfs_use_zone_append(inode, disk_start);
const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
- WARN_ON(!PAGE_ALIGNED(start));
+ ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+ IS_ALIGNED(len, fs_info->sectorsize));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
if (!cb)
return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
-
- if (use_append) {
- struct btrfs_device *device;
-
- device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
- if (IS_ERR(device)) {
- kfree(cb);
- bio_put(bio);
- return BLK_STS_NOTSUPP;
+ while (cur_disk_bytenr < disk_start + compressed_len) {
+ u64 offset = cur_disk_bytenr - disk_start;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!bio) {
+ bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+ bio_op | write_flags, end_compressed_bio_write,
+ &next_stripe_start);
+ if (IS_ERR(bio)) {
+ ret = errno_to_blk_status(PTR_ERR(bio));
+ bio = NULL;
+ goto finish_cb;
+ }
}
-
- bio_set_dev(bio, device->bdev);
- }
-
- if (blkcg_css) {
- bio->bi_opf |= REQ_CGROUP_PUNT;
- kthread_associate_blkcg(blkcg_css);
- }
- refcount_set(&cb->pending_bios, 1);
-
- /* create and submit bios for the compressed pages */
- bytes_left = compressed_len;
- for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
- int submit = 0;
- int len = 0;
-
- page = compressed_pages[pg_index];
- page->mapping = inode->vfs_inode.i_mapping;
- if (bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
- 0);
-
/*
- * Page can only be added to bio if the current bio fits in
- * stripe.
+ * We should never reach next_stripe_start as we will
+ * submit the current bio immediately when we reach the boundary.
*/
- if (!submit) {
- if (pg_index == 0 && use_append)
- len = bio_add_zone_append_page(bio, page,
- PAGE_SIZE, 0);
- else
- len = bio_add_page(bio, page, PAGE_SIZE, 0);
- }
-
- page->mapping = NULL;
- if (submit || len < PAGE_SIZE) {
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
- ret = btrfs_bio_wq_end_io(fs_info, bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
+ ASSERT(cur_disk_bytenr != next_stripe_start);
+ /*
+ * We have various limits on the real write size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+ if (use_append)
+ added = bio_add_zone_append_page(bio, page, real_size,
+ offset_in_page(offset));
+ else
+ added = bio_add_page(bio, page, real_size,
+ offset_in_page(offset));
+ /* Reached zoned boundary */
+ if (added == 0)
+ submit = true;
+
+ cur_disk_bytenr += added;
+ /* Reached stripe boundary */
+ if (cur_disk_bytenr == next_stripe_start)
+ submit = true;
+
+ /* Finished the range */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ submit = true;
+
+ if (submit) {
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
-
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
- bio->bi_status = ret;
- bio_endio(bio);
+ if (ret)
+ goto finish_cb;
}
- bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = bio_op | write_flags;
- bio->bi_private = cb;
- bio->bi_end_io = end_compressed_bio_write;
- if (blkcg_css)
- bio->bi_opf |= REQ_CGROUP_PUNT;
- /*
- * Use bio_add_page() to ensure the bio has at least one
- * page.
- */
- bio_add_page(bio, page, PAGE_SIZE, 0);
+ ret = submit_compressed_bio(fs_info, cb, bio, 0);
+ if (ret)
+ goto finish_cb;
+ bio = NULL;
}
- if (bytes_left < PAGE_SIZE) {
- btrfs_info(fs_info,
- "bytes left %lu compress len %u nr %u",
- bytes_left, cb->compressed_len, cb->nr_pages);
- }
- bytes_left -= PAGE_SIZE;
- first_byte += PAGE_SIZE;
cond_resched();
}
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
- BUG_ON(ret); /* -ENOMEM */
- }
+ return 0;
- ret = btrfs_map_bio(fs_info, bio, 0);
- if (ret) {
+finish_cb:
+ if (bio) {
bio->bi_status = ret;
bio_endio(bio);
}
+ /* Last byte of @cb is submitted, endio will free @cb */
+ if (cur_disk_bytenr == disk_start + compressed_len)
+ return ret;
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
-
- return 0;
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_start + compressed_len - cur_disk_bytenr) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with the previous bio ended, we should still have some io
+ * not yet submitted, thus we need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring to @cb, we can finish it safely. */
+ finish_compressed_bio_write(cb);
+ return ret;
}
static u64 bio_end_offset(struct bio *bio)
return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for a subpage read we lock the
+ * full page and then submit a bio for each compressed/regular extent.
+ *
+ * This means that if several sectors in the same page point to the same
+ * on-disk compressed data, we will re-read the same extent many times and
+ * this function can only help for the next page.
+ */
static noinline int add_ra_bio_pages(struct inode *inode,
u64 compressed_end,
struct compressed_bio *cb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
unsigned long end_index;
- unsigned long pg_index;
- u64 last_offset;
+ u64 cur = bio_end_offset(cb->orig_bio);
u64 isize = i_size_read(inode);
int ret;
struct page *page;
- unsigned long nr_pages = 0;
struct extent_map *em;
struct address_space *mapping = inode->i_mapping;
struct extent_map_tree *em_tree;
struct extent_io_tree *tree;
- u64 end;
- int misses = 0;
+ int sectors_missed = 0;
- last_offset = bio_end_offset(cb->orig_bio);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
- while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_SHIFT;
+ while (cur < compressed_end) {
+ u64 page_end;
+ u64 pg_index = cur >> PAGE_SHIFT;
+ u32 add_size;
if (pg_index > end_index)
break;
page = xa_load(&mapping->i_pages, pg_index);
if (page && !xa_is_value(page)) {
- misses++;
- if (misses > 4)
+ sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+ fs_info->sectorsize_bits;
+
+ /* Beyond threshold, no need to continue */
+ if (sectors_missed > 4)
break;
- goto next;
+
+ /*
+ * Jump to next page start as we already have page for
+ * current offset.
+ */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
page = __page_cache_alloc(mapping_gfp_constraint(mapping,
if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
put_page(page);
- goto next;
+ /* There is already a page, skip to page end */
+ cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+ continue;
}
- /*
- * at this point, we have a locked page in the page cache
- * for these bytes in the file. But, we have to make
- * sure they map to this compressed extent on disk.
- */
ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
break;
}
- end = last_offset + PAGE_SIZE - 1;
- lock_extent(tree, last_offset, end);
+ page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+ lock_extent(tree, cur, page_end);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_SIZE);
+ em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
- if (!em || last_offset < em->start ||
- (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+ /*
+ * At this point, we have a locked page in the page cache for
+ * these bytes in the file. But, we have to make sure they map
+ * to this compressed extent on disk.
+ */
+ if (!em || cur < em->start ||
+ (cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
- unlock_extent(tree, last_offset, end);
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
}
}
- ret = bio_add_page(cb->orig_bio, page,
- PAGE_SIZE, 0);
-
- if (ret == PAGE_SIZE) {
- nr_pages++;
- put_page(page);
- } else {
- unlock_extent(tree, last_offset, end);
+ add_size = min(em->start + em->len, page_end + 1) - cur;
+ ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+ if (ret != add_size) {
+ unlock_extent(tree, cur, page_end);
unlock_page(page);
put_page(page);
break;
}
-next:
- last_offset += PAGE_SIZE;
+ /*
+ * If it's subpage, we also need to increase its
+ * subpage::readers number, as at endio time we will decrease
+ * subpage::readers and then unlock the page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE)
+ btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ put_page(page);
+ cur += add_size;
}
return 0;
}
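The subpage::readers count taken just above must be dropped again when the read completes, otherwise the page can never be unlocked. As a hedged illustration only (this helper is hypothetical and not part of the patch; it assumes the existing btrfs_subpage_end_reader() helper with fs_info/page/start/len arguments), the completion side of that pairing looks roughly like:

static void end_readahead_range(struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	/* Drop the readers taken in add_ra_bio_pages() for exactly this range */
	if (fs_info->sectorsize < PAGE_SIZE)
		btrfs_subpage_end_reader(fs_info, page, start, len);
}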
unsigned int compressed_len;
unsigned int nr_pages;
unsigned int pg_index;
- struct page *page;
- struct bio *comp_bio;
- u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+ struct bio *comp_bio = NULL;
+ const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ u64 cur_disk_byte = disk_bytenr;
+ u64 next_stripe_start;
u64 file_offset;
u64 em_len;
u64 em_start;
if (!cb)
goto out;
- refcount_set(&cb->pending_bios, 0);
+ refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
- refcount_set(&cb->pending_bios, 1);
-
- for (pg_index = 0; pg_index < nr_pages; pg_index++) {
- u32 pg_len = PAGE_SIZE;
- int submit = 0;
+ while (cur_disk_byte < disk_bytenr + compressed_len) {
+ u64 offset = cur_disk_byte - disk_bytenr;
+ unsigned int index = offset >> PAGE_SHIFT;
+ unsigned int real_size;
+ unsigned int added;
+ struct page *page = cb->compressed_pages[index];
+ bool submit = false;
+
+ /* Allocate new bio if submitted or not yet allocated */
+ if (!comp_bio) {
+ comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+ REQ_OP_READ, end_compressed_bio_read,
+ &next_stripe_start);
+ if (IS_ERR(comp_bio)) {
+ ret = errno_to_blk_status(PTR_ERR(comp_bio));
+ comp_bio = NULL;
+ goto finish_cb;
+ }
+ }
+ /*
+ * We should never reach next_stripe_start as we will submit
+ * comp_bio immediately when we reach the boundary.
+ */
+ ASSERT(cur_disk_byte != next_stripe_start);
+ /*
+ * We have various limits on the real read size:
+ * - stripe boundary
+ * - page boundary
+ * - compressed length boundary
+ */
+ real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+ real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+ real_size = min_t(u64, real_size, compressed_len - offset);
+ ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+ added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
/*
- * To handle subpage case, we need to make sure the bio only
- * covers the range we need.
- *
- * If we're at the last page, truncate the length to only cover
- * the remaining part.
+ * Maximum compressed extent is smaller than bio size limit,
+ * thus bio_add_page() should always succeed.
*/
- if (pg_index == nr_pages - 1)
- pg_len = min_t(u32, PAGE_SIZE,
- compressed_len - pg_index * PAGE_SIZE);
+ ASSERT(added == real_size);
+ cur_disk_byte += added;
- page = cb->compressed_pages[pg_index];
- page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_SHIFT;
+ /* Reached stripe boundary, need to submit */
+ if (cur_disk_byte == next_stripe_start)
+ submit = true;
- if (comp_bio->bi_iter.bi_size)
- submit = btrfs_bio_fits_in_stripe(page, pg_len,
- comp_bio, 0);
+ /* Finished the range, need to submit */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ submit = true;
- page->mapping = NULL;
- if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+ if (submit) {
unsigned int nr_sectors;
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
- BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- /*
- * inc the count before we submit the bio so
- * we know the end IO handler won't happen before
- * we inc the count. Otherwise, the cb might get
- * freed before we're done setting it up
- */
- refcount_inc(&cb->pending_bios);
-
ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret)
+ goto finish_cb;
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
- comp_bio = btrfs_bio_alloc(cur_disk_byte);
- comp_bio->bi_opf = REQ_OP_READ;
- comp_bio->bi_private = cb;
- comp_bio->bi_end_io = end_compressed_bio_read;
-
- bio_add_page(comp_bio, page, pg_len, 0);
+ ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+ if (ret)
+ goto finish_cb;
+ comp_bio = NULL;
}
- cur_disk_byte += pg_len;
}
-
- ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
- BUG_ON(ret); /* -ENOMEM */
-
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
- if (ret) {
- comp_bio->bi_status = ret;
- bio_endio(comp_bio);
- }
-
return 0;
fail2:
out:
free_extent_map(em);
return ret;
+finish_cb:
+ if (comp_bio) {
+ comp_bio->bi_status = ret;
+ bio_endio(comp_bio);
+ }
+ /* All bytes of @cb are submitted, endio will free @cb */
+ if (cur_disk_byte == disk_bytenr + compressed_len)
+ return ret;
+
+ wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+ (disk_bytenr + compressed_len - cur_disk_byte) >>
+ fs_info->sectorsize_bits);
+ /*
+ * Even with the previous bio ended, we should still have I/O not yet
+ * submitted, thus we need to finish @cb manually.
+ */
+ ASSERT(refcount_read(&cb->pending_sectors));
+ /* Now we are the only one referring @cb, can finish it safely. */
+ finish_compressed_bio_read(cb, NULL);
+ return ret;
}
/*
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
struct compressed_bio {
- /* number of bios pending for this compressed extent */
- refcount_t pending_bios;
+ /* Number of sectors with IO not yet submitted or not yet finished */
+ refcount_t pending_sectors;
/* Number of compressed pages in the array */
unsigned int nr_pages;
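With pending_sectors replacing pending_bios, completion and error handling are accounted per sector rather than per bio, which is what allows the submit paths above to wait for in-flight sectors and then finish the compressed_bio manually. A minimal sketch of the dec-and-test pattern this enables, with a hypothetical helper name (the real decrements live in the compression endio paths):

static void dec_pending_sectors(struct compressed_bio *cb, unsigned int nr_sectors)
{
	if (refcount_sub_and_test(nr_sectors, &cb->pending_sectors)) {
		/* Last pending sector: safe to finish and free @cb */
		finish_compressed_bio_read(cb, NULL);
	} else {
		/* Wake a submitter stuck in wait_var_event(cb, ...) */
		wake_up_var(cb);
	}
}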
if (*cow_ret == buf)
unlock_orig = 1;
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
trans->transid != fs_info->running_transaction->transid);
int ret;
BUG_ON(!path->nodes[level]);
- btrfs_assert_tree_locked(path->nodes[level]);
+ btrfs_assert_tree_write_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
if (slot >= btrfs_header_nritems(upper) - 1)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
right = btrfs_read_node_slot(upper, slot + 1);
/*
if (right_nritems == 0)
return 1;
- btrfs_assert_tree_locked(path->nodes[1]);
+ btrfs_assert_tree_write_locked(path->nodes[1]);
left = btrfs_read_node_slot(path->nodes[1], slot - 1);
/*
}
/*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct btrfs_key *new_key)
-{
- struct extent_buffer *leaf;
- int ret;
- u32 item_size;
-
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ret = setup_leaf_for_split(trans, root, path,
- item_size + sizeof(struct btrfs_item));
- if (ret)
- return ret;
-
- path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size, 1);
- leaf = path->nodes[0];
- memcpy_extent_buffer(leaf,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
- item_size);
- return 0;
-}
-
-/*
* make the item pointed to by the path smaller. new_size indicates
* how small to make it, and from_end tells us if we just chop bytes
* off the end of the item or if we shift the item to chop bytes off
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
- * @cpu_key: array of keys for items to be inserted
- * @data_size: size of the body of each item we are going to insert
- * @nr: size of @cpu_key/@data_size arrays
+ * @batch: information about the batch of items to insert
*/
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+ const struct btrfs_item_batch *batch)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
int slot;
struct btrfs_map_token token;
u32 total_size;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
- total_size = total_data + (nr * sizeof(struct btrfs_item));
+ /*
+ * Before anything else, update keys in the parent and other ancestors
+ * if needed, then release the write locks on them, so that other tasks
+ * can use them while we modify the leaf.
+ */
if (path->slots[0] == 0) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
fixup_low_keys(path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(leaf);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
if (btrfs_leaf_free_space(leaf) < total_size) {
btrfs_print_leaf(leaf);
item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(&token, item);
btrfs_set_token_item_offset(&token, item,
- ioff - total_data);
+ ioff - batch->total_data_size);
}
/* shift the items */
- memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
- data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
- data_end, old_data - data_end);
+ data_end - batch->total_data_size,
+ BTRFS_LEAF_DATA_OFFSET + data_end,
+ old_data - data_end);
data_end = old_data;
}
/* setup the item for the new data */
- for (i = 0; i < nr; i++) {
- btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+ for (i = 0; i < batch->nr; i++) {
+ btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- data_end -= data_size[i];
+ data_end -= batch->data_sizes[i];
btrfs_set_token_item_offset(&token, item, data_end);
- btrfs_set_token_item_size(&token, item, data_size[i]);
+ btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
}
- btrfs_set_header_nritems(leaf, nritems + nr);
+ btrfs_set_header_nritems(leaf, nritems + batch->nr);
btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(leaf) < 0) {
}
/*
+ * Insert a new item into a leaf.
+ *
+ * @root: The root of the btree.
+ * @path: A path pointing to the target leaf and slot.
+ * @key: The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size)
+{
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ setup_items_for_insert(root, path, &batch);
+}
+
+/*
* Given a key and some data, insert items into the tree.
* This does all the path init required, making room in the tree if needed.
*/
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+ const struct btrfs_item_batch *batch)
{
int ret = 0;
int slot;
- int i;
- u32 total_size = 0;
- u32 total_data = 0;
-
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
+ u32 total_size;
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
if (ret == 0)
return -EEXIST;
if (ret < 0)
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size, nr);
+ setup_items_for_insert(root, path, batch);
return 0;
}
}
/*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *new_key)
+{
+ struct extent_buffer *leaf;
+ int ret;
+ u32 item_size;
+
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ ret = setup_leaf_for_split(trans, root, path,
+ item_size + sizeof(struct btrfs_item));
+ if (ret)
+ return ret;
+
+ path->slots[0]++;
+ btrfs_setup_item_for_insert(root, path, new_key, item_size);
+ leaf = path->nodes[0];
+ memcpy_extent_buffer(leaf,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+ item_size);
+ return 0;
+}
+
+/*
* delete the pointer from a given node.
*
* the tree should have been previously balanced so the deletion does not
extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
+struct btrfs_bio;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
u8 unused_8[10];
} __attribute__ ((__packed__));
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
+#define BTRFS_SUPER_INFO_SIZE 4096
+
/*
* the super block basically lists the main trees of the FS
* it currently lacks any block count etc etc
__le64 reserved[28];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+ /* Padded to 4096 bytes */
+ u8 padding[565];
} __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
* Compat flags that we support. If any incompat flags are set other than the
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
+ /*
+ * Start of the dedicated data relocation block group, protected by
+ * relocation_bg_lock.
+ */
+ spinlock_t relocation_bg_lock;
+ u64 data_reloc_bg;
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+ /*
+ * Pointer to an array containing the keys of the items to insert (in
+ * sorted order).
+ */
+ const struct btrfs_key *keys;
+ /* Pointer to an array containing the data size for each item to insert. */
+ const u32 *data_sizes;
+ /*
+ * The sum of data sizes for all items. The caller can compute this while
+ * setting up the data_sizes array, which is more efficient than having
+ * btrfs_insert_empty_items() or setup_items_for_insert() do it: it avoids
+ * an extra loop over a potentially large array, and in the case of
+ * setup_items_for_insert() that loop would run while holding a write lock
+ * on a leaf and often on upper level nodes too, unnecessarily increasing
+ * the size of a critical section.
+ */
+ u32 total_data_size;
+ /* Size of the keys and data_sizes arrays (number of items in the batch). */
+ int nr;
+};
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ u32 data_size);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- const struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
+ const struct btrfs_item_batch *batch);
static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const struct btrfs_key *key,
u32 data_size)
{
- return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+ struct btrfs_item_batch batch;
+
+ batch.keys = key;
+ batch.data_sizes = &data_size;
+ batch.total_data_size = data_size;
+ batch.nr = 1;
+
+ return btrfs_insert_empty_items(trans, root, path, &batch);
}
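To make the new batch interface concrete from the caller side, here is a hedged sketch (the function, keys and sizes are made up for illustration; only the btrfs_item_batch fields and the btrfs_insert_empty_items() call shape come from this patch):

static int insert_two_items_sketch(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   const struct btrfs_key keys[2],
				   const u32 sizes[2])
{
	struct btrfs_item_batch batch = {
		.keys = keys,
		.data_sizes = sizes,
		.total_data_size = sizes[0] + sizes[1],
		.nr = 2,
	};

	/* Reserves leaf space and sets up both items under a single leaf lock */
	return btrfs_insert_empty_items(trans, root, path, &batch);
}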
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len);
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_pages);
+ u64 newer_than, unsigned long max_to_defrag);
void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
(errno), fmt, ##args); \
} while (0)
+#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+ &(fs_info)->fs_state)))
+
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
return fs_info->zoned != 0;
}
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+ return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
/*
* We use page status Private2 to indicate there is an ordered extent with
* unfinished IO.
struct btrfs_path *path,
struct btrfs_delayed_item *first_item)
{
- LIST_HEAD(batch);
+ LIST_HEAD(item_list);
struct btrfs_delayed_item *curr;
struct btrfs_delayed_item *next;
const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+ struct btrfs_item_batch batch;
int total_size;
- int nitems;
char *ins_data = NULL;
- struct btrfs_key *ins_keys;
- u32 *ins_sizes;
int ret;
- list_add_tail(&first_item->tree_list, &batch);
- nitems = 1;
+ list_add_tail(&first_item->tree_list, &item_list);
+ batch.total_data_size = first_item->data_len;
+ batch.nr = 1;
total_size = first_item->data_len + sizeof(struct btrfs_item);
curr = first_item;
if (total_size + next_size > max_size)
break;
- list_add_tail(&next->tree_list, &batch);
- nitems++;
+ list_add_tail(&next->tree_list, &item_list);
+ batch.nr++;
total_size += next_size;
+ batch.total_data_size += next->data_len;
curr = next;
}
- if (nitems == 1) {
- ins_keys = &first_item->key;
- ins_sizes = &first_item->data_len;
+ if (batch.nr == 1) {
+ batch.keys = &first_item->key;
+ batch.data_sizes = &first_item->data_len;
} else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
int i = 0;
- ins_data = kmalloc(nitems * sizeof(u32) +
- nitems * sizeof(struct btrfs_key), GFP_NOFS);
+ ins_data = kmalloc(batch.nr * sizeof(u32) +
+ batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
if (!ins_data) {
ret = -ENOMEM;
goto out;
}
ins_sizes = (u32 *)ins_data;
- ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
- list_for_each_entry(curr, &batch, tree_list) {
+ ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ list_for_each_entry(curr, &item_list, tree_list) {
ins_keys[i] = curr->key;
ins_sizes[i] = curr->data_len;
i++;
}
}
- ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
- nitems);
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret)
goto out;
- list_for_each_entry(curr, &batch, tree_list) {
+ list_for_each_entry(curr, &item_list, tree_list) {
char *data_ptr;
data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
*/
btrfs_release_path(path);
- list_for_each_entry_safe(curr, next, &batch, tree_list) {
+ list_for_each_entry_safe(curr, next, &item_list, tree_list) {
list_del(&curr->tree_list);
btrfs_delayed_item_release_metadata(root, curr);
btrfs_release_delayed_item(curr);
u64 parent = generic_ref->parent;
u8 ref_type;
- is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+ is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data);
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(generic_ref->real_root) &&
- is_fstree(generic_ref->tree_ref.root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
- generic_ref->tree_ref.root, action, ref_type);
- ref->root = generic_ref->tree_ref.root;
+ generic_ref->tree_ref.owning_root, action,
+ ref_type);
+ ref->root = generic_ref->tree_ref.owning_root;
ref->parent = parent;
ref->level = level;
init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- generic_ref->tree_ref.root, 0, action, false,
- is_system);
+ generic_ref->tree_ref.owning_root, 0, action,
+ false, is_system);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
u64 bytenr = generic_ref->bytenr;
u64 num_bytes = generic_ref->len;
u64 parent = generic_ref->parent;
- u64 ref_root = generic_ref->data_ref.ref_root;
+ u64 ref_root = generic_ref->data_ref.owning_root;
u64 owner = generic_ref->data_ref.ino;
u64 offset = generic_ref->data_ref.offset;
u8 ref_type;
}
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- is_fstree(ref_root) &&
- is_fstree(generic_ref->real_root) &&
!generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
struct btrfs_data_ref {
/* For EXTENT_DATA_REF */
- /* Root which refers to this data extent */
- u64 ref_root;
+ /* Original root this data extent belongs to */
+ u64 owning_root;
/* Inode which refers to this data extent */
u64 ino;
int level;
/*
- * Root which refers to this tree block.
+ * Root which owns this tree block.
*
* For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
*/
- u64 root;
+ u64 owning_root;
/* For non-skinny metadata, no special member needed */
};
*/
bool skip_qgroup;
- /*
- * Optional. For which root is this modification.
- * Mostly used for qgroup optimization.
- *
- * When unset, data/tree ref init code will populate it.
- * In certain cases, we're modifying reference for a different root.
- * E.g. COW fs tree blocks for balance.
- * In that case, tree_ref::root will be fs tree, but we're doing this
- * for reloc tree, then we should set @real_root to reloc tree.
- */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* The root through which this modification is made. */
u64 real_root;
+#endif
u64 bytenr;
u64 len;
}
static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
- int level, u64 root)
+ int level, u64 root, u64 mod_root, bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @mod_root is not set, use @root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = root;
+ generic_ref->real_root = mod_root ?: root;
+#endif
generic_ref->tree_ref.level = level;
- generic_ref->tree_ref.root = root;
+ generic_ref->tree_ref.owning_root = root;
generic_ref->type = BTRFS_REF_METADATA;
+ if (skip_qgroup || !(is_fstree(root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
- u64 ref_root, u64 ino, u64 offset)
+ u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+ bool skip_qgroup)
{
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ /* If @mod_root is not set, use @ref_root as fallback */
- if (!generic_ref->real_root)
- generic_ref->real_root = ref_root;
- generic_ref->data_ref.ref_root = ref_root;
+ generic_ref->real_root = mod_root ?: ref_root;
+#endif
+ generic_ref->data_ref.owning_root = ref_root;
generic_ref->data_ref.ino = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
+ if (skip_qgroup || !(is_fstree(ref_root) &&
+ (!mod_root || is_fstree(mod_root))))
+ generic_ref->skip_qgroup = true;
+ else
+ generic_ref->skip_qgroup = false;
}
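For reference, a hedged sketch of what a call site looks like with the new signatures (the values and variable names are illustrative, not taken from this patch): the owning root, the root actually doing the modification, and whether qgroup accounting should be skipped are now passed explicitly.

	struct btrfs_ref ref = { 0 };

	/* Drop one ref to a tree block owned by the fs tree, modified via a reloc root */
	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
			       fs_info->nodesize, parent);
	btrfs_init_tree_ref(&ref, level, BTRFS_FS_TREE_OBJECTID,
			    reloc_root_objectid /* mod_root */, false /* skip_qgroup */);
	ret = btrfs_free_extent(trans, &ref);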
static inline struct btrfs_delayed_extent_op *
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
* We don't have a replace item or it's corrupted. If there is
* a replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
ret = -EUCLEAN;
* We don't have an active replace item but if there is a
* replace target, fail the mount.
*/
- if (btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+ if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"replace devid present without an active replace item");
ret = -EUCLEAN;
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
- src_devid, NULL, NULL);
- dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
- BTRFS_DEV_REPLACE_DEVID,
- NULL, NULL);
+ dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+ args.devid = src_devid;
+ dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
return ret;
}
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror)
{
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
return __set_page_dirty_nobuffers(page);
}
ASSERT(PagePrivate(page) && page->private);
ASSERT(eb);
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
ASSERT(atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ btrfs_assert_tree_write_locked(eb);
free_extent_buffer(eb);
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
struct btrfs_fs_info *fs_info = buf->fs_info;
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
goto fail;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
- root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ !btrfs_is_data_reloc_root(root)) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
+ kfree(fs_info->subpage_info);
kvfree(fs_info);
}
wake_up_process(fs_info->cleaner_kthread);
mutex_unlock(&fs_info->transaction_kthread_mutex);
- if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
- &fs_info->fs_state)))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_cleanup_transaction(fs_info);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
/*
* For 4K page size, we only support 4K sector size.
- * For 64K page size, we support read-write for 64K sector size, and
- * read-only for 4K sector size.
+ * For 64K page size, we support 64K and 4K sector sizes.
*/
if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
(PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
+ spin_lock_init(&fs_info->zone_active_bgs_lock);
+ spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
btrfs_init_btree_inode(fs_info);
- invalidate_bdev(fs_devices->latest_bdev);
+ invalidate_bdev(fs_devices->latest_dev->bdev);
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+ disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
if (IS_ERR(disk_super)) {
err = PTR_ERR(disk_super);
goto fail_alloc;
goto fail_alloc;
}
- if (sectorsize != PAGE_SIZE) {
+ if (sectorsize < PAGE_SIZE) {
+ struct btrfs_subpage_info *subpage_info;
+
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
- }
- if (sectorsize != PAGE_SIZE) {
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
err = -EINVAL;
goto fail_alloc;
}
+ subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+ if (!subpage_info)
+ goto fail_alloc;
+ btrfs_init_subpage_info(subpage_info, sectorsize);
+ fs_info->subpage_info = subpage_info;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
* below in btrfs_init_dev_replace().
*/
btrfs_free_extra_devids(fs_devices);
- if (!fs_devices->latest_bdev) {
+ if (!fs_devices->latest_dev->bdev) {
btrfs_err(fs_info, "failed to read devices");
goto fail_tree_roots;
}
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
- btrfs_advance_sb_log(device, i);
+
+ if (btrfs_advance_sb_log(device, i))
+ errors++;
}
return errors < i ? 0 : -1;
}
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
if (root->reloc_root) {
btrfs_put_root(root->reloc_root);
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
- test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
btrfs_error_commit_super(fs_info);
kthread_stop(fs_info->transaction_kthread);
if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
return;
#endif
- btrfs_assert_tree_locked(buf);
+ btrfs_assert_tree_write_locked(buf);
if (transid != fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
buf->start, transid, fs_info->generation);
#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
return ret;
}
-static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
{
struct btrfs_device *dev = stripe->dev;
struct btrfs_fs_info *fs_info = dev->fs_info;
u64 discarded_bytes = 0;
u64 end = bytenr + num_bytes;
u64 cur = bytenr;
- struct btrfs_bio *bbio = NULL;
-
+ struct btrfs_io_context *bioc = NULL;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
while (cur < end) {
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
int i;
num_bytes = end - cur;
/* Tell the block device(s) that the sectors can be discarded */
ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
- &num_bytes, &bbio, 0);
+ &num_bytes, &bioc, 0);
/*
* Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
* -EOPNOTSUPP. For any such error, @num_bytes is not updated,
if (ret < 0)
goto out;
- stripe = bbio->stripes;
- for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ stripe = bioc->stripes;
+ for (i = 0; i < bioc->num_stripes; i++, stripe++) {
u64 bytes;
struct btrfs_device *device = stripe->dev;
* And since there are two loops, explicitly
* go to out to avoid confusion.
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
*/
ret = 0;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
cur += num_bytes;
}
out:
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
generic_ref->action);
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
- generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
+ generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
if (generic_ref->type == BTRFS_REF_METADATA)
ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
out:
btrfs_free_path(path);
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
}
key.offset -= btrfs_file_extent_offset(buf, fi);
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
- key.offset);
- generic_ref.skip_qgroup = for_reloc;
+ key.offset, root->root_key.objectid,
+ for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
num_bytes = fs_info->nodesize;
btrfs_init_generic_ref(&generic_ref, action, bytenr,
num_bytes, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
- generic_ref.skip_qgroup = for_reloc;
+ btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+ root->root_key.objectid, for_reloc);
if (inc)
ret = btrfs_inc_extent_ref(trans, &generic_ref);
else
goto out;
}
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
buf->start, buf->len, parent);
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
- root->root_key.objectid);
+ root->root_key.objectid, 0, false);
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
btrfs_ref_tree_mod(fs_info, &generic_ref);
* tree, just update pinning info and exit early.
*/
if ((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
/* unlocks the pinned mutex */
btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
ret = 0;
}
if (!((ref->type == BTRFS_REF_METADATA &&
- ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+ ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
(ref->type == BTRFS_REF_DATA &&
- ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+ ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
btrfs_ref_tree_mod(fs_info, ref);
return ret;
*/
struct find_free_extent_ctl {
/* Basic allocation info */
+ u64 ram_bytes;
u64 num_bytes;
+ u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
/* Allocation is called for tree-log */
bool for_treelog;
+ /* Allocation is called for data relocation */
+ bool for_data_reloc;
+
/* RAID index, converted from flags */
int index;
u64 avail;
u64 bytenr = block_group->start;
u64 log_bytenr;
+ u64 data_reloc_bytenr;
int ret = 0;
- bool skip;
+ bool skip = false;
ASSERT(btrfs_is_zoned(block_group->fs_info));
*/
spin_lock(&fs_info->treelog_bg_lock);
log_bytenr = fs_info->treelog_bg;
- skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
- (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+ if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+ (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+ skip = true;
spin_unlock(&fs_info->treelog_bg_lock);
if (skip)
return 1;
+ /*
+ * Do not allow non-relocation blocks in the dedicated relocation block
+ * group, and vice versa.
+ */
+ spin_lock(&fs_info->relocation_bg_lock);
+ data_reloc_bytenr = fs_info->data_reloc_bg;
+ if (data_reloc_bytenr &&
+ ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+ (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+ skip = true;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro ||
+ block_group->alloc_offset == block_group->zone_capacity) {
+ spin_unlock(&block_group->lock);
+ return 1;
+ }
+ spin_unlock(&block_group->lock);
+
+ if (!btrfs_zone_activate(block_group))
+ return 1;
+
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
ASSERT(!ffe_ctl->for_treelog ||
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
+ ASSERT(!ffe_ctl->for_data_reloc ||
+ block_group->start == fs_info->data_reloc_bg ||
+ fs_info->data_reloc_bg == 0);
if (block_group->ro) {
ret = 1;
goto out;
}
- avail = block_group->length - block_group->alloc_offset;
+ /*
+ * Do not allow a currently used block group to be the dedicated data
+ * relocation block group.
+ */
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ (block_group->used || block_group->reserved)) {
+ ret = 1;
+ goto out;
+ }
+
+ WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+ avail = block_group->zone_capacity - block_group->alloc_offset;
if (avail < num_bytes) {
if (ffe_ctl->max_extent_size < avail) {
/*
if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
fs_info->treelog_bg = block_group->start;
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+ fs_info->data_reloc_bg = block_group->start;
+
ffe_ctl->found_offset = start + block_group->alloc_offset;
block_group->alloc_offset += num_bytes;
spin_lock(&ctl->tree_lock);
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
+ if (ret && ffe_ctl->for_data_reloc)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
ffe_ctl->orig_have_caching_bg = true;
- if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
- ffe_ctl->have_caching_bg)
- return 1;
-
- if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
- return 1;
-
if (ins->objectid) {
found_extent(ffe_ctl, ins);
return 0;
}
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
+ /*
+ * If we have enough free space left in an already active block
+ * group and we can't activate any other zone now, retry the
+ * active ones with a smaller allocation size. Returning early
+ * from here will tell btrfs_reserve_extent() to halve the
+ * size.
+ */
+ return -ENOSPC;
+ }
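The size halving referenced in the comment above happens in btrfs_reserve_extent(); a simplified, hedged sketch of that retry follows (not the verbatim function, the exact clamping and bookkeeping may differ):

static int reserve_with_shrinking_size(struct btrfs_root *root,
				       struct find_free_extent_ctl *ffe_ctl,
				       struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	while (true) {
		ret = find_free_extent(root, ins, ffe_ctl);
		if (ret != -ENOSPC ||
		    ffe_ctl->num_bytes == ffe_ctl->min_alloc_size)
			return ret;
		/* Retry with half the size, clamped to min_alloc_size */
		ffe_ctl->num_bytes = round_down(ffe_ctl->num_bytes >> 1,
						fs_info->sectorsize);
		ffe_ctl->num_bytes = max(ffe_ctl->num_bytes,
					 ffe_ctl->min_alloc_size);
		ffe_ctl->ram_bytes = ffe_ctl->num_bytes;
	}
}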
+
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+ ffe_ctl->index++;
+ if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+ return 1;
+
/*
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
* caching kthreads as we move along
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
}
+ if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ }
return 0;
default:
BUG();
* |- If not found, re-iterate all block groups
*/
static noinline int find_free_extent(struct btrfs_root *root,
- u64 ram_bytes, u64 num_bytes, u64 empty_size,
- u64 hint_byte_orig, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ struct btrfs_key *ins,
+ struct find_free_extent_ctl *ffe_ctl)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
- struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool full_search = false;
- bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
- WARN_ON(num_bytes < fs_info->sectorsize);
-
- ffe_ctl.num_bytes = num_bytes;
- ffe_ctl.empty_size = empty_size;
- ffe_ctl.flags = flags;
- ffe_ctl.search_start = 0;
- ffe_ctl.delalloc = delalloc;
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
- ffe_ctl.have_caching_bg = false;
- ffe_ctl.orig_have_caching_bg = false;
- ffe_ctl.found_offset = 0;
- ffe_ctl.hint_byte = hint_byte_orig;
- ffe_ctl.for_treelog = for_treelog;
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+ WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
+ ffe_ctl->search_start = 0;
/* For clustered allocation */
- ffe_ctl.retry_clustered = false;
- ffe_ctl.retry_unclustered = false;
- ffe_ctl.last_ptr = NULL;
- ffe_ctl.use_cluster = true;
+ ffe_ctl->empty_cluster = 0;
+ ffe_ctl->last_ptr = NULL;
+ ffe_ctl->use_cluster = true;
+ ffe_ctl->have_caching_bg = false;
+ ffe_ctl->orig_have_caching_bg = false;
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+ ffe_ctl->loop = 0;
+ /* For clustered allocation */
+ ffe_ctl->retry_clustered = false;
+ ffe_ctl->retry_unclustered = false;
+ ffe_ctl->cached = 0;
+ ffe_ctl->max_extent_size = 0;
+ ffe_ctl->total_free_space = 0;
+ ffe_ctl->found_offset = 0;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
if (btrfs_is_zoned(fs_info))
- ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+ ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
ins->type = BTRFS_EXTENT_ITEM_KEY;
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+ ffe_ctl->flags);
- space_info = btrfs_find_space_info(fs_info, flags);
+ space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", flags);
+ btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
return -ENOSPC;
}
- ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+ ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
if (ret < 0)
return ret;
- ffe_ctl.search_start = max(ffe_ctl.search_start,
- first_logical_byte(fs_info, 0));
- ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
- if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
+ ffe_ctl->search_start = max(ffe_ctl->search_start,
+ first_logical_byte(fs_info, 0));
+ ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+ if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
- ffe_ctl.search_start);
+ ffe_ctl->search_start);
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
* However if we are re-searching with an ideal block group
* picked out then we don't care that the block group is cached.
*/
- if (block_group && block_group_bits(block_group, flags) &&
+ if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- ffe_ctl.index = btrfs_bg_flags_to_raid_index(
- block_group->flags);
- btrfs_lock_block_group(block_group, delalloc);
+ ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
+ btrfs_lock_block_group(block_group,
+ ffe_ctl->delalloc);
goto have_block_group;
}
} else if (block_group) {
}
}
search:
- ffe_ctl.have_caching_bg = false;
- if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
- ffe_ctl.index == 0)
+ ffe_ctl->have_caching_bg = false;
+ if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+ ffe_ctl->index == 0)
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group,
- &space_info->block_groups[ffe_ctl.index], list) {
+ &space_info->block_groups[ffe_ctl->index], list) {
struct btrfs_block_group *bg_ret;
/* If the block group is read-only, we can skip it entirely. */
if (unlikely(block_group->ro)) {
- if (for_treelog)
+ if (ffe_ctl->for_treelog)
btrfs_clear_treelog_bg(block_group);
+ if (ffe_ctl->for_data_reloc)
+ btrfs_clear_data_reloc_bg(block_group);
continue;
}
- btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->start;
+ btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+ ffe_ctl->search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
* raid types, but we want to make sure we only allocate
* for the proper type.
*/
- if (!block_group_bits(block_group, flags)) {
+ if (!block_group_bits(block_group, ffe_ctl->flags)) {
u64 extra = BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_RAID56_MASK |
* doesn't provide them, bail. This does allow us to
* fill raid0 from raid1.
*/
- if ((flags & extra) && !(block_group->flags & extra))
+ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
goto loop;
/*
* It's possible that we have MIXED_GROUP flag but no
* block group is mixed. Just skip such block group.
*/
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
continue;
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_done(block_group);
- if (unlikely(!ffe_ctl.cached)) {
- ffe_ctl.have_caching_bg = true;
+ ffe_ctl->cached = btrfs_block_group_done(block_group);
+ if (unlikely(!ffe_ctl->cached)) {
+ ffe_ctl->have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
/*
goto loop;
bg_ret = NULL;
- ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+ ret = do_allocation(block_group, ffe_ctl, &bg_ret);
if (ret == 0) {
if (bg_ret && bg_ret != block_group) {
- btrfs_release_block_group(block_group, delalloc);
+ btrfs_release_block_group(block_group,
+ ffe_ctl->delalloc);
block_group = bg_ret;
}
} else if (ret == -EAGAIN) {
}
/* Checks */
- ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
- fs_info->stripesize);
+ ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+ fs_info->stripesize);
/* move on to the next group */
- if (ffe_ctl.search_start + num_bytes >
+ if (ffe_ctl->search_start + ffe_ctl->num_bytes >
block_group->start + block_group->length) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
- if (ffe_ctl.found_offset < ffe_ctl.search_start)
+ if (ffe_ctl->found_offset < ffe_ctl->search_start)
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset,
- ffe_ctl.search_start - ffe_ctl.found_offset);
+ ffe_ctl->found_offset,
+ ffe_ctl->search_start - ffe_ctl->found_offset);
- ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
- num_bytes, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+ ffe_ctl->num_bytes,
+ ffe_ctl->delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space_unused(block_group,
- ffe_ctl.found_offset, num_bytes);
+ ffe_ctl->found_offset,
+ ffe_ctl->num_bytes);
goto loop;
}
btrfs_inc_block_group_reservations(block_group);
/* we are all good, lets return */
- ins->objectid = ffe_ctl.search_start;
- ins->offset = num_bytes;
+ ins->objectid = ffe_ctl->search_start;
+ ins->offset = ffe_ctl->num_bytes;
- trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
- num_bytes);
- btrfs_release_block_group(block_group, delalloc);
+ trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+ ffe_ctl->num_bytes);
+ btrfs_release_block_group(block_group, ffe_ctl->delalloc);
break;
loop:
- release_block_group(block_group, &ffe_ctl, delalloc);
+ release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
cond_resched();
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
if (ret > 0)
goto search;
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
*/
- if (!ffe_ctl.max_extent_size)
- ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
+ if (!ffe_ctl->max_extent_size)
+ ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
spin_lock(&space_info->lock);
- space_info->max_extent_size = ffe_ctl.max_extent_size;
+ space_info->max_extent_size = ffe_ctl->max_extent_size;
spin_unlock(&space_info->lock);
- ins->offset = ffe_ctl.max_extent_size;
+ ins->offset = ffe_ctl->max_extent_size;
} else if (ret == -ENOSPC) {
ret = cache_block_group_error;
}
struct btrfs_key *ins, int is_data, int delalloc)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct find_free_extent_ctl ffe_ctl = {};
bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+ bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
- hint_byte, ins, flags, delalloc);
+
+ ffe_ctl.ram_bytes = ram_bytes;
+ ffe_ctl.num_bytes = num_bytes;
+ ffe_ctl.min_alloc_size = min_alloc_size;
+ ffe_ctl.empty_size = empty_size;
+ ffe_ctl.flags = flags;
+ ffe_ctl.delalloc = delalloc;
+ ffe_ctl.hint_byte = hint_byte;
+ ffe_ctl.for_treelog = for_treelog;
+ ffe_ctl.for_data_reloc = for_data_reloc;
+
+ ret = find_free_extent(root, ins, &ffe_ctl);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
} else if (ret == -ENOSPC) {
sinfo = btrfs_find_space_info(fs_info, flags);
btrfs_err(fs_info,
- "allocation failed flags %llu, wanted %llu tree-log %d",
- flags, num_bytes, for_treelog);
+ "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+ flags, num_bytes, for_treelog, for_data_reloc);
if (sinfo)
btrfs_dump_space_info(fs_info, sinfo,
num_bytes, 1);
if (ret)
return ret;
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
+ ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
return ret;
ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, 1);
+ fs_info->nodesize, true);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
extent_key.objectid, extent_key.offset);
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins->objectid, ins->offset, 0);
- btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+ btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+ offset, 0, false);
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
ins.objectid, ins.offset, parent);
- generic_ref.real_root = root->root_key.objectid;
- btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+ btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+ root->root_key.objectid, false);
btrfs_ref_tree_mod(fs_info, &generic_ref);
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
if (ret)
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
fs_info->nodesize, parent);
- btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+ 0, false);
ret = btrfs_free_extent(trans, &ref);
if (ret)
goto out_unlock;
return -ENOMEM;
}
- btrfs_assert_tree_locked(parent);
+ btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
- btrfs_assert_tree_locked(node);
+ btrfs_assert_tree_write_locked(node);
level = btrfs_header_level(node);
path->nodes[level] = node;
path->slots[level] = 0;
return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
- offsetof(struct btrfs_io_bio, bio),
+ offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
goto free_buffer_cache;
/*
* Find and lock a contiguous range of bytes in the file marked as delalloc, no
- * more than @max_bytes. @Start and @end are used to return the range,
+ * more than @max_bytes.
*
- * Return: true if we find something
- * false if nothing was in the tree
+ * @start: The original start bytenr to search.
+ * Will store the extent range start bytenr.
+ * @end: The original end bytenr of the search range.
+ * Will store the extent range end bytenr.
+ *
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
+ *
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
u64 *end)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
int ret;
int loops = 0;
+ /* Caller should pass a valid @end to indicate the search range end */
+ ASSERT(orig_end > orig_start);
+
+ /* The range should at least cover part of the page */
+ ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+ orig_end <= page_offset(locked_page)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
max_bytes, &cached_state);
- if (!found || delalloc_end <= *start) {
+ if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
*start = delalloc_start;
- *end = delalloc_end;
+
+ /* @delalloc_end can be -1, never go beyond @orig_end */
+ *end = min(delalloc_end, orig_end);
free_extent_state(cached_state);
return false;
}
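The reworked @start/@end contract is easiest to see from the caller's side. A minimal sketch, assuming the caller shape of writepage_delalloc() (variable names here are illustrative only, not part of the patch):

	/* Given a locked page @page of the btrfs inode @inode: */
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;	/* @end must be larger than @start */

	if (find_lock_delalloc_range(&inode->vfs_inode, page, &start, &end)) {
		/* [start, end] is now a locked delalloc range overlapping @page */
	} else {
		/* No delalloc starts inside the range; @end was clamped to the original end */
	}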
* currently, there can be no more than two copies of every data bit. thus,
* exactly one rewrite is required.
*/
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+ u64 length, u64 logical, struct page *page,
+ unsigned int pg_offset, int mirror_num)
{
struct bio *bio;
struct btrfs_device *dev;
u64 map_length = 0;
u64 sector;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
if (btrfs_is_zoned(fs_info))
return btrfs_repair_one_zone(fs_info, logical);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio->bi_iter.bi_size = 0;
map_length = length;
/*
- * Avoid races with device replace and make sure our bbio has devices
+ * Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
- &map_length, &bbio, 0);
+ &map_length, &bioc, 0);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- ASSERT(bbio->mirror_num == 1);
+ ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
- &map_length, &bbio, mirror_num);
+ &map_length, &bioc, mirror_num);
if (ret) {
btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
- BUG_ON(mirror_num != bbio->mirror_num);
+ BUG_ON(mirror_num != bioc->mirror_num);
}
- sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
+ sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
bio->bi_iter.bi_sector = sector;
- dev = bbio->stripes[bbio->mirror_num - 1].dev;
- btrfs_put_bbio(bbio);
+ dev = bioc->stripes[bioc->mirror_num - 1].dev;
+ btrfs_put_bioc(bioc);
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
btrfs_bio_counter_dec(fs_info);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
- struct btrfs_io_bio *repair_io_bio;
+ struct btrfs_bio *repair_bbio;
blk_status_t status;
btrfs_debug(fs_info,
return -EIO;
}
- repair_bio = btrfs_io_bio_alloc(1);
- repair_io_bio = btrfs_io_bio(repair_bio);
+ repair_bio = btrfs_bio_alloc(1);
+ repair_bbio = btrfs_bio(repair_bio);
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
repair_bio->bi_private = failed_bio->bi_private;
- if (failed_io_bio->csum) {
+ if (failed_bbio->csum) {
const u32 csum_size = fs_info->csum_size;
- repair_io_bio->csum = repair_io_bio->csum_inline;
- memcpy(repair_io_bio->csum,
- failed_io_bio->csum + csum_size * icsum, csum_size);
+ repair_bbio->csum = repair_bbio->csum_inline;
+ memcpy(repair_bbio->csum,
+ failed_bbio->csum + csum_size * icsum, csum_size);
}
bio_add_page(repair_bio, page, failrec->len, pgoff);
- repair_io_bio->logical = failrec->start;
- repair_io_bio->iter = repair_bio->bi_iter;
+ repair_bbio->iter = repair_bio->bi_iter;
btrfs_debug(btrfs_sb(inode->i_sb),
"repair read error: submitting new read to mirror %d",
static void end_bio_extent_readpage(struct bio *bio)
{
struct bio_vec *bvec;
- struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
struct extent_io_tree *tree, *failure_tree;
struct processed_extent processed = { 0 };
/*
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
bio->bi_iter.bi_sector, bio->bi_status,
- io_bio->mirror_num);
+ bbio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
failure_tree = &BTRFS_I(inode)->io_failure_tree;
end = start + bvec->bv_len - 1;
len = bvec->bv_len;
- mirror = io_bio->mirror_num;
+ mirror = bbio->mirror_num;
if (likely(uptodate)) {
if (is_data_inode(inode)) {
- error_bitmap = btrfs_verify_data_csum(io_bio,
+ error_bitmap = btrfs_verify_data_csum(bbio,
bio_offset, page, start, end);
ret = error_bitmap;
} else {
- ret = btrfs_validate_metadata_buffer(io_bio,
+ ret = btrfs_validate_metadata_buffer(bbio,
page, start, end, mirror);
}
if (ret)
}
/* Release the last extent */
endio_readpage_release_extent(&processed, NULL, 0, 0, false);
- btrfs_io_bio_free_csum(io_bio);
+ btrfs_bio_free_csum(bbio);
bio_put(bio);
}
* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
* 'bio' because use of __GFP_ZERO is not supported.
*/
-static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
+static inline void btrfs_bio_init(struct btrfs_bio *bbio)
{
- memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+ memset(bbio, 0, offsetof(struct btrfs_bio, bio));
}
/*
- * The following helpers allocate a bio. As it's backed by a bioset, it'll
- * never fail. We're returning a bio right now but you can call btrfs_io_bio
- * for the appropriate container_of magic
+ * Allocate a btrfs_bio, with @nr_iovecs as maximum number of iovecs.
+ *
+ * The bio allocation is backed by bioset and does not fail.
*/
-struct bio *btrfs_bio_alloc(u64 first_byte)
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
{
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
- bio->bi_iter.bi_sector = first_byte >> 9;
- btrfs_io_bio_init(btrfs_io_bio(bio));
+ ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
+ bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+ btrfs_bio_init(btrfs_bio(bio));
return bio;
}
struct bio *btrfs_bio_clone(struct bio *bio)
{
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
struct bio *new;
/* Bio allocation backed by a bioset does not fail */
new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
- btrfs_bio = btrfs_io_bio(new);
- btrfs_io_bio_init(btrfs_bio);
- btrfs_bio->iter = bio->bi_iter;
+ bbio = btrfs_bio(new);
+ btrfs_bio_init(bbio);
+ bbio->iter = bio->bi_iter;
return new;
}
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- /* Bio allocation backed by a bioset does not fail */
- bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
- btrfs_io_bio_init(btrfs_io_bio(bio));
- return bio;
-}
-
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
{
struct bio *bio;
- struct btrfs_io_bio *btrfs_bio;
+ struct btrfs_bio *bbio;
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
ASSERT(bio);
- btrfs_bio = btrfs_io_bio(bio);
- btrfs_io_bio_init(btrfs_bio);
+ bbio = btrfs_bio(bio);
+ btrfs_bio_init(bbio);
bio_trim(bio, offset >> 9, size >> 9);
- btrfs_bio->iter = bio->bi_iter;
+ bbio->iter = bio->bi_iter;
return bio;
}
struct bio *bio;
int ret;
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
/*
* For compressed page range, its disk_bytenr is always @disk_bytenr
* passed in, no matter if we have added any range into previous bio.
*/
if (bio_flags & EXTENT_BIO_COMPRESSED)
- bio = btrfs_bio_alloc(disk_bytenr);
+ bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
else
- bio = btrfs_bio_alloc(disk_bytenr + offset);
+ bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
bio_ctrl->bio = bio;
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
if (wbc) {
struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_bdev;
+ bdev = fs_info->fs_devices->latest_dev->bdev;
bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
}
goto error;
}
- btrfs_io_bio(bio)->device = device;
+ btrfs_bio(bio)->device = device;
}
return 0;
error:
bool force_bio_submit = false;
u64 disk_bytenr;
+ ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
struct extent_state *cached = NULL;
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct page *page, struct writeback_control *wbc,
- u64 delalloc_start, unsigned long *nr_written)
+ unsigned long *nr_written)
{
- u64 page_end = delalloc_start + PAGE_SIZE - 1;
- bool found;
+ const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+ u64 delalloc_start = page_offset(page);
u64 delalloc_to_write = 0;
- u64 delalloc_end = 0;
int ret;
int page_started = 0;
+ while (delalloc_start < page_end) {
+ u64 delalloc_end = page_end;
+ bool found;
- while (delalloc_end < page_end) {
found = find_lock_delalloc_range(&inode->vfs_inode, page,
&delalloc_start,
&delalloc_end);
struct page *page, u64 *start, u64 *end)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
- unsigned long dirty_bitmap;
unsigned long flags;
- int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
- int range_start_bit = nbits;
+ int range_start_bit;
int range_end_bit;
/*
return;
}
+ range_start_bit = spi->dirty_offset +
+ (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
/* We should have the page locked, but just in case */
spin_lock_irqsave(&subpage->lock, flags);
- dirty_bitmap = subpage->dirty_bitmap;
+ bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+ spi->dirty_offset + spi->bitmap_nr_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
- BTRFS_SUBPAGE_BITMAP_SIZE);
+ range_start_bit -= spi->dirty_offset;
+ range_end_bit -= spi->dirty_offset;
+
*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
}
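A worked example of the bit arithmetic above, assuming 4K sectorsize, 64K pages and a hypothetical spi->dirty_offset of 16 (the real offset depends on how the subpage bitmaps are packed):

	/*
	 * orig_start is 8K into the page and sectors 2-5 of the page are dirty:
	 *
	 *   range_start_bit = 16 + (8K >> 12)       = 18
	 *   bitmap_next_set_region() returns          [18, 22)
	 *   after subtracting spi->dirty_offset       [2, 6)
	 *   *start = page_offset(page) + 2 * 4K
	 *   *end   = page_offset(page) + 6 * 4K
	 */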
struct extent_page_data *epd)
{
struct inode *inode = page->mapping->host;
- u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ const u64 page_start = page_offset(page);
+ const u64 page_end = page_start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset;
}
if (!epd->extent_locked) {
- ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
- &nr_written);
+ ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
if (ret == 1)
return 0;
if (ret)
* capable of that.
*/
if (PageError(page))
- end_extent_writepage(page, ret, start, page_end);
- unlock_page(page);
+ end_extent_writepage(page, ret, page_start, page_end);
+ if (epd->extent_locked) {
+ /*
+ * If epd->extent_locked, it's from extent_write_locked_range(),
+ * and the page can be locked either by lock_page() or by
+ * process_one_page().
+ * Let btrfs_page_unlock_writer() handle both cases.
+ */
+ ASSERT(wbc);
+ btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+ wbc->range_end + 1 - wbc->range_start);
+ } else {
+ unlock_page(page);
+ }
ASSERT(ret <= 0);
return ret;
}
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
+ if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+ btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
- const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
int ret;
/* Lock and write each dirty extent buffers in the range */
- while (bit_start < nbits) {
+ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
struct extent_buffer *eb;
unsigned long flags;
break;
}
spin_lock_irqsave(&subpage->lock, flags);
- if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+ if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+ subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
bit_start++;
free_extent_buffer(eb);
return ret;
}
- if (cache)
+ if (cache) {
+ /* Implies write in zoned mode */
btrfs_put_block_group(cache);
+ /* Mark the last eb in a block group */
+ if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
+ set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ }
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
* extent io tree. Thus we don't want to submit such wild eb
* if the fs already has error.
*/
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (!BTRFS_FS_ERROR(fs_info)) {
ret = flush_write_bio(&epd);
} else {
ret = -EROFS;
return ret;
}
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode)
+/*
+ * Submit the pages in the range to bio for call sites whose delalloc range
+ * has already been run (i.e. ordered extent inserted) and all pages are still
+ * locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
{
+ bool found_error = false;
+ int first_error = 0;
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_SIZE) >>
- PAGE_SHIFT;
-
+ u64 cur = start;
+ unsigned long nr_pages;
+ const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.extent_locked = 1,
- .sync_io = mode == WB_SYNC_ALL,
+ .sync_io = 1,
};
struct writeback_control wbc_writepages = {
- .sync_mode = mode,
- .nr_to_write = nr_pages * 2,
+ .sync_mode = WB_SYNC_ALL,
.range_start = start,
.range_end = end + 1,
/* We're called from an async helper function */
.no_cgroup_owner = 1,
};
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+ nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+ PAGE_SHIFT;
+ wbc_writepages.nr_to_write = nr_pages * 2;
+
wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
- while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_SHIFT);
- if (clear_page_dirty_for_io(page))
- ret = __extent_writepage(page, &wbc_writepages, &epd);
- else {
- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
- page, start, start + PAGE_SIZE - 1, true);
- unlock_page(page);
+ while (cur <= end) {
+ u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+ page = find_get_page(mapping, cur >> PAGE_SHIFT);
+ /*
+ * All pages in the range are locked since
+ * btrfs_run_delalloc_range(), thus there is no way to clear
+ * the page dirty flag.
+ */
+ ASSERT(PageLocked(page));
+ ASSERT(PageDirty(page));
+ clear_page_dirty_for_io(page);
+ ret = __extent_writepage(page, &wbc_writepages, &epd);
+ ASSERT(ret <= 0);
+ if (ret < 0) {
+ found_error = true;
+ first_error = ret;
}
put_page(page);
- start += PAGE_SIZE;
+ cur = cur_end + 1;
}
- ASSERT(ret <= 0);
- if (ret == 0)
+ if (!found_error)
ret = flush_write_bio(&epd);
else
end_write_bio(&epd, ret);
wbc_detach_inode(&wbc_writepages);
+ if (found_error)
+ return first_error;
return ret;
}
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct inode *inode = mapping->host;
+ const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
+ const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
int ret = 0;
struct extent_page_data epd = {
.bio_ctrl = { 0 },
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
};
+ /*
+ * Allow only a single thread to do the reloc work in zoned mode to
+ * protect the write pointer updates.
+ */
+ if (data_reloc && zoned)
+ btrfs_inode_lock(inode, 0);
ret = extent_write_cache_pages(mapping, wbc, &epd);
+ if (data_reloc && zoned)
+ btrfs_inode_unlock(inode, 0);
ASSERT(ret <= 0);
if (ret < 0) {
end_write_bio(&epd, ret);
* page, but it may change in the future for 16K page size
* support, so we still preallocate the memory in the loop.
*/
- ret = btrfs_alloc_subpage(fs_info, &prealloc,
- BTRFS_SUBPAGE_METADATA);
- if (ret < 0) {
- unlock_page(p);
- put_page(p);
- exists = ERR_PTR(ret);
- goto free_eb;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (IS_ERR(prealloc)) {
+ ret = PTR_ERR(prealloc);
+ unlock_page(p);
+ put_page(p);
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
}
spin_lock(&mapping->private_lock);
}
}
+#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
- int ret;
- int i;
+ u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
- ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
lockdep_assert_held(&fs_info->buffer_lock);
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
- bytenr >> fs_info->sectorsize_bits,
- PAGE_SIZE / fs_info->nodesize);
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= page_start + PAGE_SIZE)
- break;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- break;
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
}
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
}
+out:
return found;
}
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
+ EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
- int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
set_extent_bits_nowait(&device->alloc_state, stripe->physical,
int i;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_bio_stripe *stripe = &map->stripes[i];
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
__clear_extent_bit(&device->alloc_state, stripe->physical,
* @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
* checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
* NULL, the checksum buffer is allocated and returned in
- * btrfs_io_bio(bio)->csum instead.
+ * btrfs_bio(bio)->csum instead.
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
return BLK_STS_RESOURCE;
if (!dst) {
- struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+ struct btrfs_bio *bbio = btrfs_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
- GFP_NOFS);
- if (!btrfs_bio->csum) {
+ bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+ if (!bbio->csum) {
btrfs_free_path(path);
return BLK_STS_RESOURCE;
}
} else {
- btrfs_bio->csum = btrfs_bio->csum_inline;
+ bbio->csum = bbio->csum_inline;
}
- csum = btrfs_bio->csum;
+ csum = bbio->csum;
} else {
csum = dst;
}
index = 0;
}
- data = kmap_atomic(bvec.bv_page);
- crypto_shash_digest(shash, data + bvec.bv_offset
- + (i * fs_info->sectorsize),
+ data = bvec_kmap_local(&bvec);
+ crypto_shash_digest(shash,
+ data + (i * fs_info->sectorsize),
fs_info->sectorsize,
sums->sums + index);
- kunmap_atomic(data);
+ kunmap_local(data);
index += fs_info->csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+ struct page **pages, size_t num_pages,
+ u64 pos, u64 copied)
{
size_t i;
+ u64 block_start = round_down(pos, fs_info->sectorsize);
+ u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+ ASSERT(block_len <= U32_MAX);
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- ClearPageChecked(pages[i]);
+ btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+ block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
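To illustrate the clamped range computed above, a quick worked example with made-up numbers (4K sectorsize, a short copy of 5000 bytes at pos 6144):

	u64 block_start = round_down(6144, 4096);			/* = 4096 */
	u64 block_len = round_up(6144 + 5000, 4096) - block_start;	/* = 12288 - 4096 = 8192 */
	/* Exactly the two sectors the copy touched get their checked bits cleared. */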
struct page *p = pages[i];
btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- ClearPageChecked(p);
+ btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
}
btrfs_init_data_ref(&ref,
root->root_key.objectid,
new_key.objectid,
- args->start - extent_offset);
+ args->start - extent_offset,
+ 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
}
btrfs_init_data_ref(&ref,
root->root_key.objectid,
key.objectid,
- key.offset - extent_offset);
+ key.offset - extent_offset, 0,
+ false);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
args->bytes_found += extent_end - key.offset;
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &args->extent_item_size, 1);
+ btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
args->extent_inserted = true;
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
num_bytes, 0);
btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
- orig_offset);
+ orig_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
other_end = 0;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, 0);
- btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+ btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+ 0, false);
if (extent_mergeable(leaf, path->slots[0] + 1,
ino, bytenr, orig_offset,
&other_start, &other_end)) {
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
break;
}
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(pages, num_pages);
+ btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
cond_resched();
* have opened a file as writable, we have to stop this write operation
* to ensure consistency.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+ if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
if (!(iocb->ki_flags & IOCB_DIRECT) &&
extent_info->disk_len, 0);
ref_offset = extent_info->file_offset - extent_info->data_offset;
btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(inode), ref_offset);
+ btrfs_ino(inode), ref_offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
}
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
+#include "subpage.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- ClearPageChecked(io_ctl->pages[i]);
+ btrfs_page_clear_checked(io_ctl->fs_info,
+ io_ctl->pages[i],
+ page_offset(io_ctl->pages[i]),
+ PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
put_page(io_ctl->pages[i]);
}
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
+ else if (initial)
+ to_free = block_group->zone_capacity;
else if (offset >= block_group->alloc_offset)
to_free = size;
else if (offset + size <= block_group->alloc_offset)
spin_unlock(&block_group->lock);
}
+ reclaimable_unusable = block_group->zone_unusable -
+ (block_group->length - block_group->zone_capacity);
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
- block_group->zone_unusable >=
- div_factor_fine(block_group->length, bg_reclaim_threshold)) {
+ reclaimable_unusable >=
+ div_factor_fine(block_group->zone_capacity,
+ bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
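A worked example of the new zoned reclaim trigger, with hypothetical numbers (length 256M, zone_capacity 192M, bg_reclaim_threshold 75, and div_factor_fine() taking a percentage):

	/*
	 * reclaimable_unusable = zone_unusable - (256M - 192M) = zone_unusable - 64M
	 * threshold            = div_factor_fine(192M, 75)     = 144M
	 *
	 * The group is queued for reclaim once zone_unusable reaches 208M, i.e.
	 * once 75% of the usable zone capacity is dead space, instead of
	 * comparing raw zone_unusable (which always includes the 64M beyond the
	 * zone capacity) against 75% of the full length.
	 */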
* out the free space after the allocation offset.
*/
if (btrfs_is_zoned(fs_info)) {
- btrfs_info(fs_info, "free space %llu",
- block_group->length - block_group->alloc_offset);
+ btrfs_info(fs_info, "free space %llu active %d",
+ block_group->zone_capacity - block_group->alloc_offset,
+ block_group->zone_is_active);
return;
}
struct list_head extents;
struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
- atomic_t *pending;
+ struct async_cow *async_cow;
};
struct async_cow {
- /* Number of chunks in flight; must be first in the structure */
atomic_t num_chunks;
struct async_chunk chunks[];
};
*/
static inline bool inode_can_compress(struct btrfs_inode *inode)
{
- /* Subpage doesn't support compression yet */
- if (inode->root->fs_info->sectorsize < PAGE_SIZE)
- return false;
if (inode->flags & BTRFS_INODE_NODATACOW ||
inode->flags & BTRFS_INODE_NODATASUM)
return false;
btrfs_ino(inode));
return 0;
}
+ /*
+ * Special check for subpage.
+ *
+ * We lock the full page then run each delalloc range in the page, thus
+ * for the following case, we will hit some subpage specific corner case:
+ *
+ * 0 32K 64K
+ * | |///////| |///////|
+ * \- A \- B
+ *
+ * In the above case, both range A and range B will try to unlock the
+ * full page [0, 64K), so whichever finishes later will find the page
+ * already unlocked, triggering various page lock requirement BUG_ON()s.
+ *
+ * So here we add an artificial limit that subpage compression can only
+ * be used if the range is fully page aligned.
+ *
+ * In theory we only need to ensure the first page is fully covered, but
+ * the trailing partial page will be locked until the full compression
+ * finishes, delaying the write of other ranges.
+ *
+ * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first
+ * to prevent any submitted async extent from unlocking the full page.
+ * By this, we can ensure for subpage case that only the last async_cow
+ * will unlock the full page.
+ */
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(end + 1, PAGE_SIZE))
+ return 0;
+ }
+
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
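A concrete example of the new limit, assuming 64K pages with 4K sectorsize (illustrative numbers only):

	/*
	 * Delalloc range [16K, 48K): sector aligned but not page aligned, so
	 * the check above returns 0 and the range takes the uncompressed path.
	 *
	 * Delalloc range [0, 64K): both start and end + 1 are page aligned, so
	 * compression is still allowed.
	 */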
total_compressed = actual_end - start;
/*
- * skip compression for a small file range(<=blocksize) that
+ * Skip compression for a small file range (<= blocksize) that
* isn't an inline extent, since it doesn't save disk space at all.
*/
if (total_compressed <= blocksize &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
goto cleanup_and_bail_uncompressed;
+ /*
+ * For the subpage case, we require full page alignment for the sector
+ * aligned range, thus we must also check against @actual_end, not just
+ * @end.
+ */
+ if (blocksize < PAGE_SIZE) {
+ if (!IS_ALIGNED(start, PAGE_SIZE) ||
+ !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+ goto cleanup_and_bail_uncompressed;
+ }
+
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
* win, compare the page count read with the blocks on disk,
* compression must free at least one sector size
*/
- total_in = ALIGN(total_in, PAGE_SIZE);
+ total_in = round_up(total_in, fs_info->sectorsize);
if (total_compressed + blocksize <= total_in) {
compressed_extents++;
async_extent->pages = NULL;
}
-/*
- * phase two of compressed writeback. This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued. We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+ struct async_extent *async_extent,
+ struct page *locked_page)
{
- struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct async_extent *async_extent;
- u64 alloc_hint = 0;
- struct btrfs_key ins;
- struct extent_map *em;
- struct btrfs_root *root = inode->root;
- struct extent_io_tree *io_tree = &inode->io_tree;
- int ret = 0;
-
-again:
- while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
- list_del(&async_extent->list);
-
-retry:
- lock_extent(io_tree, async_extent->start,
- async_extent->start + async_extent->ram_size - 1);
- /* did the compression code fall back to uncompressed IO? */
- if (!async_extent->pages) {
- int page_started = 0;
- unsigned long nr_written = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
+ unsigned long nr_written = 0;
+ int page_started = 0;
+ int ret;
- /* allocate blocks */
- ret = cow_file_range(inode, async_chunk->locked_page,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ /*
+ * Call cow_file_range() to run the delalloc range directly, since we
+ * won't go to NOCOW or async path again.
+ *
+ * Also we call cow_file_range() with @unlock_page == 0, so that we
+ * can directly submit them without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+ &nr_written, 0);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+ goto out;
+ }
+ if (ret < 0) {
+ if (locked_page)
+ unlock_page(locked_page);
+ goto out;
+ }
- /* JDM XXX */
+ ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+ /* All pages will be unlocked, including @locked_page */
+out:
+ kfree(async_extent);
+ return ret;
+}
- /*
- * if page_started, cow_file_range inserted an
- * inline extent and took care of all the unlocking
- * and IO for us. Otherwise, we need to submit
- * all those pages down to the drive.
- */
- if (!page_started && !ret)
- extent_write_locked_range(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- WB_SYNC_ALL);
- else if (ret && async_chunk->locked_page)
- unlock_page(async_chunk->locked_page);
- kfree(async_extent);
- cond_resched();
- continue;
- }
+static int submit_one_async_extent(struct btrfs_inode *inode,
+ struct async_chunk *async_chunk,
+ struct async_extent *async_extent,
+ u64 *alloc_hint)
+{
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_key ins;
+ struct page *locked_page = NULL;
+ struct extent_map *em;
+ int ret = 0;
+ u64 start = async_extent->start;
+ u64 end = async_extent->start + async_extent->ram_size - 1;
- ret = btrfs_reserve_extent(root, async_extent->ram_size,
- async_extent->compressed_size,
- async_extent->compressed_size,
- 0, alloc_hint, &ins, 1, 1);
- if (ret) {
- free_async_extent_pages(async_extent);
+ /*
+ * If async_chunk->locked_page is in the async_extent range, we need to
+ * handle it.
+ */
+ if (async_chunk->locked_page) {
+ u64 locked_page_start = page_offset(async_chunk->locked_page);
+ u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
- if (ret == -ENOSPC) {
- unlock_extent(io_tree, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ if (!(start >= locked_page_end || end <= locked_page_start))
+ locked_page = async_chunk->locked_page;
+ }
+ lock_extent(io_tree, start, end);
- /*
- * we need to redirty the pages if we decide to
- * fallback to uncompressed IO, otherwise we
- * will not submit these pages down to lower
- * layers.
- */
- extent_range_redirty_for_io(&inode->vfs_inode,
- async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1);
+ /* We have fallen back to uncompressed write */
+ if (!async_extent->pages)
+ return submit_uncompressed_range(inode, async_extent, locked_page);
- goto retry;
- }
- goto out_free;
- }
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
+ async_extent->compressed_size,
+ async_extent->compressed_size,
+ 0, *alloc_hint, &ins, 1, 1);
+ if (ret) {
+ free_async_extent_pages(async_extent);
/*
- * here we're doing allocation and writeback of the
- * compressed pages
+ * Here we used to try again by going back to the non-compressed
+ * path for ENOSPC. But if we can't reserve space even for the
+ * compressed size, there is no way it could succeed for the larger
+ * uncompressed size, so go directly to the error path.
*/
- em = create_io_em(inode, async_extent->start,
- async_extent->ram_size, /* len */
- async_extent->start, /* orig_start */
- ins.objectid, /* block_start */
- ins.offset, /* block_len */
- ins.offset, /* orig_block_len */
- async_extent->ram_size, /* ram_bytes */
- async_extent->compress_type,
- BTRFS_ORDERED_COMPRESSED);
- if (IS_ERR(em))
- /* ret value is not necessary due to void function */
- goto out_free_reserve;
- free_extent_map(em);
-
- ret = btrfs_add_ordered_extent_compress(inode,
- async_extent->start,
- ins.objectid,
- async_extent->ram_size,
- ins.offset,
- async_extent->compress_type);
- if (ret) {
- btrfs_drop_extent_cache(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1, 0);
- goto out_free_reserve;
- }
- btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ goto out_free;
+ }
+
+ /* Here we're doing allocation and writeback of the compressed pages */
+ em = create_io_em(inode, start,
+ async_extent->ram_size, /* len */
+ start, /* orig_start */
+ ins.objectid, /* block_start */
+ ins.offset, /* block_len */
+ ins.offset, /* orig_block_len */
+ async_extent->ram_size, /* ram_bytes */
+ async_extent->compress_type,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserve;
+ }
+ free_extent_map(em);
- /*
- * clear dirty, set writeback and unlock the pages.
- */
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
- NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
- PAGE_UNLOCK | PAGE_START_WRITEBACK);
- if (btrfs_submit_compressed_write(inode, async_extent->start,
- async_extent->ram_size,
- ins.objectid,
- ins.offset, async_extent->pages,
- async_extent->nr_pages,
- async_chunk->write_flags,
- async_chunk->blkcg_css)) {
- struct page *p = async_extent->pages[0];
- const u64 start = async_extent->start;
- const u64 end = start + async_extent->ram_size - 1;
-
- p->mapping = inode->vfs_inode.i_mapping;
- btrfs_writepage_endio_finish_ordered(inode, p, start,
- end, false);
-
- p->mapping = NULL;
- extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
- PAGE_END_WRITEBACK |
- PAGE_SET_ERROR);
- free_async_extent_pages(async_extent);
- }
- alloc_hint = ins.objectid + ins.offset;
- kfree(async_extent);
- cond_resched();
+ ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */
+ ins.objectid, /* disk_bytenr */
+ async_extent->ram_size, /* num_bytes */
+ ins.offset, /* disk_num_bytes */
+ async_extent->compress_type);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserve;
}
- return;
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Clear dirty, set writeback and unlock the pages. */
+ extent_clear_unlock_delalloc(inode, start, end,
+ NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+ PAGE_UNLOCK | PAGE_START_WRITEBACK);
+ if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* compressed_len */
+ async_extent->pages, /* compressed_pages */
+ async_extent->nr_pages,
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
+ *alloc_hint = ins.objectid + ins.offset;
+ kfree(async_extent);
+ return ret;
+
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
- extent_clear_unlock_delalloc(inode, async_extent->start,
- async_extent->start +
- async_extent->ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, end,
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_END_WRITEBACK | PAGE_SET_ERROR);
free_async_extent_pages(async_extent);
kfree(async_extent);
- goto again;
+ return ret;
+}
+
+/*
+ * Phase two of compressed writeback. This is the ordered portion of the code,
+ * which only gets called in the order the work was queued. We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+ struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct async_extent *async_extent;
+ u64 alloc_hint = 0;
+ int ret = 0;
+
+ while (!list_empty(&async_chunk->extents)) {
+ u64 extent_start;
+ u64 ram_size;
+
+ async_extent = list_entry(async_chunk->extents.next,
+ struct async_extent, list);
+ list_del(&async_extent->list);
+ extent_start = async_extent->start;
+ ram_size = async_extent->ram_size;
+
+ ret = submit_one_async_extent(inode, async_chunk, async_extent,
+ &alloc_hint);
+ if (ret < 0)
+ btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+ inode->root->root_key.objectid,
+ btrfs_ino(inode), extent_start, ram_size, ret);
+ }
}
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* fails during the stage where it updates the bytenr of file extent
* items.
*/
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
min_alloc_size = num_bytes;
else
min_alloc_size = fs_info->sectorsize;
if (ret)
goto out_drop_extent_cache;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(root)) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
/*
static noinline void async_cow_free(struct btrfs_work *work)
{
struct async_chunk *async_chunk;
+ struct async_cow *async_cow;
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
if (async_chunk->blkcg_css)
css_put(async_chunk->blkcg_css);
- /*
- * Since the pointer to 'pending' is at the beginning of the array of
- * async_chunk's, freeing it ensures the whole array has been freed.
- */
- if (atomic_dec_and_test(async_chunk->pending))
- kvfree(async_chunk->pending);
+
+ async_cow = async_chunk->async_cow;
+ if (atomic_dec_and_test(&async_cow->num_chunks))
+ kvfree(async_cow);
}
static int cow_file_range_async(struct btrfs_inode *inode,
* lightweight reference for the callback lifetime
*/
ihold(&inode->vfs_inode);
- async_chunk[i].pending = &ctx->num_chunks;
+ async_chunk[i].async_cow = ctx;
async_chunk[i].inode = &inode->vfs_inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
__set_page_dirty_nobuffers(locked_page);
account_page_redirty(locked_page);
- extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+ extent_write_locked_range(&inode->vfs_inode, start, end);
*page_started = 1;
return 0;
int *page_started, unsigned long *nr_written)
{
const bool is_space_ino = btrfs_is_free_space_inode(inode);
- const bool is_reloc_ino = (inode->root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID);
+ const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
const u64 range_bytes = end + 1 - start;
struct extent_io_tree *io_tree = &inode->io_tree;
u64 range_start = start;
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
nocow = false;
- if (root->root_key.objectid ==
- BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
/*
* Error handled later, as we must prevent
* extent_clear_unlock_delalloc() in error handler
int ret;
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
+ /*
+ * The range must cover part of the @locked_page, or the returned
+ * @page_started can confuse the caller.
+ */
+ ASSERT(!(end <= page_offset(locked_page) ||
+ start >= page_offset(locked_page) + PAGE_SIZE));
+
if (should_nocow(inode, start, end)) {
- ASSERT(!zoned);
+ /*
+ * Normally on a zoned device we're only doing COW writes, but
+ * in case of relocation on a zoned filesystem we have taken
+ * precautions to only write sequentially. It's safe to use
+ * run_delalloc_nocow() here, like for regular
+ * preallocated inodes.
+ */
+ ASSERT(!zoned ||
+ (zoned && btrfs_is_data_reloc_root(inode->root)));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
} else if (!inode_can_compress(inode) ||
if (btrfs_is_testing(fs_info))
return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (!btrfs_is_data_reloc_root(root) &&
do_list && !(state->state & EXTENT_NORESERVE) &&
(*bits & EXTENT_CLEAR_DATA_RESV))
btrfs_free_reserved_data_space_noquota(fs_info, len);
}
/*
- * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
- * in a chunk's stripe. This function ensures that bios do not span a
- * stripe/chunk
- *
- * @page - The page we are about to add to the bio
- * @size - size we want to add to the bio
- * @bio - bio we want to ensure is smaller than a stripe
- * @bio_flags - flags of the bio
- *
- * return 1 if page cannot be added to the bio
- * return 0 if page can be added to the bio
- * return error otherwise
- */
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- u64 logical = bio->bi_iter.bi_sector << 9;
- u32 bio_len = bio->bi_iter.bi_size;
- struct extent_map *em;
- int ret = 0;
- struct btrfs_io_geometry geom;
-
- if (bio_flags & EXTENT_BIO_COMPRESSED)
- return 0;
-
- em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
- if (IS_ERR(em))
- return PTR_ERR(em);
- ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
- if (ret < 0)
- goto out;
-
- if (geom.len < bio_len + size)
- ret = 1;
-out:
- free_extent_map(em);
- return ret;
-}
-
-/*
* in order to insert checksums into the metadata in large chunks,
* we wait until bio submission time. All the pages in the bio are
* checksummed and sums are attached onto the ordered extent record.
goto mapit;
} else if (async && !skip_sum) {
/* csum items have already been cloned */
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
goto mapit;
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
clear_page_dirty_for_io(page);
SetPageError(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
* page->mapping outside of the page lock.
*/
ihold(inode);
- SetPageChecked(page);
+ btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
goto out;
}
- if (ordered_extent->bdev)
+ /* A valid bdev implies a write on a sequential zone */
+ if (ordered_extent->bdev) {
btrfs_rewrite_logical_zoned(ordered_extent);
+ btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
+ }
btrfs_free_io_failure_record(inode, start, end);
*
* The length of such check is always one sector size.
*/
-static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff,
u64 start)
{
ASSERT(pgoff + len <= PAGE_SIZE);
offset_sectors = bio_offset >> fs_info->sectorsize_bits;
- csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
+ csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
kaddr = kmap_atomic(page);
shash->tfm = fs_info->csum_shash;
return 0;
zeroit:
btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
- io_bio->mirror_num);
- if (io_bio->device)
- btrfs_dev_stat_inc_and_print(io_bio->device,
+ bbio->mirror_num);
+ if (bbio->device)
+ btrfs_dev_stat_inc_and_print(bbio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
* Return a bitmap where bit set means a csum mismatch, and bit not set means
* csum match.
*/
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+ u32 bio_offset, struct page *page,
+ u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
const u32 sectorsize = root->fs_info->sectorsize;
u32 pg_off;
unsigned int result = 0;
- if (PageChecked(page)) {
- ClearPageChecked(page);
+ if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
+ btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
return 0;
}
/*
- * For subpage case, above PageChecked is not safe as it's not subpage
- * compatible.
- * But for now only cow fixup and compressed read utilize PageChecked
- * flag, while in this context we can easily use io_bio->csum to
- * determine if we really need to do csum verification.
- *
- * So for now, just exit if io_bio->csum is NULL, as it means it's
- * compressed read, and its compressed data csum has already been
- * verified.
+ * This only happens for NODATASUM or compressed read.
+ * Normally this should be covered by above check for compressed read
+ * or the next check for NODATASUM. Just do a quicker exit here.
*/
- if (io_bio->csum == NULL)
+ if (bbio->csum == NULL)
return 0;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
u64 file_offset = pg_off + page_offset(page);
int ret;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+ if (btrfs_is_data_reloc_root(root) &&
test_range_bit(io_tree, file_offset,
file_offset + sectorsize - 1,
EXTENT_NODATASUM, 1, NULL)) {
EXTENT_NODATASUM);
continue;
}
- ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+ ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
page_offset(page) + pg_off);
if (ret < 0) {
const int nr_bit = (pg_off - offset_in_page(start)) >>
* without delay
*/
if (!btrfs_is_free_space_inode(inode)
- && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+ && !btrfs_is_data_reloc_root(root)
&& !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
btrfs_update_root_times(trans, root);
* also drops the back refs in the inode to the directory
*/
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
const char *name, int name_len)
{
+ struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
goto err;
}
- ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto err;
- }
-
- ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
- index);
- if (ret == -ENOENT)
- ret = 0;
- else if (ret)
- btrfs_abort_transaction(trans, ret);
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
/*
* If we have a pending delayed iput we could end up with the final iput
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
if (!ret) {
drop_nlink(&inode->vfs_inode);
- ret = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, inode->root, inode);
}
return ret;
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
struct inode *inode = d_inode(dentry);
int ret;
btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
0);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (ret)
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return ret;
}
struct inode *inode;
u64 objectid = 0;
- if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (!BTRFS_FS_ERROR(fs_info))
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
{
struct inode *inode = d_inode(dentry);
int err = 0;
- struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ err = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(d_inode(dentry)), dentry->d_name.name,
dentry->d_name.len);
if (!err) {
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(root->fs_info);
+ btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
return err;
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
extent_start, extent_num_bytes, 0);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- ino, extent_offset);
+ ino, extent_offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
len);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, block_start,
+ block_end + 1 - block_start);
btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
- int nitems = name ? 2 : 1;
+ struct btrfs_item_batch batch;
unsigned long ptr;
unsigned int nofs_flag;
int ret;
goto fail;
}
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+ batch.keys = &key[0];
+ batch.data_sizes = &sizes[0];
+ batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
+ batch.nr = name ? 2 : 1;
+ ret = btrfs_insert_empty_items(trans, root, path, &batch);
if (ret != 0)
goto fail_unlock;
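The batch above carries either one item (the inode item) or two (inode item plus inode ref) depending on @name. For reference, a minimal single-item sketch of the same pattern; trans, root, path and ino are assumed from the surrounding context:

	struct btrfs_item_batch batch;
	struct btrfs_key key = {
		.objectid = ino,
		.type = BTRFS_INODE_ITEM_KEY,
		.offset = 0,
	};
	u32 size = sizeof(struct btrfs_inode_item);

	batch.keys = &key;
	batch.data_sizes = &size;
	batch.total_data_size = size;
	batch.nr = 1;
	ret = btrfs_insert_empty_items(trans, root, path, &batch);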
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
- iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
__endio_write_update_ordered(BTRFS_I(dip->inode),
- dip->logical_offset,
+ dip->file_offset,
dip->bytes,
!dip->dio_bio->bi_status);
} else {
unlock_extent(&BTRFS_I(dip->inode)->io_tree,
- dip->logical_offset,
- dip->logical_offset + dip->bytes - 1);
+ dip->file_offset,
+ dip->file_offset + dip->bytes - 1);
}
bio_endio(dip->dio_bio);
return ret;
}
-static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
- struct btrfs_io_bio *io_bio,
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+ struct btrfs_bio *bbio,
const bool uptodate)
{
+ struct inode *inode = dip->inode;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
const u32 sectorsize = fs_info->sectorsize;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- u64 start = io_bio->logical;
+ const u64 orig_file_offset = dip->file_offset;
+ u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
- __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+ __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
unsigned int i, nr_sectors, pgoff;
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
for (i = 0; i < nr_sectors; i++) {
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
- (!csum || !check_data_csum(inode, io_bio,
+ (!csum || !check_data_csum(inode, bbio,
bio_offset, bvec.bv_page,
pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
} else {
int ret;
- ASSERT((start - io_bio->logical) < UINT_MAX);
+ ASSERT((start - orig_file_offset) < UINT_MAX);
ret = btrfs_repair_one_sector(inode,
- &io_bio->bio,
- start - io_bio->logical,
+ &bbio->bio,
+ start - orig_file_offset,
bvec.bv_page, pgoff,
- start, io_bio->mirror_num,
+ start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
bio->bi_opf, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, err);
- if (bio_op(bio) == REQ_OP_READ) {
- err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
- !err);
- }
+ if (bio_op(bio) == REQ_OP_READ)
+ err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
} else {
u64 csum_offset;
- csum_offset = file_offset - dip->logical_offset;
+ csum_offset = file_offset - dip->file_offset;
csum_offset >>= fs_info->sectorsize_bits;
csum_offset *= fs_info->csum_size;
- btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
+ btrfs_bio(bio)->csum = dip->csums + csum_offset;
}
map:
ret = btrfs_map_bio(fs_info, bio, 0);
return NULL;
dip->inode = inode;
- dip->logical_offset = file_offset;
+ dip->file_offset = file_offset;
dip->bytes = dio_bio->bi_iter.bi_size;
dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- btrfs_io_bio(bio)->logical = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
* did something wrong.
*/
ASSERT(!PageOrdered(page));
+ btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
if (!inode_evicting)
__btrfs_releasepage(page, GFP_NOFS);
- ClearPageChecked(page);
clear_page_extent_mapped(page);
}
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
}
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
WARN_ON(inode->block_rsv.reserved);
WARN_ON(inode->block_rsv.size);
WARN_ON(inode->outstanding_extents);
- WARN_ON(inode->delalloc_bytes);
- WARN_ON(inode->new_delalloc_bytes);
+ if (!S_ISDIR(vfs_inode->i_mode)) {
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ }
WARN_ON(inode->csum_bytes);
WARN_ON(inode->defrag_bytes);
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
old_dentry->d_name.len);
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
- ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
new_dentry->d_name.len);
*/
btrfs_pin_log_trans(root);
log_pinned = true;
- ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
old_dentry->d_name.len);
ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
- ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(d_inode(new_dentry)),
new_dentry->d_name.name,
new_dentry->d_name.len);
};
struct btrfs_fs_info *fs_info = root->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
struct list_head splice;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
INIT_LIST_HEAD(&splice);
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "subpage.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
} __attribute__ ((__packed__));
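The reserved-area split above keeps the packed compat struct the same size (the old __u64 reserved[4] is 32 bytes, and __u32 version plus __u8 reserved[28] is 4 + 28 = 32 bytes), so the _IOW number computed from sizeof() does not change. A minimal compile-time check, as an editor's sketch rather than part of the patch:

/* Editor's sketch: old and new reserved layouts occupy the same 32 bytes. */
_Static_assert(sizeof(__u32) + 28 * sizeof(__u8) == 4 * sizeof(__u64),
	       "compat send args size must stay the same");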
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
return ret;
}
-/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
- */
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
-{
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em = NULL;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- u64 end;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
- read_unlock(&em_tree->lock);
-
- if (em) {
- end = extent_map_end(em);
- free_extent_map(em);
- if (end - offset > thresh)
- return 0;
- }
- /* if we already have a nice delalloc here, just stop */
- thresh /= 2;
- end = count_range_bits(io_tree, &offset, offset + thresh,
- thresh, EXTENT_DELALLOC, 1);
- if (end >= thresh)
- return 0;
- return 1;
-}
-
-/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
- *
- * This is used by the defragging code to find new and small
- * extents
- */
-static int find_new_extents(struct btrfs_root *root,
- struct inode *inode, u64 newer_than,
- u64 *off, u32 thresh)
-{
- struct btrfs_path *path;
- struct btrfs_key min_key;
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *extent;
- int type;
- int ret;
- u64 ino = btrfs_ino(BTRFS_I(inode));
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- min_key.objectid = ino;
- min_key.type = BTRFS_EXTENT_DATA_KEY;
- min_key.offset = *off;
-
- while (1) {
- ret = btrfs_search_forward(root, &min_key, path, newer_than);
- if (ret != 0)
- goto none;
-process_slot:
- if (min_key.objectid != ino)
- goto none;
- if (min_key.type != BTRFS_EXTENT_DATA_KEY)
- goto none;
-
- leaf = path->nodes[0];
- extent = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG &&
- btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
- check_defrag_in_cache(inode, min_key.offset, thresh)) {
- *off = min_key.offset;
- btrfs_free_path(path);
- return 0;
- }
-
- path->slots[0]++;
- if (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
- goto process_slot;
- }
-
- if (min_key.offset == (u64)-1)
- goto none;
-
- min_key.offset++;
- btrfs_release_path(path);
- }
-none:
- btrfs_free_path(path);
- return -ENOENT;
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+ bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_SIZE;
+ const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
struct extent_state *cached = NULL;
- u64 end = start + len - 1;
+ u64 end = start + sectorsize - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
- unlock_extent_cached(io_tree, start, end, &cached);
+ if (!locked)
+ lock_extent_bits(io_tree, start, end, &cached);
+ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ if (!locked)
+ unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
return em;
}
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+ bool locked)
{
struct extent_map *next;
bool ret = true;
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len);
+ next = defrag_lookup_extent(inode, em->start + em->len, locked);
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
return ret;
}
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
- u64 *last_len, u64 *skip, u64 *defrag_end,
- int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: The caller should also wait for page writeback after the cluster is
+ * prepared; we don't wait for writeback on each individual page here.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+ pgoff_t index)
{
- struct extent_map *em;
- int ret = 1;
- bool next_mergeable = true;
- bool prev_mergeable = true;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ gfp_t mask = btrfs_alloc_write_mask(mapping);
+ u64 page_start = (u64)index << PAGE_SHIFT;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ struct extent_state *cached_state = NULL;
+ struct page *page;
+ int ret;
+
+again:
+ page = find_or_create_page(mapping, index, mask);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
/*
- * make sure that once we start defragging an extent, we keep on
- * defragging it
+ * Since we can defragment files opened read-only, we can encounter
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+ * can't do I/O using huge pages yet, so return an error for now.
+ * Filesystem transparent huge pages are typically only used for
+ * executables that explicitly enable them, so this isn't very
+ * restrictive.
*/
- if (start < *defrag_end)
- return 1;
+ if (PageCompound(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ETXTBSY);
+ }
- *skip = 0;
+ ret = set_page_extent_mapped(page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(ret);
+ }
- em = defrag_lookup_extent(inode, start);
- if (!em)
- return 0;
+ /* Wait for any existing ordered extent in the range */
+ while (1) {
+ struct btrfs_ordered_extent *ordered;
- /* this will cover holes, and inline extents */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- ret = 0;
- goto out;
- }
+ lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+ unlock_extent_cached(&inode->io_tree, page_start, page_end,
+ &cached_state);
+ if (!ordered)
+ break;
- if (!*defrag_end)
- prev_mergeable = false;
+ unlock_page(page);
+ btrfs_start_ordered_extent(ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ lock_page(page);
+ /*
+ * We unlocked the page above, so we need to check if it was
+ * released or not.
+ */
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ }
- next_mergeable = defrag_check_next_extent(inode, em);
- /*
- * we hit a real extent, if it is big or the next extent is not a
- * real extent, don't bother defragging it
- */
- if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
- ret = 0;
-out:
/*
- * last_len ends up being a counter of how many bytes we've defragged.
- * every time we choose not to defrag an extent, we reset *last_len
- * so that the next tiny extent will force a defrag.
- *
- * The end result of this is that tiny extents before a single big
- * extent will force at least part of that big extent to be defragged.
+ * Now the page range has no ordered extent any more. Read the page to
+ * make it uptodate.
*/
- if (ret) {
- *defrag_end = extent_map_end(em);
- } else {
- *last_len = 0;
- *skip = extent_map_end(em);
- *defrag_end = 0;
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (page->mapping != mapping || !PagePrivate(page)) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
}
-
- free_extent_map(em);
- return ret;
+ return page;
}
+struct defrag_target_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
/*
- * it doesn't do much good to defrag one or two pages
- * at a time. This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
*
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start: file offset to lookup
+ * @len: length to lookup
+ * @extent_thresh: file extent size threshold; any extent with a size >= this
+ * value will be ignored
+ * @newer_than: only defrag extents newer than this value
+ * @do_compress: whether the defrag is doing compression; if true,
+ * @extent_thresh will be ignored and all regular file extents
+ * meeting @newer_than will be targets
+ * @locked: whether the extent lock for the range is already held
+ * @target_list: list of target file extents
*/
-static int cluster_pages_for_defrag(struct inode *inode,
- struct page **pages,
- unsigned long start_index,
- unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+ u64 start, u64 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ bool locked, struct list_head *target_list)
{
- unsigned long file_end;
- u64 isize = i_size_read(inode);
- u64 page_start;
- u64 page_end;
- u64 page_cnt;
- u64 start = (u64)start_index << PAGE_SHIFT;
- u64 search_start;
- int ret;
- int i;
- int i_done;
- struct btrfs_ordered_extent *ordered;
- struct extent_state *cached_state = NULL;
- struct extent_io_tree *tree;
- struct extent_changeset *data_reserved = NULL;
- gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+ u64 cur = start;
+ int ret = 0;
- file_end = (isize - 1) >> PAGE_SHIFT;
- if (!isize || start_index > file_end)
- return 0;
+ while (cur < start + len) {
+ struct extent_map *em;
+ struct defrag_target_range *new;
+ bool next_mergeable = true;
+ u64 range_len;
- page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+ em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ if (!em)
+ break;
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- start, page_cnt << PAGE_SHIFT);
- if (ret)
- return ret;
- i_done = 0;
- tree = &BTRFS_I(inode)->io_tree;
+ /* Skip hole/inline/preallocated extents */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ goto next;
- /* step one, lock all the pages */
- for (i = 0; i < page_cnt; i++) {
- struct page *page;
-again:
- page = find_or_create_page(inode->i_mapping,
- start_index + i, mask);
- if (!page)
- break;
+ /* Skip older extent */
+ if (em->generation < newer_than)
+ goto next;
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- break;
+ /*
+ * For the do_compress case, we want to compress all valid file
+ * extents, so skip the @extent_thresh and mergeable checks.
+ */
+ if (do_compress)
+ goto add;
+
+ /* Skip too large extent */
+ if (em->len >= extent_thresh)
+ goto next;
+
+ next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+ locked);
+ if (!next_mergeable) {
+ struct defrag_target_range *last;
+
+ /* Empty target list, no way to merge with last entry */
+ if (list_empty(target_list))
+ goto next;
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ /* Not mergeable with last entry */
+ if (last->start + last->len != cur)
+ goto next;
+
+ /* Mergeable, fall through to add it to @target_list. */
}
- page_start = page_offset(page);
- page_end = page_start + PAGE_SIZE - 1;
- while (1) {
- lock_extent_bits(tree, page_start, page_end,
- &cached_state);
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
- page_start);
- unlock_extent_cached(tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * we unlocked the page above, so we need check if
- * it was released or not.
- */
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+add:
+ range_len = min(extent_map_end(em), start + len) - cur;
+ /*
+ * This one is a good target; check if it can be merged into the
+ * last range of the target list.
+ */
+ if (!list_empty(target_list)) {
+ struct defrag_target_range *last;
+
+ last = list_entry(target_list->prev,
+ struct defrag_target_range, list);
+ ASSERT(last->start + last->len <= cur);
+ if (last->start + last->len == cur) {
+ /* Mergeable, enlarge the last entry */
+ last->len += range_len;
+ goto next;
}
+ /* Fall through to allocate a new entry */
}
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- ret = -EIO;
- break;
- }
+ /* Allocate new defrag_target_range */
+ new = kmalloc(sizeof(*new), GFP_NOFS);
+ if (!new) {
+ free_extent_map(em);
+ ret = -ENOMEM;
+ break;
}
+ new->start = cur;
+ new->len = range_len;
+ list_add_tail(&new->list, target_list);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- put_page(page);
- goto again;
+next:
+ cur = extent_map_end(em);
+ free_extent_map(em);
+ }
+ if (ret < 0) {
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+
+ list_for_each_entry_safe(entry, tmp, target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
+ }
+ return ret;
+}
+
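The collection loop above merges a newly found target into the previous entry whenever the two ranges are contiguous (last->start + last->len == cur); otherwise a new defrag_target_range is appended. A simplified user-space model of that merge rule, using an array instead of a list_head (the names here are illustrative, not kernel API):

struct range { unsigned long long start, len; };

/* Append [start, start + len) to targets[], merging with the last entry
 * when contiguous; capacity checking is omitted for brevity. */
static void add_target(struct range *targets, int *nr,
		       unsigned long long start, unsigned long long len)
{
	if (*nr > 0 && targets[*nr - 1].start + targets[*nr - 1].len == start) {
		targets[*nr - 1].len += len;	/* mergeable: enlarge last entry */
		return;
	}
	targets[*nr].start = start;		/* otherwise append a new entry */
	targets[*nr].len = len;
	(*nr)++;
}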
+#define CLUSTER_SIZE (SZ_256K)
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode: target inode
+ * @target: target range to defrag
+ * @pages: locked pages covering the defrag range
+ * @nr_pages: number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ * Pages should be locked, no ordered extent in the pages range,
+ * no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+ struct defrag_target_range *target,
+ struct page **pages, int nr_pages,
+ struct extent_state **cached_state)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_changeset *data_reserved = NULL;
+ const u64 start = target->start;
+ const u64 len = target->len;
+ unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+ unsigned long start_index = start >> PAGE_SHIFT;
+ unsigned long first_index = page_index(pages[0]);
+ int ret = 0;
+ int i;
+
+ ASSERT(last_index - first_index + 1 <= nr_pages);
+
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+ if (ret < 0)
+ return ret;
+ clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 0, 0, cached_state);
+ set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
- pages[i] = page;
- i_done++;
+ /* Update the page status */
+ for (i = start_index - first_index; i <= last_index - first_index; i++) {
+ ClearPageChecked(pages[i]);
+ btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
}
- if (!i_done || ret)
- goto out;
+ btrfs_delalloc_release_extents(inode, len);
+ extent_changeset_free(data_reserved);
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- goto out;
+ return ret;
+}
- /*
- * so now we have a nice long stream of locked
- * and up to date pages, lets wait on them
- */
- for (i = 0; i < i_done; i++)
- wait_on_page_writeback(pages[i]);
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+ u32 extent_thresh, u64 newer_than, bool do_compress)
+{
+ struct extent_state *cached_state = NULL;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ struct page **pages;
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+ u64 start_index = start >> PAGE_SHIFT;
+ unsigned int nr_pages = last_index - start_index + 1;
+ int ret = 0;
+ int i;
+
+ ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+ ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
- page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
- lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ /* Prepare all pages */
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = defrag_prepare_one_page(inode, start_index + i);
+ if (IS_ERR(pages[i])) {
+ ret = PTR_ERR(pages[i]);
+ pages[i] = NULL;
+ goto free_pages;
+ }
+ }
+ for (i = 0; i < nr_pages; i++)
+ wait_on_page_writeback(pages[i]);
+ /* Lock the pages range */
+ lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
/*
- * When defragmenting we skip ranges that have holes or inline extents,
- * (check should_defrag_range()), to avoid unnecessary IO and wasting
- * space. At btrfs_defrag_file(), we check if a range should be defragged
- * before locking the inode and then, if it should, we trigger a sync
- * page cache readahead - we lock the inode only after that to avoid
- * blocking for too long other tasks that possibly want to operate on
- * other file ranges. But before we were able to get the inode lock,
- * some other task may have punched a hole in the range, or we may have
- * now an inline extent, in which case we should not defrag. So check
- * for that here, where we have the inode and the range locked, and bail
- * out if that happened.
+ * Now that we have a consistent view of the extent map, re-check
+ * which ranges really need to be defragged.
+ *
+ * This time the extent range is already locked, so pass @locked = true
+ * to avoid relocking it and causing a deadlock.
*/
- search_start = page_start;
- while (search_start < page_end) {
- struct extent_map *em;
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, true,
+ &target_list);
+ if (ret < 0)
+ goto unlock_extent;
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
- page_end - search_start);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock_range;
- }
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- /* Ok, 0 means we did not defrag anything */
- ret = 0;
- goto out_unlock_range;
+ list_for_each_entry(entry, &target_list, list) {
+ ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+ &cached_state);
+ if (ret < 0)
+ break;
+ }
+
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
+ }
+unlock_extent:
+ unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+ (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+ &cached_state);
+free_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
}
- search_start = extent_map_end(em);
- free_extent_map(em);
}
+ kfree(pages);
+ return ret;
+}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
- page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 0, 0, &cached_state);
+static int defrag_one_cluster(struct btrfs_inode *inode,
+ struct file_ra_state *ra,
+ u64 start, u32 len, u32 extent_thresh,
+ u64 newer_than, bool do_compress,
+ unsigned long *sectors_defragged,
+ unsigned long max_sectors)
+{
+ const u32 sectorsize = inode->root->fs_info->sectorsize;
+ struct defrag_target_range *entry;
+ struct defrag_target_range *tmp;
+ LIST_HEAD(target_list);
+ int ret;
- if (i_done != page_cnt) {
- spin_lock(&BTRFS_I(inode)->lock);
- btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
- spin_unlock(&BTRFS_I(inode)->lock);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, (page_cnt - i_done) << PAGE_SHIFT, true);
- }
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+ ret = defrag_collect_targets(inode, start, len, extent_thresh,
+ newer_than, do_compress, false,
+ &target_list);
+ if (ret < 0)
+ goto out;
+ list_for_each_entry(entry, &target_list, list) {
+ u32 range_len = entry->len;
- set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state);
+ /* Reached the limit */
+ if (max_sectors && max_sectors == *sectors_defragged)
+ break;
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
+ if (max_sectors)
+ range_len = min_t(u32, range_len,
+ (max_sectors - *sectors_defragged) * sectorsize);
- for (i = 0; i < i_done; i++) {
- clear_page_dirty_for_io(pages[i]);
- ClearPageChecked(pages[i]);
- set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (ra)
+ page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+ ra, NULL, entry->start >> PAGE_SHIFT,
+ ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+ (entry->start >> PAGE_SHIFT) + 1);
+ /*
+ * We may not defrag any range here if holes were punched before
+ * we locked the pages.
+ * That's fine; it only affects the @sectors_defragged
+ * accounting.
+ */
+ ret = defrag_one_range(inode, entry->start, range_len,
+ extent_thresh, newer_than, do_compress);
+ if (ret < 0)
+ break;
+ *sectors_defragged += range_len;
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
- return i_done;
-
-out_unlock_range:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, &cached_state);
out:
- for (i = 0; i < i_done; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
+ list_for_each_entry_safe(entry, tmp, &target_list, list) {
+ list_del_init(&entry->list);
+ kfree(entry);
}
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- start, page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
- extent_changeset_free(data_reserved);
return ret;
-
}
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode: inode to be defragged
+ * @ra: readahead state (can be NULL)
+ * @range: defrag options including range and flags
+ * @newer_than: minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged; if 0, the whole
+ * inode will be defragged.
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct file_ra_state *ra = NULL;
- unsigned long last_index;
+ unsigned long sectors_defragged = 0;
u64 isize = i_size_read(inode);
- u64 last_len = 0;
- u64 skip = 0;
- u64 defrag_end = 0;
- u64 newer_off = range->start;
- unsigned long i;
- unsigned long ra_index = 0;
- int ret;
- int defrag_count = 0;
+ u64 cur;
+ u64 last_byte;
+ bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+ bool ra_allocated = false;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int ret = 0;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
- unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)SZ_128K - 1);
- struct page **pages = NULL;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
if (isize == 0)
return 0;
if (extent_thresh == 0)
extent_thresh = SZ_256K;
+ if (range->start + range->len > range->start) {
+ /* Got a specific range */
+ last_byte = min(isize, range->start + range->len) - 1;
+ } else {
+ /* Defrag until file end */
+ last_byte = isize - 1;
+ }
+
/*
- * If we were not given a file, allocate a readahead context. As
+ * If we were not given a ra, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
- if (!file) {
+ if (!ra) {
+ ra_allocated = true;
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
- } else {
- ra = &file->f_ra;
- }
-
- pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out_ra;
- }
-
- /* find the last page to defrag */
- if (range->start + range->len > range->start) {
- last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_SHIFT;
- } else {
- last_index = (isize - 1) >> PAGE_SHIFT;
- }
-
- if (newer_than) {
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- /*
- * we always align our defrag to help keep
- * the extents in the file evenly spaced
- */
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else
- goto out_ra;
- } else {
- i = range->start >> PAGE_SHIFT;
}
- if (!max_to_defrag)
- max_to_defrag = last_index - i + 1;
- /*
- * make writeback starts from i, so the defrag range can be
- * written sequentially.
- */
- if (i < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = i;
-
- while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
- /*
- * make sure we stop running if someone unmounts
- * the FS
- */
- if (!(inode->i_sb->s_flags & SB_ACTIVE))
- break;
-
- if (btrfs_defrag_cancelled(fs_info)) {
- btrfs_debug(fs_info, "defrag_file cancelled");
- ret = -EAGAIN;
- goto error;
- }
+ /* Align the range */
+ cur = round_down(range->start, fs_info->sectorsize);
+ last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
- if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
- extent_thresh, &last_len, &skip,
- &defrag_end, do_compress)){
- unsigned long next;
- /*
- * the should_defrag function tells us how much to skip
- * bump our counter by the suggested amount
- */
- next = DIV_ROUND_UP(skip, PAGE_SIZE);
- i = max(i + 1, next);
- continue;
- }
+ while (cur < last_byte) {
+ u64 cluster_end;
- if (!newer_than) {
- cluster = (PAGE_ALIGN(defrag_end) >>
- PAGE_SHIFT) - i;
- cluster = min(cluster, max_cluster);
- } else {
- cluster = max_cluster;
- }
+ /* The cluster size 256K should always be page aligned */
+ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
- if (i + cluster > ra_index) {
- ra_index = max(i, ra_index);
- if (ra)
- page_cache_sync_readahead(inode->i_mapping, ra,
- file, ra_index, cluster);
- ra_index += cluster;
- }
+ /* We want the cluster end at a page boundary when possible */
+ cluster_end = (((cur >> PAGE_SHIFT) +
+ (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+ cluster_end = min(cluster_end, last_byte);
btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
- } else {
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+ btrfs_inode_unlock(inode, 0);
+ break;
}
- if (ret < 0) {
+ if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
btrfs_inode_unlock(inode, 0);
- goto out_ra;
+ break;
}
-
- defrag_count += ret;
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ if (do_compress)
+ BTRFS_I(inode)->defrag_compress = compress_type;
+ ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ cluster_end + 1 - cur, extent_thresh,
+ newer_than, do_compress,
+ &sectors_defragged, max_to_defrag);
btrfs_inode_unlock(inode, 0);
-
- if (newer_than) {
- if (newer_off == (u64)-1)
- break;
-
- if (ret > 0)
- i += ret;
-
- newer_off = max(newer_off + 1,
- (u64)i << PAGE_SHIFT);
-
- ret = find_new_extents(root, inode, newer_than,
- &newer_off, SZ_64K);
- if (!ret) {
- range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_SHIFT;
- } else {
- break;
- }
- } else {
- if (ret > 0) {
- i += ret;
- last_len += ret << PAGE_SHIFT;
- } else {
- i++;
- last_len = 0;
- }
- }
+ if (ret < 0)
+ break;
+ cur = cluster_end + 1;
}
- ret = defrag_count;
-error:
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
+ if (ra_allocated)
+ kfree(ra);
+ if (sectors_defragged) {
+ /*
+ * We have defragged some sectors; for the compression case they
+ * need to be written back immediately.
+ */
+ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
+ if (range->compress_type == BTRFS_COMPRESS_LZO)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+ else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+ btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+ ret = sectors_defragged;
}
-
- if (range->compress_type == BTRFS_COMPRESS_LZO) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- }
-
-out_ra:
if (do_compress) {
btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
btrfs_inode_unlock(inode, 0);
}
- if (!file)
- kfree(ra);
- kfree(pages);
return ret;
}
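For reference, the cluster split in the loop above always covers one 256K chunk starting at the page that contains cur, ending on a page boundary (clamped to last_byte near EOF). A standalone arithmetic check of that computation, as a sketch assuming 4K pages:

#include <assert.h>

int main(void)
{
	const unsigned long long page_shift = 12;	/* assumed 4K pages */
	const unsigned long long sz_256k = 256 * 1024;
	unsigned long long cur = 68 * 1024;		/* example offset */
	unsigned long long cluster_end;

	/* Same formula as in btrfs_defrag_file(): one 256K span from cur's page */
	cluster_end = (((cur >> page_shift) + (sz_256k >> page_shift))
		       << page_shift) - 1;
	assert(cluster_end == 324 * 1024 - 1);
	return 0;
}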
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
btrfs_info(fs_info, "resizing devid %llu", devid);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
goto out;
}
- /* Subpage defrag will be supported in later commits */
- if (root->fs_info->sectorsize < PAGE_SIZE) {
- ret = -ENOTTY;
- goto out;
- }
-
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), file,
+ ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
- goto err_drop;
+ goto out;
}
if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
+
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
- if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
- strcmp("cancel", vol_args->name) == 0)
+ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+ args.devid = vol_args->devid;
+ } else if (!strcmp("cancel", vol_args->name)) {
cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret)
- goto out;
- /* Exclusive operation is now claimed */
+ goto err_drop;
- if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
- ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
- else
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ /* Exclusive operation is now claimed */
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
-out:
- kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args)) {
- ret = PTR_ERR(vol_args);
- goto out_drop_write;
- }
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- cancel = (strcmp("cancel", vol_args->name) == 0);
+ if (!strcmp("cancel", vol_args->name)) {
+ cancel = true;
+ } else {
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+ if (ret)
+ goto out;
+ }
+
+ ret = mnt_want_write_file(file);
+ if (ret)
+ goto out;
ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
cancel);
if (ret == 0) {
- ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
btrfs_exclop_finish(fs_info);
}
- kfree(vol_args);
-out_drop_write:
mnt_drop_write_file(file);
if (bdev)
blkdev_put(bdev, mode);
+out:
+ btrfs_put_dev_args_from_path(&args);
+ kfree(vol_args);
return ret;
}
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
- char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
+ args.devid = di_args->devid;
if (!btrfs_is_empty_uuid(di_args->uuid))
- s_uuid = di_args->uuid;
+ args.uuid = di_args->uuid;
rcu_read_lock();
- dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
- NULL);
-
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
ret = -ENODEV;
goto out;
void __user *arg)
{
struct btrfs_ioctl_quota_rescan_args qsa = {0};
- int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
}
if (copy_to_user(arg, &qsa, sizeof(qsa)))
- ret = -EFAULT;
+ return -EFAULT;
- return ret;
+ return 0;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
- lockdep_assert_held(&eb->lock);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+ lockdep_assert_held_write(&eb->lock);
}
#else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
* payload.
* One regular LZO compressed extent can have one or more segments.
* For an inlined LZO compressed extent, only one segment is allowed.
- * One segment represents at most one page of uncompressed data.
+ * One segment represents at most one sector of uncompressed data.
*
* 2.1 Segment header
* Fixed size. LZO_LEN (4) bytes long, LE32.
* Records the total size of the segment (not including the header).
- * Segment header never crosses page boundary, thus it's possible to
- * have at most 3 padding zeros at the end of the page.
+ * Segment header never crosses sector boundary, thus it's possible to
+ * have at most 3 padding zeros at the end of the sector.
*
* 2.2 Data Payload
- * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- * which is 4419 for a 4KiB page.
+ * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize)
+ * which is 4419 for a 4KiB sectorsize.
*
- * Example:
+ * Example with 4K sectorsize:
* Page 1:
* 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10
* 0x0000 | Header | SegHdr 01 | Data payload 01 ... |
return le32_to_cpu(dlen);
}
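The segment headers described in the format comment above are plain 4-byte little-endian lengths (LZO_LEN == 4). A user-space model of the write/read pair follows; the in-kernel helpers copy a __le32 rather than shifting bytes, so this is only an equivalent sketch:

#include <stdint.h>

static void put_segment_len(uint8_t *buf, uint32_t len)
{
	buf[0] = len & 0xff;
	buf[1] = (len >> 8) & 0xff;
	buf[2] = (len >> 16) & 0xff;
	buf[3] = (len >> 24) & 0xff;
}

static uint32_t get_segment_len(const uint8_t *buf)
{
	return (uint32_t)buf[0] | ((uint32_t)buf[1] << 8) |
	       ((uint32_t)buf[2] << 16) | ((uint32_t)buf[3] << 24);
}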
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header
+ * If not, we pad at most LZO_LEN - 1 (i.e. 3) zero bytes.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+ size_t compressed_size,
+ struct page **out_pages,
+ u32 *cur_out,
+ const u32 sectorsize)
+{
+ u32 sector_bytes_left;
+ u32 orig_out;
+ struct page *cur_page;
+ char *kaddr;
+
+ /*
+ * We never allow a segment header to cross a sector boundary; the previous
+ * run should have left enough space inside the sector.
+ */
+ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+
+ kaddr = kmap(cur_page);
+ write_compress_length(kaddr + offset_in_page(*cur_out),
+ compressed_size);
+ *cur_out += LZO_LEN;
+
+ orig_out = *cur_out;
+
+ /* Copy compressed data */
+ while (*cur_out - orig_out < compressed_size) {
+ u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+ orig_out + compressed_size - *cur_out);
+
+ kunmap(cur_page);
+ cur_page = out_pages[*cur_out / PAGE_SIZE];
+ /* Allocate a new page */
+ if (!cur_page) {
+ cur_page = alloc_page(GFP_NOFS);
+ if (!cur_page)
+ return -ENOMEM;
+ out_pages[*cur_out / PAGE_SIZE] = cur_page;
+ }
+ kaddr = kmap(cur_page);
+
+ memcpy(kaddr + offset_in_page(*cur_out),
+ compressed_data + *cur_out - orig_out, copy_len);
+
+ *cur_out += copy_len;
+ }
+
+ /*
+ * Check if we can fit the next segment header into the remaining space
+ * of the sector.
+ */
+ sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+ if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+ goto out;
+
+ /* The remaining size is not enough, pad it with zeros */
+ memset(kaddr + offset_in_page(*cur_out), 0,
+ sector_bytes_left);
+ *cur_out += sector_bytes_left;
+
+out:
+ kunmap(cur_page);
+ return 0;
+}
+
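To make the sector-tail padding rule above concrete (assuming a 4K sectorsize): if a payload ends at output offset 4093, only 3 bytes remain in the sector, which cannot hold a 4-byte header, so those 3 bytes are zero-filled and the next header starts at 4096; if the payload ends exactly on a sector boundary, sector_bytes_left is 0 and nothing is padded. A small standalone sketch of the same decision:

/* How many zero bytes to pad so the next 4-byte segment header does not
 * cross a sector boundary; mirrors the sector_bytes_left logic above. */
static unsigned int pad_before_next_header(unsigned int cur_out,
					   unsigned int sectorsize)
{
	unsigned int left = (sectorsize - (cur_out % sectorsize)) % sectorsize;

	if (left >= 4 || left == 0)
		return 0;	/* header fits, or we are exactly at a boundary */
	return left;		/* pad 1-3 zero bytes */
}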
int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+ struct page *page_in = NULL;
+ char *sizes_ptr;
int ret = 0;
- char *data_in;
- char *cpage_out, *sizes_ptr;
- int nr_pages = 0;
- struct page *in_page = NULL;
- struct page *out_page = NULL;
- unsigned long bytes_left;
- unsigned long len = *total_out;
- unsigned long nr_dest_pages = *out_pages;
- const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
- size_t in_len;
- size_t out_len;
- char *buf;
- unsigned long tot_in = 0;
- unsigned long tot_out = 0;
- unsigned long pg_bytes_left;
- unsigned long out_offset;
- unsigned long bytes;
+ /* Points to the file offset of input data */
+ u64 cur_in = start;
+ /* Points to the current output byte */
+ u32 cur_out = 0;
+ u32 len = *total_out;
*out_pages = 0;
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
-
/*
- * store the size of all chunks of compressed data in
- * the first 4 bytes
+ * Skip the header for now; we will come back later and write the total
+ * compressed size
*/
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- out_offset = LZO_LEN;
- tot_out = LZO_LEN;
- pages[0] = out_page;
- nr_pages = 1;
- pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
- /* compress at most one page of data each time */
- in_len = min(len, PAGE_SIZE);
- while (tot_in < len) {
- ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
- &out_len, workspace->mem);
- if (ret != LZO_E_OK) {
- pr_debug("BTRFS: lzo in loop returned %d\n",
- ret);
+ cur_out += LZO_LEN;
+ while (cur_in < start + len) {
+ char *data_in;
+ const u32 sectorsize_mask = sectorsize - 1;
+ u32 sector_off = (cur_in - start) & sectorsize_mask;
+ u32 in_len;
+ size_t out_len;
+
+ /* Get the input page first */
+ if (!page_in) {
+ page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+ ASSERT(page_in);
+ }
+
+ /* Compress at most one sector of data each time */
+ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+ ASSERT(in_len);
+ data_in = kmap(page_in);
+ ret = lzo1x_1_compress(data_in +
+ offset_in_page(cur_in), in_len,
+ workspace->cbuf, &out_len,
+ workspace->mem);
+ kunmap(page_in);
+ if (ret < 0) {
+ pr_debug("BTRFS: lzo in loop returned %d\n", ret);
ret = -EIO;
goto out;
}
- /* store the size of this chunk of compressed data */
- write_compress_length(cpage_out + out_offset, out_len);
- tot_out += LZO_LEN;
- out_offset += LZO_LEN;
- pg_bytes_left -= LZO_LEN;
-
- tot_in += in_len;
- tot_out += out_len;
-
- /* copy bytes from the working buffer into the pages */
- buf = workspace->cbuf;
- while (out_len) {
- bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
- memcpy(cpage_out + out_offset, buf, bytes);
-
- out_len -= bytes;
- pg_bytes_left -= bytes;
- buf += bytes;
- out_offset += bytes;
-
- /*
- * we need another page for writing out.
- *
- * Note if there's less than 4 bytes left, we just
- * skip to a new page.
- */
- if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
- pg_bytes_left == 0) {
- if (pg_bytes_left) {
- memset(cpage_out + out_offset, 0,
- pg_bytes_left);
- tot_out += pg_bytes_left;
- }
-
- /* we're done, don't allocate new page */
- if (out_len == 0 && tot_in >= len)
- break;
-
- kunmap(out_page);
- if (nr_pages == nr_dest_pages) {
- out_page = NULL;
- ret = -E2BIG;
- goto out;
- }
-
- out_page = alloc_page(GFP_NOFS);
- if (out_page == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- cpage_out = kmap(out_page);
- pages[nr_pages++] = out_page;
-
- pg_bytes_left = PAGE_SIZE;
- out_offset = 0;
- }
- }
+ ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+ pages, &cur_out, sectorsize);
+ if (ret < 0)
+ goto out;
- /* we're making it bigger, give up */
- if (tot_in > 8192 && tot_in < tot_out) {
+ cur_in += in_len;
+
+ /*
+ * Check if we are making the data bigger after compressing two
+ * sectors; if so, give up.
+ */
+ if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
ret = -E2BIG;
goto out;
}
- /* we're all done */
- if (tot_in >= len)
- break;
-
- if (tot_out > max_out)
- break;
-
- bytes_left = len - tot_in;
- kunmap(in_page);
- put_page(in_page);
-
- start += PAGE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_SHIFT);
- data_in = kmap(in_page);
- in_len = min(bytes_left, PAGE_SIZE);
- }
-
- if (tot_out >= tot_in) {
- ret = -E2BIG;
- goto out;
+ /* Check if we have reached a page boundary */
+ if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+ put_page(page_in);
+ page_in = NULL;
+ }
}
- /* store the size of all chunks of compressed data */
+ /* Store the size of all chunks of compressed data */
sizes_ptr = kmap_local_page(pages[0]);
- write_compress_length(sizes_ptr, tot_out);
+ write_compress_length(sizes_ptr, cur_out);
kunmap_local(sizes_ptr);
ret = 0;
- *total_out = tot_out;
- *total_in = tot_in;
+ *total_out = cur_out;
+ *total_in = cur_in - start;
out:
- *out_pages = nr_pages;
- if (out_page)
- kunmap(out_page);
-
- if (in_page) {
- kunmap(in_page);
- put_page(in_page);
- }
-
+ *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
return ret;
}
};
struct btrfs_raid_bio {
- struct btrfs_fs_info *fs_info;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
btrfs_init_work(&rbio->work, work_func, NULL, NULL);
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+ btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}
/*
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->bbio->raid_map[0];
+ u64 num = rbio->bioc->raid_map[0];
/*
* we shift down quite a bit. We're using byte
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
h = table->table + bucket;
/* hold the lock for the bucket because we may be
if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
__remove_rbio_from_cache(rbio);
if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
return;
- table = rbio->fs_info->stripe_hash_table;
+ table = rbio->bioc->fs_info->stripe_hash_table;
spin_lock_irqsave(&table->cache_lock, flags);
spin_lock(&rbio->bio_list_lock);
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->bbio->raid_map[0] !=
- cur->bbio->raid_map[0])
+ if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
return 0;
/* we can't merge with different operations */
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
- h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+ h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
continue;
spin_lock(&cur->bio_list_lock);
int keep_cache = 0;
bucket = rbio_bucket(rbio);
- h = rbio->fs_info->stripe_hash_table->table + bucket;
+ h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
if (list_empty(&rbio->plug_list))
cache_rbio(rbio);
}
}
- btrfs_put_bbio(rbio->bbio);
+ btrfs_put_bioc(rbio->bioc);
kfree(rbio);
}
struct bio *extra;
if (rbio->generic_bio_cnt)
- btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
/* OK, we have read all the stripes we need to. */
max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
- 0 : rbio->bbio->max_errors;
+ 0 : rbio->bioc->max_errors;
if (atomic_read(&rbio->error) > max_errors)
err = BLK_STS_IOERR;
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
- struct btrfs_bio *bbio,
+ struct btrfs_io_context *bioc,
u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
int num_pages = rbio_nr_pages(stripe_len, real_stripes);
int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
void *p;
spin_lock_init(&rbio->bio_list_lock);
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
- rbio->bbio = bbio;
- rbio->fs_info = fs_info;
+ rbio->bioc = bioc;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
rbio->real_stripes = real_stripes;
CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
nr_data = real_stripes - 1;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
BUG();
struct bio *last = bio_list->tail;
int ret;
struct bio *bio;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
u64 disk_start;
- stripe = &rbio->bbio->stripes[stripe_nr];
+ stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
}
/* put a new bio on the list */
- bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
- btrfs_io_bio(bio)->device = stripe->dev;
+ bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+ btrfs_bio(bio)->device = stripe->dev;
bio->bi_iter.bi_size = 0;
bio_set_dev(bio, stripe->dev->bdev);
bio->bi_iter.bi_sector = disk_start >> 9;
int i = 0;
start = bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->bbio->raid_map[0];
+ stripe_offset = start - rbio->bioc->raid_map[0];
page_index = stripe_offset >> PAGE_SHIFT;
if (bio_flagged(bio, BIO_CLONED))
- bio->bi_iter = btrfs_io_bio(bio)->iter;
+ bio->bi_iter = btrfs_bio(bio)->iter;
bio_for_each_segment(bvec, bio, iter) {
rbio->bio_pages[page_index + i] = bvec.bv_page;
*/
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
int nr_data = rbio->nr_data;
int stripe;
}
}
- if (likely(!bbio->num_tgtdevs))
+ if (likely(!bioc->num_tgtdevs))
goto write_data;
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- if (!bbio->tgtdev_map[stripe])
+ if (!bioc->tgtdev_map[stripe])
continue;
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
}
ret = rbio_add_io_page(rbio, &bio_list, page,
- rbio->bbio->tgtdev_map[stripe],
+ rbio->bioc->tgtdev_map[stripe],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
{
u64 physical = bio->bi_iter.bi_sector;
int i;
- struct btrfs_bio_stripe *stripe;
+ struct btrfs_io_stripe *stripe;
physical <<= 9;
- for (i = 0; i < rbio->bbio->num_stripes; i++) {
- stripe = &rbio->bbio->stripes[i];
+ for (i = 0; i < rbio->bioc->num_stripes; i++) {
+ stripe = &rbio->bioc->stripes[i];
if (in_range(physical, stripe->physical, rbio->stripe_len) &&
stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
return i;
int i;
for (i = 0; i < rbio->nr_data; i++) {
- u64 stripe_start = rbio->bbio->raid_map[i];
+ u64 stripe_start = rbio->bioc->raid_map[i];
if (in_range(logical, stripe_start, rbio->stripe_len))
return i;
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
/*
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_rmw_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
/*
* our main entry point for writes from the rest of the FS.
*/
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
}
/* all raid6 handling here */
- if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->bbio->raid_map[faila] ==
+ if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bioc->raid_map[faila] ==
RAID5_P_STRIPE) {
err = BLK_STS_IOERR;
goto cleanup;
goto pstripe;
}
- if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
if (!atomic_dec_and_test(&rbio->stripes_pending))
return;
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
rbio_orig_end_io(rbio, BLK_STS_IOERR);
else
__raid_recover_end_io(rbio);
* were up to date, or we might have no bios to read because
* the devices were gone.
*/
- if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+ if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
__raid_recover_end_io(rbio);
return 0;
} else {
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid_recover_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
* so we assume the bio they send down corresponds to a failed part
* of the drive.
*/
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io)
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int ret;
if (generic_io) {
- ASSERT(bbio->mirror_num == mirror_num);
- btrfs_io_bio(bio)->mirror_num = mirror_num;
+ ASSERT(bioc->mirror_num == mirror_num);
+ btrfs_bio(bio)->mirror_num = mirror_num;
}
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio)) {
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return PTR_ERR(rbio);
}
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
btrfs_warn(fs_info,
- "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
__func__, bio->bi_iter.bi_sector << 9,
- (u64)bio->bi_iter.bi_size, bbio->map_type);
+ (u64)bio->bi_iter.bi_size, bioc->map_type);
if (generic_io)
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(rbio);
return -EIO;
}
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
} else {
- btrfs_get_bbio(bbio);
+ btrfs_get_bioc(bioc);
}
/*
/*
* The following code is used to scrub/replace the parity stripe
*
- * Caller must have already increased bio_counter for getting @bbio.
+ * Caller must have already increased bio_counter for getting @bioc.
*
* Note: We need make sure all the pages that add into the scrub/replace
* raid bio are correct and not be changed during the scrub/replace. That
* is those pages just hold metadata or file data with checksum.
*/
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors)
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc,
+ u64 stripe_len, struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(fs_info, bbio, stripe_len);
+ rbio = alloc_rbio(fs_info, bioc, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
/*
- * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+ * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
* to the end position, so this search can start from the first parity
* stripe.
*/
for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
- if (bbio->stripes[i].dev == scrub_dev) {
+ if (bioc->stripes[i].dev == scrub_dev) {
rbio->scrubp = i;
break;
}
bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
/*
- * We have already increased bio_counter when getting bbio, record it
+ * We have already increased bio_counter when getting bioc, record it
* so we can free it at rbio_orig_end_io().
*/
rbio->generic_bio_cnt = 1;
int stripe_offset;
int index;
- ASSERT(logical >= rbio->bbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+ ASSERT(logical >= rbio->bioc->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
- struct btrfs_bio *bbio = rbio->bbio;
+ struct btrfs_io_context *bioc = rbio->bioc;
void **pointers = rbio->finish_pointers;
unsigned long *pbitmap = rbio->finish_pbitmap;
int nr_data = rbio->nr_data;
else
BUG();
- if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+ if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
is_replace = 1;
bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
}
page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
ret = rbio_add_io_page(rbio, &bio_list, page,
- bbio->tgtdev_map[rbio->scrubp],
+ bioc->tgtdev_map[rbio->scrubp],
pagenr, rbio->stripe_len);
if (ret)
goto cleanup;
*/
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
goto cleanup;
if (rbio->faila >= 0 || rbio->failb >= 0) {
* the data, so the capability of the repair is declined.
* (In the case of RAID5, we can not repair anything)
*/
- if (dfail > rbio->bbio->max_errors - 1)
+ if (dfail > rbio->bioc->max_errors - 1)
goto cleanup;
/*
}
/*
- * the bbio may be freed once we submit the last bio. Make sure
- * not to touch it after that
+ * The bioc may be freed once we submit the last bio. Make sure not to
+ * touch it after that.
*/
atomic_set(&rbio->stripes_pending, bios_to_read);
while ((bio = bio_list_pop(&bio_list))) {
bio->bi_end_io = raid56_parity_scrub_end_io;
bio->bi_opf = REQ_OP_READ;
- btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+ btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
submit_bio(bio);
}
/* The following code is used for dev replace of a missing RAID 5/6 device. */
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length)
{
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
struct btrfs_raid_bio *rbio;
- rbio = alloc_rbio(fs_info, bbio, length);
+ rbio = alloc_rbio(fs_info, bioc, length);
if (IS_ERR(rbio))
return NULL;
}
/*
- * When we get bbio, we have already increased bio_counter, record it
+ * When we get bioc, we have already increased bio_counter, record it
* so we can free it at rbio_orig_end_io()
*/
rbio->generic_bio_cnt = 1;
struct btrfs_raid_bio;
struct btrfs_device;
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len);
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical);
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 stripe_len,
- struct btrfs_device *scrub_dev,
- unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+ struct btrfs_io_context *bioc, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
+ unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
- struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+ u64 length);
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
}
static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
- struct btrfs_bio *bbio)
+ struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
kref_init(&zone->refcnt);
zone->elems = 0;
zone->device = dev; /* our device always sits at index 0 */
- for (i = 0; i < bbio->num_stripes; ++i) {
+ for (i = 0; i < bioc->num_stripes; ++i) {
/* bounds have already been checked */
- zone->devs[i] = bbio->stripes[i].dev;
+ zone->devs[i] = bioc->stripes[i].dev;
}
- zone->ndevs = bbio->num_stripes;
+ zone->ndevs = bioc->num_stripes;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
int ret;
struct reada_extent *re = NULL;
struct reada_extent *re_exist = NULL;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_device *dev;
struct btrfs_device *prev_dev;
u64 length;
*/
length = fs_info->nodesize;
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio, 0);
- if (ret || !bbio || length < fs_info->nodesize)
+ &length, &bioc, 0);
+ if (ret || !bioc || length < fs_info->nodesize)
goto error;
- if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+ if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
btrfs_err(fs_info,
"readahead: more than %d copies not supported",
BTRFS_MAX_MIRRORS);
goto error;
}
- real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
for (nzones = 0; nzones < real_stripes; ++nzones) {
struct reada_zone *zone;
- dev = bbio->stripes[nzones].dev;
+ dev = bioc->stripes[nzones].dev;
/* cannot read ahead on missing device. */
if (!dev->bdev)
continue;
- zone = reada_find_zone(dev, logical, bbio);
+ zone = reada_find_zone(dev, logical, bioc);
if (!zone)
continue;
if (!have_zone)
goto error;
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return re;
error:
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
kfree(re);
return re_exist;
}
if (generic_ref->type == BTRFS_REF_METADATA) {
if (!parent)
- ref_root = generic_ref->tree_ref.root;
+ ref_root = generic_ref->tree_ref.owning_root;
owner = generic_ref->tree_ref.level;
} else if (!parent) {
- ref_root = generic_ref->data_ref.ref_root;
+ ref_root = generic_ref->data_ref.owning_root;
owner = generic_ref->data_ref.ino;
offset = generic_ref->data_ref.offset;
}
}
btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- ClearPageChecked(page);
+ btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
out_unlock:
if (page) {
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
- int ret;
+ int ret = 0;
u64 i, tail_len, chunk_count;
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
#include "backref.h"
#include "misc.h"
#include "subpage.h"
+#include "zoned.h"
/*
* Relocation overview
key.offset -= btrfs_file_extent_offset(leaf, fi);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
num_bytes, parent);
- ref.real_root = root->root_key.objectid;
btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset,
+ root->root_key.objectid, false);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
blocksize, path->nodes[level]->start);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
- ref.skip_qgroup = true;
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+ true);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
- btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
- btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
- ref.skip_qgroup = true;
+ btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+ 0, true);
ret = btrfs_free_extent(trans, &ref);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
node->eb->start, blocksize,
upper->eb->start);
- ref.real_root = root->root_key.objectid;
btrfs_init_tree_ref(&ref, node->level,
- btrfs_header_owner(upper->eb));
+ btrfs_header_owner(upper->eb),
+ root->root_key.objectid, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
+ if (root == root->fs_info->chunk_root)
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
+ if (root == root->fs_info->chunk_root)
+ btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
}
if (ret)
return ret;
- /*
- * On a zoned filesystem, we cannot preallocate the file region.
- * Instead, we dirty and fiemap_write the region.
- */
- if (btrfs_is_zoned(inode->root->fs_info)) {
- struct btrfs_root *root = inode->root;
- struct btrfs_trans_handle *trans;
-
- end = cluster->end - offset + 1;
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- i_size_write(&inode->vfs_inode, end);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- return ret;
- }
-
- return btrfs_end_transaction(trans);
- }
-
btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
return ret;
}
-static noinline_for_stack
-int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
- u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+ u64 start, u64 end, u64 block_start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
file_ra_state_init(ra, inode->i_mapping);
- ret = setup_extent_mapping(inode, cluster->start - offset,
+ ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out;
for (index = (cluster->start - offset) >> PAGE_SHIFT;
index <= last_index && !ret; index++)
ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
- if (btrfs_is_zoned(fs_info) && !ret)
- ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
- u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
int ret;
- if (btrfs_is_zoned(trans->fs_info))
- flags &= ~BTRFS_INODE_PREALLOC;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
rc->block_group->start,
rc->block_group->length);
+ ret = btrfs_zone_finish(rc->block_group);
+ WARN_ON(ret && ret != -EAGAIN);
+
while (1) {
int finishes_stage;
if (!rc)
return 0;
- BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
- root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
struct scrub_recover {
refcount_t refs;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 map_length;
};
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
{
return spage->recover &&
- (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+ (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(recover->bbio);
+ btrfs_put_bioc(recover->bioc);
kfree(recover);
}
}
sblock_other = sblocks_for_recheck + mirror_index;
} else {
struct scrub_recover *r = sblock_bad->pagev[0]->recover;
- int max_allowed = r->bbio->num_stripes -
- r->bbio->num_tgtdevs;
+ int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
if (mirror_index >= max_allowed)
break;
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
return 2;
- else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
return 3;
else
- return (int)bbio->num_stripes;
+ return (int)bioc->num_stripes;
}
static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
u64 flags = original_sblock->pagev[0]->flags;
u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
- bbio = NULL;
+ bioc = NULL;
/*
* With a length of sectorsize, each returned stripe represents
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < sublen) {
- btrfs_put_bbio(bbio);
+ logical, &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < sublen) {
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
- recover->bbio = bbio;
+ recover->bioc = bioc;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
- nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+ nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
- bbio->map_type,
- bbio->raid_map,
+ bioc->map_type,
+ bioc->raid_map,
mapped_length,
- bbio->num_stripes -
- bbio->num_tgtdevs,
+ bioc->num_stripes -
+ bioc->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
- spage->physical = bbio->stripes[stripe_index].physical +
+ spage->physical = bioc->stripes[stripe_index].physical +
stripe_offset;
- spage->dev = bbio->stripes[stripe_index].dev;
+ spage->dev = bioc->stripes[stripe_index].dev;
BUG_ON(page_index >= original_sblock->page_count);
spage->physical_for_dev_replace =
bio->bi_end_io = scrub_bio_wait_endio;
mirror_num = spage->sblock->pagev[0]->mirror_num;
- ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+ ret = raid56_parity_recover(bio, spage->recover->bioc,
spage->recover->map_length,
mirror_num, 0);
if (ret)
if (!first_page->dev->bdev)
goto out;
- bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio_set_dev(bio, first_page->dev->bdev);
for (page_num = 0; page_num < sblock->page_count; page_num++) {
}
WARN_ON(!spage->page);
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage->dev->bdev);
bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
return -EIO;
}
- bio = btrfs_io_bio_alloc(1);
+ bio = btrfs_bio_alloc(1);
bio_set_dev(bio, spage_bad->dev->bdev);
bio->bi_iter.bi_sector = spage_bad->physical >> 9;
bio->bi_opf = REQ_OP_WRITE;
sbio->dev = sctx->wr_tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_wr_bio);
sbio->bio = bio;
}
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
+ bio = btrfs_bio_alloc(sctx->pages_per_rd_bio);
sbio->bio = bio;
}
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 length = sblock->page_count * PAGE_SIZE;
u64 logical = sblock->pagev[0]->logical;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct bio *bio;
struct btrfs_raid_bio *rbio;
int ret;
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
if (WARN_ON(!sctx->is_dev_replace ||
- !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
/*
* We shouldn't be scrubbing a missing device. Even for dev
* replace, we should only get here for RAID 5/6. We either
* managed to mount something with no mirrors remaining or
* there's a bug in scrub_remap_extent()/btrfs_map_block().
*/
- goto bbio_out;
+ goto bioc_out;
}
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = logical >> 9;
bio->bi_private = sblock;
bio->bi_end_io = scrub_missing_raid56_end_io;
- rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
+ rbio = raid56_alloc_missing_rbio(bio, bioc, length);
if (!rbio)
goto rbio_out;
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct bio *bio;
struct btrfs_raid_bio *rbio;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 length;
int ret;
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
- &length, &bbio);
- if (ret || !bbio || !bbio->raid_map)
- goto bbio_out;
+ &length, &bioc);
+ if (ret || !bioc || !bioc->raid_map)
+ goto bioc_out;
- bio = btrfs_io_bio_alloc(0);
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio->bi_iter.bi_sector = sparity->logic_start >> 9;
bio->bi_private = sparity;
bio->bi_end_io = scrub_parity_bio_endio;
- rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
- length, sparity->scrub_dev,
+ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+ sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
rbio_out:
bio_put(bio);
-bbio_out:
+bioc_out:
btrfs_bio_counter_dec(fs_info);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 flags;
int ret;
int slot;
extent_len);
mapped_length = extent_len;
- bbio = NULL;
+ bioc = NULL;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
- extent_logical, &mapped_length, &bbio,
+ extent_logical, &mapped_length, &bioc,
0);
if (!ret) {
- if (!bbio || mapped_length < extent_len)
+ if (!bioc || mapped_length < extent_len)
ret = -EIO;
}
if (ret) {
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
goto out;
}
- extent_physical = bbio->stripes[0].physical;
- extent_mirror_num = bbio->mirror_num;
- extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ extent_physical = bioc->stripes[0].physical;
+ extent_mirror_num = bioc->mirror_num;
+ extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
int ret;
struct btrfs_fs_info *fs_info = sctx->fs_info;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
goto out_free_ctx;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct btrfs_device *dev;
struct scrub_ctx *sctx = NULL;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (dev)
sctx = dev->scrub_ctx;
if (sctx)
int *extent_mirror_num)
{
u64 mapped_length;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int ret;
mapped_length = extent_len;
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
- &mapped_length, &bbio, 0);
- if (ret || !bbio || mapped_length < extent_len ||
- !bbio->stripes[0].dev->bdev) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc, 0);
+ if (ret || !bioc || mapped_length < extent_len ||
+ !bioc->stripes[0].dev->bdev) {
+ btrfs_put_bioc(bioc);
return;
}
- *extent_physical = bbio->stripes[0].physical;
- *extent_mirror_num = bbio->mirror_num;
- *extent_dev = bbio->stripes[0].dev;
- btrfs_put_bbio(bbio);
+ *extent_physical = bioc->stripes[0].physical;
+ *extent_mirror_num = bioc->mirror_num;
+ *extent_dev = bioc->stripes[0].dev;
+ btrfs_put_bioc(bioc);
}
u64 total_send_size;
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
+ /* Protocol version compatibility requested */
+ u32 proto;
struct btrfs_root *send_root;
struct btrfs_root *parent_root;
sctx->parent_root->root_key.objectid : 0));
}
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+ switch (sctx->proto) {
+ case 1: return cmd < __BTRFS_SEND_C_MAX_V1;
+ case 2: return cmd < __BTRFS_SEND_C_MAX_V2;
+ default: return false;
+ }
+}
+
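For context, proto_cmd_ok() above only answers whether a command number is valid for the negotiated stream version. A minimal sketch of how a future v2-only command could be gated on it is shown below; send_example_v2() and BTRFS_SEND_C_EXAMPLE_V2 are hypothetical names used purely for illustration and are not part of this series.

/* Hypothetical sketch, not part of this patch set. */
static int send_example_v2(struct send_ctx *sctx)
{
	/* Refuse to emit a v2-only command when the peer negotiated v1. */
	if (!proto_cmd_ok(sctx, BTRFS_SEND_C_EXAMPLE_V2))
		return -EOPNOTSUPP;

	/* ... build and emit the command via the usual send helpers ... */
	return 0;
}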
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
if (S_ISDIR(sctx->cur_inode_mode)) {
ret = did_create_dir(sctx, sctx->cur_ino);
if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ return ret;
+ else if (ret > 0)
+ return 0;
}
- ret = send_create_inode(sctx, sctx->cur_ino);
- if (ret < 0)
- goto out;
-
-out:
- return ret;
+ return send_create_inode(sctx, sctx->cur_ino);
}
struct recorded_ref {
sctx->flags = arg->flags;
+ if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+ if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+ ret = -EPROTO;
+ goto out;
+ }
+ /* Zero means "use the highest version" */
+ sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+ } else {
+ sctx->proto = 1;
+ }
+
sctx->send_filp = fget(arg->send_fd);
if (!sctx->send_filp) {
ret = -EBADF;
enum btrfs_send_cmd {
BTRFS_SEND_C_UNSPEC,
+ /* Version 1 */
BTRFS_SEND_C_SUBVOL,
BTRFS_SEND_C_SNAPSHOT,
BTRFS_SEND_C_END,
BTRFS_SEND_C_UPDATE_EXTENT,
+ __BTRFS_SEND_C_MAX_V1,
+
+ /* Version 2 */
+ __BTRFS_SEND_C_MAX_V2,
+
+ /* End */
__BTRFS_SEND_C_MAX,
};
#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
{
struct reserve_ticket *ticket;
u64 tickets_id = space_info->tickets_id;
+ const bool aborted = BTRFS_FS_ERROR(fs_info);
trace_btrfs_fail_all_tickets(fs_info, space_info);
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (ticket->steal &&
+ if (!aborted && ticket->steal &&
steal_from_global_rsv(fs_info, space_info, ticket))
return true;
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
btrfs_info(fs_info, "failing ticket with %llu bytes",
ticket->bytes);
remove_ticket(space_info, ticket);
- ticket->error = -ENOSPC;
+ if (aborted)
+ ticket->error = -EIO;
+ else
+ ticket->error = -ENOSPC;
wake_up(&ticket->wait);
/*
* here to see if we can make progress with the next ticket in
* the list.
*/
- btrfs_try_granting_tickets(fs_info, space_info);
+ if (!aborted)
+ btrfs_try_granting_tickets(fs_info, space_info);
}
return (tickets_id != space_info->tickets_id);
}
spin_unlock(&space_info->lock);
return;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
}
} else {
flush_state = 0;
}
+
+ /* Something happened, fail everything and bail. */
+ if (BTRFS_FS_ERROR(fs_info))
+ goto aborted_fs;
+
}
spin_unlock(&space_info->lock);
}
+ return;
+
+aborted_fs:
+ maybe_fail_all_tickets(fs_info, space_info);
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
}
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
* This means a slightly higher tree locking latency.
*/
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+ unsigned int cur = 0;
+ unsigned int nr_bits;
+
+ ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+ nr_bits = PAGE_SIZE / sectorsize;
+ subpage_info->bitmap_nr_bits = nr_bits;
+
+ subpage_info->uptodate_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->error_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->dirty_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->writeback_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->ordered_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->checked_offset = cur;
+ cur += nr_bits;
+
+ subpage_info->total_nr_bits = cur;
+}
+
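As a worked example of the packing produced by btrfs_init_subpage_info() above, assume a 64K page with 4K sectors (the subpage combination the rest of this series targets); the numbers below are illustration only, not code from the patch.

/*
 * Illustration: layout for PAGE_SIZE = 64K, sectorsize = 4K.
 *
 *   bitmap_nr_bits   = 64K / 4K = 16
 *   uptodate_offset  =  0
 *   error_offset     = 16
 *   dirty_offset     = 32
 *   writeback_offset = 48
 *   ordered_offset   = 64
 *   checked_offset   = 80
 *   total_nr_bits    = 96
 */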
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
- struct btrfs_subpage *subpage = NULL;
- int ret;
+ struct btrfs_subpage *subpage;
/*
* We have cases like a dummy extent buffer page, which is not mapped
*/
if (page->mapping)
ASSERT(PageLocked(page));
+
/* Either not subpage, or the page already has private attached */
if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
return 0;
- ret = btrfs_alloc_subpage(fs_info, &subpage, type);
- if (ret < 0)
- return ret;
+ subpage = btrfs_alloc_subpage(fs_info, type);
+ if (IS_ERR(subpage))
+ return PTR_ERR(subpage);
+
attach_page_private(page, subpage);
return 0;
}
btrfs_free_subpage(subpage);
}
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type)
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type)
{
- if (fs_info->sectorsize == PAGE_SIZE)
- return 0;
+ struct btrfs_subpage *ret;
+ unsigned int real_size;
+
+ ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+ real_size = struct_size(ret, bitmaps,
+ BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+ ret = kzalloc(real_size, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
- *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
- if (!*ret)
- return -ENOMEM;
- spin_lock_init(&(*ret)->lock);
+ spin_lock_init(&ret->lock);
if (type == BTRFS_SUBPAGE_METADATA) {
- atomic_set(&(*ret)->eb_refs, 0);
+ atomic_set(&ret->eb_refs, 0);
} else {
- atomic_set(&(*ret)->readers, 0);
- atomic_set(&(*ret)->writers, 0);
+ atomic_set(&ret->readers, 0);
+ atomic_set(&ret->writers, 0);
}
- return 0;
+ return ret;
}
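With the 64K/4K layout sketched earlier, the struct_size() call above therefore allocates the fixed header plus two unsigned longs of bitmap space; a back-of-the-envelope check, for illustration only:

/*
 * Illustration: total_nr_bits = 96, so on a 64-bit machine
 *   BITS_TO_LONGS(96) = 2
 *   real_size = sizeof(struct btrfs_subpage) + 2 * sizeof(unsigned long)
 * struct_size() performs the same calculation with overflow checking.
 */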
void btrfs_free_subpage(struct btrfs_subpage *subpage)
u32 orig_len = *len;
*start = max_t(u64, page_offset(page), orig_start);
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
- orig_start + orig_len) - *start;
+ /*
+ * For certain call sites like btrfs_drop_pages(), we may have pages
+ * beyond the target range. In that case, just set @len to 0, as the
+ * subpage helpers can handle @len == 0 without any problem.
+ */
+ if (page_offset(page) >= orig_start + orig_len)
+ *len = 0;
+ else
+ *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, page, start, len);
+ /*
+ * We have call sites passing @locked_page into
+ * extent_clear_unlock_delalloc() for compression path.
+ *
+ * This @locked_page is locked by plain lock_page(), thus its
+ * subpage::writers is 0. Handle them in a special way.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ return true;
+
ASSERT(atomic_read(&subpage->writers) >= nbits);
return atomic_sub_and_test(nbits, &subpage->writers);
}
unlock_page(page);
}
-/*
- * Convert the [start, start + len) range into a u16 bitmap
- *
- * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
- */
-static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
{
- const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
- const int nbits = len >> fs_info->sectorsize_bits;
+ unsigned int found_zero;
- btrfs_subpage_assert(fs_info, page, start, len);
+ found_zero = find_next_zero_bit(addr, start + nbits, start);
+ if (found_zero == start + nbits)
+ return true;
+ return false;
+}
- /*
- * Here nbits can be 16, thus can go beyond u16 range. We make the
- * first left shift to be calculate in unsigned long (at least u32),
- * then truncate the result to u16.
- */
- return (u16)(((1UL << nbits) - 1) << bit_start);
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+ unsigned int nbits)
+{
+ unsigned int found_set;
+
+ found_set = find_next_bit(addr, start + nbits, start);
+ if (found_set == start + nbits)
+ return true;
+ return false;
}
+#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+({ \
+ unsigned int start_bit; \
+ \
+ btrfs_subpage_assert(fs_info, page, start, len); \
+ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
+ start_bit += fs_info->subpage_info->name##_offset; \
+ start_bit; \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
+ bitmap_test_range_all_set(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
+ bitmap_test_range_all_zero(subpage->bitmaps, \
+ fs_info->subpage_info->name##_offset, \
+ fs_info->subpage_info->bitmap_nr_bits)
+
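To make the bit arithmetic concrete, here is a worked example of subpage_calc_start_bit() under the 64K/4K layout from earlier; the numbers are illustrative only.

/*
 * Illustration: page at file offset 0, sectorsize = 4K, dirty_offset = 32.
 * Setting the dirty bits for start = 8K, len = 8K gives:
 *   start_bit = (8K >> 12) + dirty_offset = 2 + 32 = 34
 *   nbits     = 8K >> 12                  = 2
 * i.e. bitmap_set(subpage->bitmaps, 34, 2).
 */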
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap |= tmp;
- if (subpage->uptodate_bitmap == U16_MAX)
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
SetPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->uptodate_bitmap &= ~tmp;
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
ClearPageUptodate(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ error, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->error_bitmap &= ~tmp;
- if (subpage->error_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
ClearPageError(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
set_page_dirty(page);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ dirty, start, len);
unsigned long flags;
bool last = false;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->dirty_bitmap &= ~tmp;
- if (subpage->dirty_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
set_page_writeback(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->writeback_bitmap &= ~tmp;
- if (subpage->writeback_bitmap == 0) {
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
ASSERT(PageWriteback(page));
end_page_writeback(page);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap |= tmp;
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
SetPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
- subpage->ordered_bitmap &= ~tmp;
- if (subpage->ordered_bitmap == 0)
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
ClearPageOrdered(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ SetPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ checked, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ ClearPageChecked(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
struct page *page, u64 start, u32 len) \
{ \
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ name, start, len); \
unsigned long flags; \
bool ret; \
\
spin_lock_irqsave(&subpage->lock, flags); \
- ret = ((subpage->name##_bitmap & tmp) == tmp); \
+ ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ len >> fs_info->sectorsize_bits); \
spin_unlock_irqrestore(&subpage->lock, flags); \
return ret; \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
return;
ASSERT(PagePrivate(page) && page->private);
- ASSERT(subpage->dirty_bitmap == 0);
+ ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Handle different locked pages with different page sizes:
+ *
+ * - Page locked by plain lock_page()
+ * It should not have any subpage::writers count.
+ * Can be unlocked by unlock_page().
+ * This is the most common locked page for __extent_writepage() called
+ * inside extent_write_cache_pages() or extent_write_full_page().
+ * Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ * There is only one caller, all pages except @locked_page for
+ * extent_write_locked_range().
+ * In this case, we have to call subpage helper to handle the case.
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage;
+
+ ASSERT(PageLocked(page));
+ /* For regular page size case, we just unlock the page */
+ if (fs_info->sectorsize == PAGE_SIZE)
+ return unlock_page(page);
+
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ /*
+ * For the subpage case, there are two types of locked page: with or
+ * without a writers count.
+ *
+ * Since we own the page lock, no one else could touch subpage::writers
+ * and we are safe to do several atomic operations without spinlock.
+ */
+ if (atomic_read(&subpage->writers) == 0)
+ /* No writers, locked by plain lock_page() */
+ return unlock_page(page);
+
+ /* Have writers, use proper subpage helper to end it */
+ btrfs_page_end_writer_lock(fs_info, page, start, len);
}
#include <linux/spinlock.h>
/*
- * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
- * is sufficient. Regular bitmap_* is not used due to size reasons.
+ * Extra info for subpage bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset          /- error_offset          /- dirty_offset
+ * |                           |                        |
+ * v                           v                        v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ...           |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------------- total_nr_bits -------------------->|
*/
-#define BTRFS_SUBPAGE_BITMAP_SIZE 16
+struct btrfs_subpage_info {
+ /* Number of bits for each bitmap */
+ unsigned int bitmap_nr_bits;
+
+ /* Total number of bits for the whole bitmap */
+ unsigned int total_nr_bits;
+
+ /*
+ * *_offset indicates where each bitmap starts, the length is always
+ * @bitmap_nr_bits, which is calculated from PAGE_SIZE / sectorsize.
+ */
+ unsigned int uptodate_offset;
+ unsigned int error_offset;
+ unsigned int dirty_offset;
+ unsigned int writeback_offset;
+ unsigned int ordered_offset;
+ unsigned int checked_offset;
+};
/*
* Structure to trace status of each sector inside a page, attached to
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- u16 uptodate_bitmap;
- u16 error_bitmap;
- u16 dirty_bitmap;
- u16 writeback_bitmap;
/*
* Both data and metadata needs to track how many readers are for the
* page.
* manages whether the subpage can be detached.
*/
atomic_t eb_refs;
- /* Structures only used by data */
- struct {
- atomic_t writers;
- /* Tracke pending ordered extent in this sector */
- u16 ordered_bitmap;
- };
+ /* Structures only used by data */
+ atomic_t writers;
};
+ unsigned long bitmaps[];
};
enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);
void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page);
/* Allocate additional data where page represents more than one sector */
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- struct btrfs_subpage **ret,
- enum btrfs_subpage_type type);
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+ enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+ u64 start, u32 len);
#endif
goto error_close_devices;
}
- bdev = fs_devices->latest_bdev;
+ bdev = fs_devices->latest_dev->bdev;
s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
fs_info);
if (IS_ERR(s)) {
if (ret)
goto restore;
} else {
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
btrfs_err(fs_info,
"Remounting read-write after error is not allowed");
ret = -EINVAL;
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_device *dev, *first_dev = NULL;
/*
- * Lightweight locking of the devices. We should not need
- * device_list_mutex here as we only read the device data and the list
- * is protected by RCU. Even if a device is deleted during the list
- * traversals, we'll get valid data, the freeing callback will wait at
- * least until the rcu_read_unlock.
+ * There should always be a valid pointer in latest_dev; it may be stale
+ * for a short moment in case it's being deleted, but it is still valid
+ * until the end of the RCU grace period.
*/
rcu_read_lock();
- list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
-
- if (first_dev)
- seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
- else
- WARN_ON(1);
+ seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
rcu_read_unlock();
+
return 0;
}
} else
val = can_modify_feature(fa);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
static ssize_t rmdir_subvol_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "0\n");
+ return sysfs_emit(buf, "0\n");
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
* This "trick" only works as long as 'enum btrfs_csum_type' has
* no holes in it
*/
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+ btrfs_super_csum_name(i));
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
static ssize_t send_stream_version_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+ return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
}
BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
int i;
for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- (i ? " " : ""), rescue_opts[i]);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
BTRFS_ATTR(static_feature, supported_rescue_options,
/* 4K sector size is also supported with 64K page size */
if (PAGE_SIZE == SZ_64K)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+ ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
/* Only sectorsize == PAGE_SIZE is now supported */
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
}
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discardable_bytes));
}
BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%d\n",
+ return sysfs_emit(buf, "%d\n",
atomic_read(&fs_info->discard_ctl.discardable_extents));
}
BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_bitmap_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_bitmap_bytes);
}
BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%lld\n",
+ return sysfs_emit(buf, "%lld\n",
atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
}
BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- fs_info->discard_ctl.discard_extent_bytes);
+ return sysfs_emit(buf, "%llu\n",
+ fs_info->discard_ctl.discard_extent_bytes);
}
BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.iops_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.iops_limit));
}
static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- READ_ONCE(fs_info->discard_ctl.kbps_limit));
+ return sysfs_emit(buf, "%u\n",
+ READ_ONCE(fs_info->discard_ctl.kbps_limit));
}
static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(fs_info->discard_ctl.max_discard_size));
+ return sysfs_emit(buf, "%llu\n",
+ READ_ONCE(fs_info->discard_ctl.max_discard_size));
}
static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
val = *value_ptr;
if (lock)
spin_unlock(lock);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
static ssize_t global_rsv_size_show(struct kobject *kobj,
val += block_group->used;
}
up_read(&sinfo->groups_sem);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+ return sysfs_emit(buf, "%llu\n", val);
}
/*
ssize_t ret;
spin_lock(&fs_info->super_lock);
- ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+ ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
spin_unlock(&fs_info->super_lock);
return ret;
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
}
BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n",
- fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+ return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
}
BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
int quota_override;
quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
- return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+ return sysfs_emit(buf, "%d\n", quota_override);
}
static ssize_t quota_override_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%pU\n",
- fs_info->fs_devices->metadata_uuid);
+ return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
}
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
- return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
- btrfs_super_csum_name(csum_type),
- crypto_shash_driver_name(fs_info->csum_shash));
+ return sysfs_emit(buf, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
}
BTRFS_ATTR(, checksum, btrfs_checksum_show);
str = "UNKNOWN\n";
break;
}
- return scnprintf(buf, PAGE_SIZE, "%s", str);
+ return sysfs_emit(buf, "%s", str);
}
BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+ return sysfs_emit(buf, "%llu\n", fs_info->generation);
}
BTRFS_ATTR(, generation, btrfs_generation_show);
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
- ret = scnprintf(buf, PAGE_SIZE, "%d\n",
- READ_ONCE(fs_info->bg_reclaim_threshold));
+ ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
return ret;
}
val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
struct btrfs_device *device = container_of(kobj, struct btrfs_device,
devid_kobj);
- return scnprintf(buf, PAGE_SIZE, "%llu\n",
- READ_ONCE(device->scrub_speed_max));
+ return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
}
static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
devid_kobj);
if (!device->dev_stats_valid)
- return scnprintf(buf, PAGE_SIZE, "invalid\n");
+ return sysfs_emit(buf, "invalid\n");
/*
* Print all at once so we get a snapshot of all values from the same
* time. Keep them in sync and in order of definition of
* btrfs_dev_stat_values.
*/
- return scnprintf(buf, PAGE_SIZE,
+ return sysfs_emit(buf,
"write_errs %d\n"
"read_errs %d\n"
"flush_errs %d\n"
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, path, &key, value_len);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
*/
set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
}
set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
goto out_bits;
}
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
}
- if (end != (u64)-1) {
+ if (end != test_start + PAGE_SIZE - 1) {
test_err("did not return the proper end offset");
goto out_bits;
}
*/
set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
/* We unlocked it in the previous test */
lock_page(locked_page);
start = test_start;
- end = 0;
+ end = start + PAGE_SIZE - 1;
/*
* Currently if we fail to find dirty pages in the delalloc range we
* will adjust max_bytes down to PAGE_SIZE and then re-search. If
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, 1);
+ btrfs_setup_item_for_insert(root, &path, &key, value_len);
}
/*
spin_lock(&fs_info->trans_lock);
loop:
/* The file system has been taken offline. No new transactions. */
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
return -EROFS;
}
*/
kfree(cur_trans);
goto loop;
- } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ } else if (BTRFS_FS_ERROR(fs_info)) {
spin_unlock(&fs_info->trans_lock);
kfree(cur_trans);
return -EROFS;
bool do_chunk_alloc = false;
int ret;
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (BTRFS_FS_ERROR(fs_info))
return ERR_PTR(-EROFS);
if (current->journal_info) {
if (throttle)
btrfs_run_delayed_iputs(info);
- if (TRANS_ABORTED(trans) ||
- test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
+ if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
wake_up_process(info->transaction_kthread);
if (TRANS_ABORTED(trans))
err = trans->aborted;
* abort to prevent writing a new superblock that reflects a
* corrupt state (pointing to trees with unwritten nodes/leafs).
*/
- if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EROFS;
goto cleanup_transaction;
}
};
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
}
atomic_inc(&root->log_writers);
- if (ctx && !ctx->logging_new_name) {
+ if (!ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
return ret;
}
-/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten. If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct extent_buffer *eb, int slot,
- struct btrfs_key *key)
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
{
int ret;
u32 item_size;
item_size = btrfs_item_size_nr(eb, slot);
src_ptr = btrfs_item_ptr_offset(eb, slot);
- /* look for the key in the destination tree */
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
+ /* Our caller must have done a search for the key for us. */
+ ASSERT(path->nodes[0] != NULL);
+
+ /*
+ * And the slot must point to the exact key or the slot where the key
+ * should be at (the first item with a key greater than 'key')
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key found_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+ ret = btrfs_comp_cpu_keys(&found_key, key);
+ ASSERT(ret >= 0);
+ } else {
+ ret = 1;
+ }
if (ret == 0) {
char *src_copy;
}
/*
+ * Item overwrite used by replay and tree logging. eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten. If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *eb, int slot,
+ struct btrfs_key *key)
+{
+ int ret;
+
+ /* Look for the key in the destination tree. */
+ ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
* simple helper to read an inode off the disk from a given root
* This can only be called for subvolume roots and not for the log
*/
ins.objectid, ins.offset, 0);
btrfs_init_data_ref(&ref,
root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
if (ret)
goto out;
* item
*/
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
+ struct btrfs_root *root = dir->root;
struct inode *inode;
char *name;
int name_len;
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+ ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
name_len);
if (ret)
goto out;
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root, dir, inode,
+ ret = btrfs_unlink_inode(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
if (IS_ERR(di)) {
return PTR_ERR(di);
} else if (di) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
+ ret = drop_one_dir_item(trans, path, dir, di);
if (ret)
return ret;
}
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
return ret;
}
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static int add_link(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode, const char *name,
int namelen, u64 ref_index)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_dir_item *dir_item;
struct btrfs_key key;
struct btrfs_path *path;
ret = -ENOENT;
goto out;
}
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
name, namelen);
if (ret)
goto out;
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
- ret = btrfs_unlink_inode(trans, root,
+ ret = btrfs_unlink_inode(trans,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
goto out;
/* insert our name */
- ret = add_link(trans, root, dir, inode, name, namelen,
+ ret = add_link(trans, dir, inode, name, namelen,
ref_index);
if (ret)
goto out;
if (!exists)
goto out;
- ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
+ ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
if (ret)
goto out;
* to is unlinked
*/
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
struct inode *dir,
struct btrfs_key *dir_key)
{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
int ret;
struct extent_buffer *eb;
int slot;
}
inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+ ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
ret = btrfs_run_delayed_items(trans);
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
if (found_key.offset > range_end)
break;
- ret = check_item_in_log(trans, root, log, path,
+ ret = check_item_in_log(trans, log, path,
log_path, dir,
&found_key);
if (ret)
static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- if (!ctx)
- return;
-
mutex_lock(&root->log_mutex);
list_del_init(&ctx->list);
mutex_unlock(&root->log_mutex);
* writing the super here would result in transid mismatches. If there
* is an error here just bail.
*/
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (BTRFS_FS_ERROR(fs_info)) {
ret = -EIO;
btrfs_set_log_full_commit(trans);
btrfs_abort_transaction(trans, ret);
if (inode->logged_trans == trans->transid)
return true;
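+ /*
+ * If the root has no log tree, the inode can not have been logged in
+ * the current transaction.
+ */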
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+ return false;
+
/*
* The inode's logged_trans is always 0 when we load it (because it is
* not persisted in the inode item or elsewhere). So if it is 0, the
* This optimization allows us to avoid relogging the entire inode
* or the entire directory.
*/
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index)
{
struct btrfs_root *log;
struct btrfs_dir_item *di;
u64 dir_ino = btrfs_ino(dir);
if (!inode_logged(trans, dir))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
mutex_lock(&dir->log_mutex);
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (err == -ENOSPC) {
+ if (err < 0)
btrfs_set_log_full_commit(trans);
- err = 0;
- } else if (err < 0) {
- btrfs_abort_transaction(trans, err);
- }
-
btrfs_end_log_trans(root);
-
- return err;
}
/* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
u64 index;
int ret;
if (!inode_logged(trans, inode))
- return 0;
+ return;
ret = join_running_log_trans(root);
if (ret)
- return 0;
+ return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&inode->log_mutex);
- if (ret == -ENOSPC) {
+ if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
-
- return ret;
}
/*
return 0;
}
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct extent_buffer *src,
+ struct btrfs_path *dst_path,
+ int start_slot,
+ int count)
+{
+ char *ins_data = NULL;
+ struct btrfs_item_batch batch;
+ struct extent_buffer *dst;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+ struct btrfs_key key;
+ u32 item_size;
+ int ret;
+ int i;
+
+ ASSERT(count > 0);
+ batch.nr = count;
+
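+ /*
+ * For a single item the batch can point at stack variables; for larger
+ * batches allocate one buffer holding the sizes array followed by the
+ * keys array.
+ */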
+ if (count == 1) {
+ btrfs_item_key_to_cpu(src, &key, start_slot);
+ item_size = btrfs_item_size_nr(src, start_slot);
+ batch.keys = &key;
+ batch.data_sizes = &item_size;
+ batch.total_data_size = item_size;
+ } else {
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+
+ ins_data = kmalloc(count * sizeof(u32) +
+ count * sizeof(struct btrfs_key), GFP_NOFS);
+ if (!ins_data)
+ return -ENOMEM;
+
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+
+ for (i = 0; i < count; i++) {
+ const int slot = start_slot + i;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+ ins_sizes[i] = btrfs_item_size_nr(src, slot);
+ batch.total_data_size += ins_sizes[i];
+ }
+ }
+
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+ if (ret)
+ goto out;
+
+ dst = dst_path->nodes[0];
+ /*
+ * Copy all the items in bulk, in a single copy operation. Item data is
+ * organized such that it's placed at the end of a leaf and from right
+ * to left. For example, the data for the second item ends at an offset
+ * that matches the offset where the data for the first item starts, the
+ * data for the third item ends at an offset that matches the offset
+ * where the data of the second item starts, and so on.
+ * Therefore our source and destination start offsets for copy match the
+ * offsets of the last items (highest slots).
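+ *
+ * For example, with three items whose data sizes are 100, 50 and 20 bytes,
+ * their data occupies [end - 100, end), [end - 150, end - 100) and
+ * [end - 170, end - 150) of the leaf, so one copy of 170 bytes starting at
+ * the last item's data offset moves all three at once.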
+ */
+ dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+ src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+ copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+ btrfs_release_path(dst_path);
+out:
+ kfree(ins_data);
+
+ return ret;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ int key_type,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = inode->root->log_root;
+ struct extent_buffer *src = path->nodes[0];
+ const int nritems = btrfs_header_nritems(src);
+ const u64 ino = btrfs_ino(inode);
+ const bool inode_logged_before = inode_logged(trans, inode);
+ u64 last_logged_key_offset;
+ bool last_found = false;
+ int batch_start = 0;
+ int batch_size = 0;
+ int i;
+
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ last_logged_key_offset = inode->last_dir_item_offset;
+ else
+ last_logged_key_offset = inode->last_dir_index_offset;
+
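+ /*
+ * Walk the leaf items that belong to this directory inode: entries not
+ * yet in the log tree are accumulated into a contiguous batch and
+ * inserted in bulk with flush_dir_items_batch(), while an entry that is
+ * already in the log is overwritten in place and any batch built so far
+ * is flushed right after it.
+ */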
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ int ret;
+
+ btrfs_item_key_to_cpu(src, &key, i);
+
+ if (key.objectid != ino || key.type != key_type) {
+ last_found = true;
+ break;
+ }
+
+ ctx->last_dir_item_offset = key.offset;
+ /*
+ * We must make sure that when we log a directory entry, the
+ * corresponding inode, after log replay, has a matching link
+ * count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our file inode
+ * would have a link count of 1, but we get two directory entries
+ * pointing to the same inode. After removing one of the names,
+ * it would not be possible to remove the other name, which
+ * always resulted in stale file handle errors, and would not be
+ * possible to rmdir the parent directory, since its i_size could
+ * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+ * resulting in -ENOTEMPTY errors.
+ */
+ if (!ctx->log_new_dentries) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &di_key);
+ if ((btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ di_key.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
+ }
+
+ if (!inode_logged_before)
+ goto add_to_batch;
+
+ /*
+ * If we were logged before and have logged dir items, we can skip
+ * checking if any item with a key offset larger than the last one
+ * we logged is in the log tree, saving time and avoiding adding
+ * contention on the log tree.
+ */
+ if (key.offset > last_logged_key_offset)
+ goto add_to_batch;
+ /*
+ * Check if the key was already logged before. If not we can add
+ * it to a batch for bulk insertion.
+ */
+ ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ btrfs_release_path(dst_path);
+ goto add_to_batch;
+ }
+
+ /*
+ * Item exists in the log. Overwrite the item in the log if it
+ * has different content or do nothing if it has exactly the same
+ * content. And then flush the current batch if any - do it after
+ * overwriting the current item, or we would deadlock otherwise,
+ * since we are holding a path for the existing item.
+ */
+ ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
+ if (ret < 0)
+ return ret;
+
+ if (batch_size > 0) {
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ batch_size = 0;
+ }
+ continue;
+add_to_batch:
+ if (batch_size == 0)
+ batch_start = i;
+ batch_size++;
+ }
+
+ if (batch_size > 0) {
+ int ret;
+
+ ret = flush_dir_items_batch(trans, log, src, dst_path,
+ batch_start, batch_size);
+ if (ret < 0)
+ return ret;
+ }
+
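+ /*
+ * Return 1 when we went past the last dir item/index of this inode for
+ * the given key type, or 0 when the caller should continue with the
+ * next leaf.
+ */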
+ return last_found ? 1 : 0;
+}
+
/*
* log all the items included in the current transaction for a given
* directory. This also creates the range items in the log tree required
* to replay anything deleted before the fsync
*/
static noinline int log_dir_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
+ struct btrfs_root *root = inode->root;
struct btrfs_root *log = root->log_root;
- struct extent_buffer *src;
int err = 0;
int ret;
- int i;
- int nritems;
u64 first_offset = min_offset;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
- log = root->log_root;
-
min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
* from our directory
*/
while (1) {
- struct btrfs_key tmp;
- src = path->nodes[0];
- nritems = btrfs_header_nritems(src);
- for (i = path->slots[0]; i < nritems; i++) {
- struct btrfs_dir_item *di;
-
- btrfs_item_key_to_cpu(src, &min_key, i);
-
- if (min_key.objectid != ino || min_key.type != key_type)
- goto done;
-
- if (need_resched()) {
- btrfs_release_path(path);
- cond_resched();
- goto search;
- }
-
- ret = overwrite_item(trans, log, dst_path, src, i,
- &min_key);
- if (ret) {
+ ret = process_dir_items_leaf(trans, inode, path, dst_path,
+ key_type, ctx);
+ if (ret != 0) {
+ if (ret < 0)
err = ret;
- goto done;
- }
-
- /*
- * We must make sure that when we log a directory entry,
- * the corresponding inode, after log replay, has a
- * matching link count. For example:
- *
- * touch foo
- * mkdir mydir
- * sync
- * ln foo mydir/bar
- * xfs_io -c "fsync" mydir
- * <crash>
- * <mount fs and log replay>
- *
- * Would result in a fsync log that when replayed, our
- * file inode would have a link count of 1, but we get
- * two directory entries pointing to the same inode.
- * After removing one of the names, it would not be
- * possible to remove the other name, which resulted
- * always in stale file handle errors, and would not
- * be possible to rmdir the parent directory, since
- * its i_size could never decrement to the value
- * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
- */
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
- btrfs_dir_item_key_to_cpu(src, di, &tmp);
- if (ctx &&
- (btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- tmp.type != BTRFS_ROOT_ITEM_KEY)
- ctx->log_new_dentries = true;
+ goto done;
}
- path->slots[0] = nritems;
+ path->slots[0] = btrfs_header_nritems(path->nodes[0]);
/*
* look ahead to the next item and see if it is also
err = ret;
goto done;
}
- btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != ino || tmp.type != key_type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+ if (min_key.objectid != ino || min_key.type != key_type) {
last_offset = (u64)-1;
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
- &tmp);
+ &min_key);
if (ret)
err = ret;
else
- last_offset = tmp.offset;
+ last_offset = min_key.offset;
goto done;
}
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
}
done:
btrfs_release_path(path);
* key logged by this transaction.
*/
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
struct btrfs_log_ctx *ctx)
int ret;
int key_type = BTRFS_DIR_ITEM_KEY;
+ /*
+ * If this is the first time we are being logged in the current
+ * transaction, or we were logged before but the inode was evicted and
+ * reloaded later, in which case its logged_trans is 0, reset the values
+ * of the last logged key offsets. Note that we don't use the helper
+ * function inode_logged() here - that is because the function returns
+ * true after an inode eviction, assuming the worst case as it can not
+ * know for sure if the inode was logged before. So we can not skip key
+ * searches in the case the inode was evicted, because it may not have
+ * been logged in this transaction and may have been logged in a past
+ * transaction, so we need to reset the last dir item and index offsets
+ * to (u64)-1.
+ */
+ if (inode->logged_trans != trans->transid) {
+ inode->last_dir_item_offset = (u64)-1;
+ inode->last_dir_index_offset = (u64)-1;
+ }
again:
min_key = 0;
max_key = 0;
+ if (key_type == BTRFS_DIR_ITEM_KEY)
+ ctx->last_dir_item_offset = inode->last_dir_item_offset;
+ else
+ ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
while (1) {
- ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+ ret = log_dir_items(trans, inode, path, dst_path, key_type,
ctx, min_key, &max_key);
if (ret)
return ret;
}
if (key_type == BTRFS_DIR_ITEM_KEY) {
+ inode->last_dir_item_offset = ctx->last_dir_item_offset;
key_type = BTRFS_DIR_INDEX_KEY;
goto again;
+ } else {
+ inode->last_dir_index_offset = ctx->last_dir_item_offset;
}
return 0;
}
* This cannot be run for file data extents because it does not
* free the extents they point to.
*/
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 objectid, int max_key_type)
+ struct btrfs_inode *inode,
+ int max_key_type)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
int start_slot;
- key.objectid = objectid;
+ if (!inode_logged(trans, inode))
+ return 0;
+
+ key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- if (found_key.objectid != objectid)
+ if (found_key.objectid != key.objectid)
break;
found_key.offset = 0;
return ret;
}
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_inode *inode,
+ u64 new_size, u32 min_type)
+{
+ int ret;
+
+ do {
+ ret = btrfs_truncate_inode_items(trans, log_root, inode,
+ new_size, min_type, NULL);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
+ struct btrfs_item_batch batch;
char *ins_data;
int i;
struct list_head ordered_sums;
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+ batch.keys = ins_keys;
+ batch.data_sizes = ins_sizes;
+ batch.total_data_size = 0;
+ batch.nr = nr;
for (i = 0; i < nr; i++) {
ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+ batch.total_data_size += ins_sizes[i];
btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
}
- ret = btrfs_insert_empty_items(trans, log, dst_path,
- ins_keys, ins_sizes, nr);
+ ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
if (ret) {
kfree(ins_data);
return ret;
}
static int log_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_root *root,
+ struct btrfs_inode *inode,
const struct extent_map *em,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
{
struct btrfs_drop_extents_args drop_args = { 0 };
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct btrfs_map_token token;
if (ret)
return ret;
- drop_args.path = path;
- drop_args.start = em->start;
- drop_args.end = em->start + em->len;
- drop_args.replace_extent = true;
- drop_args.extent_item_size = sizeof(*fi);
- ret = btrfs_drop_extents(trans, log, inode, &drop_args);
- if (ret)
- return ret;
+ /*
+ * If this is the first time we are logging the inode in the current
+ * transaction, we can avoid btrfs_drop_extents(), which is expensive
+ * because it does a deletion search, which always acquires write locks
+ * for extent buffers at levels 2, 1 and 0. This not only wastes time
+ * but also adds significant contention in a log tree, since log trees
+ * are small, with a root at level 2 or 3 at most, due to their short
+ * life span.
+ */
+ if (inode_logged(trans, inode)) {
+ drop_args.path = path;
+ drop_args.start = em->start;
+ drop_args.end = em->start + em->len;
+ drop_args.replace_extent = true;
+ drop_args.extent_item_size = sizeof(*fi);
+ ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+ if (ret)
+ return ret;
+ }
if (!drop_args.extent_inserted) {
key.objectid = btrfs_ino(inode);
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
- do {
- ret = btrfs_truncate_inode_items(trans,
- root->log_root,
- inode, truncate_offset,
- BTRFS_EXTENT_DATA_KEY,
- NULL);
- } while (ret == -EAGAIN);
+ ret = truncate_inode_items(trans, root->log_root, inode,
+ truncate_offset,
+ BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
}
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_log_ctx *ctx)
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path, ctx);
+ ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
* with a journal, ext3/4, xfs, f2fs, etc).
*/
static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path)
{
+ struct btrfs_root *root = inode->root;
int ret;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
* truncate operation that changes the inode's size.
*/
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
const u64 ino = btrfs_ino(inode);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
- ret = btrfs_log_inode(trans, root,
+ ret = btrfs_log_inode(trans,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
ctx);
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, ctx);
+ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
&other_ino, &other_parent);
if (ret < 0) {
return ret;
- } else if (ret > 0 && ctx &&
+ } else if (ret > 0 &&
other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
if (ins_nr > 0) {
ins_nr++;
* This handles both files and directories.
*/
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_inode *inode,
+ struct btrfs_inode *inode,
int inode_only,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
- struct btrfs_root *log = root->log_root;
+ struct btrfs_root *log = inode->root->log_root;
int err = 0;
int ret = 0;
bool fast_search = false;
* Only run delayed items if we are a directory. We want to make sure
* all directory indexes hit the fs/subvolume tree so we can find them
* and figure out which index ranges have to be logged.
- *
- * Otherwise commit the delayed inode only if the full sync flag is set,
- * as we want to make sure an up to date version is in the subvolume
- * tree so copy_inode_items_to_log() / copy_items() can find it and copy
- * it to the log tree. For a non full sync, we always log the inode item
- * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode))
- ret = btrfs_commit_inode_delayed_items(trans, inode);
- else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- ret = btrfs_commit_inode_delayed_inode(inode);
-
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ err = btrfs_commit_inode_delayed_items(trans, inode);
+ if (err)
+ goto out;
}
if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+ ret = drop_inode_items(trans, log, path, inode, max_key_type);
} else {
- if (inode_only == LOG_INODE_EXISTS) {
+ if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- while(1) {
- ret = btrfs_truncate_inode_items(trans,
- log, inode, 0, 0, NULL);
- if (ret != -EAGAIN)
- break;
- }
+ if (inode_logged(trans, inode))
+ ret = truncate_inode_items(trans, log,
+ inode, 0, 0);
}
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags) ||
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path, ino,
- max_key.type);
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, root, inode, path);
+ err = btrfs_log_holes(trans, inode, path);
if (err)
goto out_unlock;
}
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, root, inode, path,
- dst_path);
+ err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
if (err)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx);
+ ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path,
- ctx);
+ ret = log_directory_changes(trans, inode, path, dst_path, ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
/*
- * If we are logging that an ancestor inode exists as part of logging a
- * new name from a link or rename operation, don't mark the inode as
- * logged - otherwise if an explicit fsync is made against an ancestor,
- * the fsync considers the inode in the log and doesn't sync the log,
- * resulting in the ancestor missing after a power failure unless the
- * log was synced as part of an fsync against any other unrelated inode.
- * So keep it simple for this case and just don't flag the ancestors as
- * logged.
+ * Don't update last_log_commit if we logged that an inode exists.
+ * We do this for three reasons:
+ *
+ * 1) We might have had buffered writes to this inode that were
+ * flushed and had their ordered extents completed in this
+ * transaction, but we did not previously log the inode with
+ * LOG_INODE_ALL. Later the inode was evicted and after that
+ * it was loaded again and this LOG_INODE_EXISTS log operation
+ * happened. We must make sure that if an explicit fsync against
+ * the inode is performed later, it logs the new extents, an
+ * updated inode item, etc, and syncs the log. The same logic
+ * applies to direct IO writes instead of buffered writes.
+ *
+ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+ * is logged with an i_size of 0 or whatever value was logged
+ * before. If later the i_size of the inode is increased by a
+ * truncate operation, the log is synced through an fsync of
+ * some other inode and then finally an explicit fsync against
+ * this inode is made, we must make sure this fsync logs the
+ * inode with the new i_size, the hole between old i_size and
+ * the new i_size, and syncs the log.
+ *
+ * 3) If we are logging that an ancestor inode exists as part of
+ * logging a new name from a link or rename operation, don't update
+ * its last_log_commit - otherwise if an explicit fsync is made
+ * against an ancestor, the fsync considers the inode in the log
+ * and doesn't sync the log, resulting in the ancestor missing after
+ * a power failure unless the log was synced as part of an fsync
+ * against any other unrelated inode.
*/
- if (!ctx ||
- !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
- &inode->vfs_inode != ctx->inode)) {
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- /*
- * Don't update last_log_commit if we logged that an inode exists.
- * We do this for two reasons:
- *
- * 1) We might have had buffered writes to this inode that were
- * flushed and had their ordered extents completed in this
- * transaction, but we did not previously log the inode with
- * LOG_INODE_ALL. Later the inode was evicted and after that
- * it was loaded again and this LOG_INODE_EXISTS log operation
- * happened. We must make sure that if an explicit fsync against
- * the inode is performed later, it logs the new extents, an
- * updated inode item, etc, and syncs the log. The same logic
- * applies to direct IO writes instead of buffered writes.
- *
- * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
- * is logged with an i_size of 0 or whatever value was logged
- * before. If later the i_size of the inode is increased by a
- * truncate operation, the log is synced through an fsync of
- * some other inode and then finally an explicit fsync against
- * this inode is made, we must make sure this fsync logs the
- * inode with the new i_size, the hole between old i_size and
- * the new i_size, and syncs the log.
- */
- if (inode_only != LOG_INODE_EXISTS)
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
- }
+ if (inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);
-
+out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
return err;
struct btrfs_dir_list *dir_elem;
int ret = 0;
+ /*
+ * If we are logging a new name, as part of a link or rename operation,
+ * don't bother logging new dentries, as we just want to log the names
+ * of an inode and that any new parents exist.
+ */
+ if (ctx->logging_new_name)
+ return 0;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
log_mode, ctx);
btrfs_add_delayed_iput(di_inode);
if (ret)
continue;
}
- if (ctx)
- ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
+ ctx->log_new_dentries = false;
+ ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
LOG_INODE_ALL, ctx);
- if (!ret && ctx && ctx->log_new_dentries)
+ if (!ret && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
btrfs_add_delayed_iput(dir_inode);
if (BTRFS_I(inode)->generation >= trans->transid &&
need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+ ret = btrfs_log_inode(trans, BTRFS_I(inode),
LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
if (inode->generation >= trans->transid &&
need_log_inode(trans, inode)) {
- ret = btrfs_log_inode(trans, root, inode,
+ ret = btrfs_log_inode(trans, inode,
LOG_INODE_EXISTS, ctx);
if (ret)
break;
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
+ ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
goto end_trans;
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+ if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
log_dentries = true;
/*
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_handle_fs_error(fs_info, ret,
- "Failed to pin buffers while recovering log root tree.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't find tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (ret > 0) {
log = btrfs_read_tree_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read tree log root.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
if (!ret)
goto next;
- btrfs_handle_fs_error(fs_info, ret,
- "Couldn't read target root for tree log recovery.");
+ btrfs_abort_transaction(trans, ret);
goto error;
}
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
if (ret)
/* The loop needs to continue due to the root refs */
- btrfs_handle_fs_error(fs_info, ret,
- "failed to record the log root in transaction");
+ btrfs_abort_transaction(trans, ret);
else
ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
path);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
wc.replay_dest->log_root = NULL;
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ /* Tracks the last logged dir item/index key offset. */
+ u64 last_dir_item_offset;
struct inode *inode;
struct list_head list;
/* Only used for fast fsyncs. */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
struct btrfs_log_ctx *ctx);
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *dir, u64 index);
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- const char *name, int name_len,
- struct btrfs_inode *inode, u64 dirid);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const char *name, int name_len,
+ struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
+#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map);
/*
device = NULL;
} else {
+ struct btrfs_dev_lookup_args args = {
+ .devid = devid,
+ .uuid = disk_super->dev_item.uuid,
+ };
+
mutex_lock(&fs_devices->device_list_mutex);
- device = btrfs_find_device(fs_devices, devid,
- disk_super->dev_item.uuid, NULL);
+ device = btrfs_find_device(fs_devices, &args);
/*
* If this disk has been pulled into an fs devices created by
list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
__btrfs_free_extra_devids(seed_dev, &latest_dev);
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
mutex_unlock(&uuid_mutex);
}
if (device->devid == BTRFS_DEV_REPLACE_DEVID)
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
if (device->bdev) {
return -EINVAL;
fs_devices->opened = 1;
- fs_devices->latest_bdev = latest_dev->bdev;
+ fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, true);
ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
&key, sizeof(*dev_item));
+ btrfs_trans_release_chunk_metadata(trans);
if (ret)
goto out;
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
*/
-static void update_dev_time(struct block_device *bdev)
+static void update_dev_time(const char *device_path)
{
- struct inode *inode = bdev->bd_inode;
+ struct path path;
struct timespec64 now;
+ int ret;
- /* Shouldn't happen but just in case. */
- if (!inode)
+ ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+ if (ret)
return;
- now = current_time(inode);
- generic_update_time(inode, &now, S_MTIME | S_CTIME);
+ now = current_time(d_inode(path.dentry));
+ inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+ path_put(&path);
}
static int btrfs_rm_dev_item(struct btrfs_device *device)
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret) {
if (ret > 0)
ret = -ENOENT;
}
/*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
* and replace it with the provided or the next active device. In the context
* where this function is called, there should always be another device (or
* this_dev) which is active.
(fs_info->sb->s_bdev == device->bdev))
fs_info->sb->s_bdev = next_device->bdev;
- if (fs_info->fs_devices->latest_bdev == device->bdev)
- fs_info->fs_devices->latest_bdev = next_device->bdev;
+ if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+ fs_info->fs_devices->latest_dev = next_device;
}
/*
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(bdev);
+ update_dev_time(device_path);
}
-int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid, struct block_device **bdev, fmode_t *mode)
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ struct block_device **bdev, fmode_t *mode)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
-
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
-
- if (IS_ERR(device)) {
- if (PTR_ERR(device) == -ENOENT &&
- device_path && strcmp(device_path, "missing") == 0)
+ device = btrfs_find_device(fs_info->fs_devices, args);
+ if (!device) {
+ if (args->missing)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
- ret = PTR_ERR(device);
+ ret = -ENOENT;
goto out;
}
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
/*
* In normal cases the cur_devices == fs_devices. But in case
* of deleting a seed device, the cur_devices should point to
- * its own fs_devices listed under the fs_devices->seed.
+ * its own fs_devices listed under the fs_devices->seed_list.
*/
cur_devices = device->fs_devices;
mutex_lock(&fs_devices->device_list_mutex);
synchronize_rcu();
btrfs_free_device(device);
- if (cur_devices->open_devices == 0) {
+ /*
+ * This can happen if cur_devices is the private seed devices list. We
+ * cannot call close_fs_devices() here because it expects the uuid_mutex
+ * to be held, but in fact we don't need that for the private
+ * seed_devices, we can simply decrement cur_devices->opened and then
+ * remove it from our list and free the fs_devices.
+ */
+ if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- close_fs_devices(cur_devices);
+ ASSERT(cur_devices->opened == 1);
+ cur_devices->opened--;
free_fs_devices(cur_devices);
}
out:
- mutex_unlock(&uuid_mutex);
return ret;
error_undo:
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * The update_dev_time() with in btrfs_scratch_superblocks()
- * may lead to a call to btrfs_show_devname() which will try
- * to hold device_list_mutex. And here this device
- * is already out of device list, so we don't have to hold
- * the device_list_mutex lock.
- */
btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
tgtdev->name->str);
btrfs_free_device(tgtdev);
}
-static struct btrfs_device *btrfs_find_device_by_path(
- struct btrfs_fs_info *fs_info, const char *device_path)
+/**
+ * Populate args from device at path
+ *
+ * @fs_info: the filesystem
+ * @args: the args to populate
+ * @path: the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid. This is meant to be used for ioctls that need to
+ * lookup a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in,
+ * and does some basic sanity checks. The caller must make sure that @path is
+ * properly NUL terminated before calling in, and must call
+ * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+ * uuid buffers.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path)
{
- int ret = 0;
struct btrfs_super_block *disk_super;
- u64 devid;
- u8 *dev_uuid;
struct block_device *bdev;
- struct btrfs_device *device;
+ int ret;
- ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
- fs_info->bdev_holder, 0, &bdev, &disk_super);
- if (ret)
- return ERR_PTR(ret);
+ if (!path || !path[0])
+ return -EINVAL;
+ if (!strcmp(path, "missing")) {
+ args->missing = true;
+ return 0;
+ }
- devid = btrfs_stack_device_id(&disk_super->dev_item);
- dev_uuid = disk_super->dev_item.uuid;
+ args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+ args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+ if (!args->uuid || !args->fsid) {
+ btrfs_put_dev_args_from_path(args);
+ return -ENOMEM;
+ }
+
+ ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+ &bdev, &disk_super);
+ if (ret)
+ return ret;
+ args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+ memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
if (btrfs_fs_incompat(fs_info, METADATA_UUID))
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->metadata_uuid);
+ memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
else
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- disk_super->fsid);
-
+ memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- if (!device)
- device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return device;
+ return 0;
}
/*
- * Lookup a device given by device id, or the path if the id is 0.
+ * Only use this jointly with btrfs_get_dev_args_from_path() because we will
+ * allocate our ->uuid and ->fsid pointers; everybody else uses local variables
+ * that don't need to be freed.
*/
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+ kfree(args->uuid);
+ kfree(args->fsid);
+ args->uuid = NULL;
+ args->fsid = NULL;
+}
+
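/*
 * Illustrative calling pattern for the two helpers above (a minimal sketch,
 * not part of the patch): look the device up from its path before taking any
 * locks, then free the temporary uuid/fsid buffers when done.
 *
 *	BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *	ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
 *	if (ret)
 *		return ret;
 *	... take whatever locks the operation needs ...
 *	device = btrfs_find_device(fs_info->fs_devices, &args);
 *	btrfs_put_dev_args_from_path(&args);
 *	if (!device)
 *		return -ENOENT;
 *
 * btrfs_find_device_by_devspec() below follows this same pattern.
 */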
struct btrfs_device *btrfs_find_device_by_devspec(
struct btrfs_fs_info *fs_info, u64 devid,
const char *device_path)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *device;
+ int ret;
if (devid) {
- device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
- NULL);
+ args.devid = devid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device)
return ERR_PTR(-ENOENT);
return device;
}
- if (!device_path || !device_path[0])
- return ERR_PTR(-EINVAL);
-
- if (strcmp(device_path, "missing") == 0) {
- /* Find first missing device */
- list_for_each_entry(device, &fs_info->fs_devices->devices,
- dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state) && !device->bdev)
- return device;
- }
+ ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+ if (ret)
+ return ERR_PTR(ret);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
+ btrfs_put_dev_args_from_path(&args);
+ if (!device)
return ERR_PTR(-ENOENT);
- }
-
- return btrfs_find_device_by_path(fs_info, device_path);
+ return device;
}
/*
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_path *path;
struct btrfs_key key;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- u64 devid;
int ret;
path = btrfs_alloc_path();
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
+ btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0)
goto error;
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
- devid = btrfs_device_id(leaf, dev_item);
+ args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
+ device = btrfs_find_device(fs_info->fs_devices, &args);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_abort_transaction(trans, ret);
goto error_trans;
}
+ btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+ device);
}
device->fs_devices = fs_devices;
btrfs_forget_devices(device_path);
/* Update ctime/mtime for blkid or udev */
- update_dev_time(bdev);
+ update_dev_time(device_path);
return ret;
struct btrfs_super_block *super_copy = fs_info->super_copy;
u64 old_total;
u64 diff;
+ int ret;
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return -EACCES;
&trans->transaction->dev_update_list);
mutex_unlock(&fs_info->chunk_mutex);
- return btrfs_update_device(trans, device);
+ btrfs_reserve_chunk_metadata(trans, false);
+ ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
+
+ return ret;
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
- sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
round_down(old_total - diff, fs_info->sectorsize));
mutex_unlock(&fs_info->chunk_mutex);
+ btrfs_reserve_chunk_metadata(trans, false);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
+ btrfs_trans_release_chunk_metadata(trans);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
}
/*
- * Structure used internally for __btrfs_alloc_chunk() function.
+ * Structure used internally for btrfs_create_chunk() function.
* Wraps needed parameters.
*/
struct alloc_chunk_ctl {
return block_group;
}
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ meta_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+ sys_bg = btrfs_create_chunk(trans, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
return btrfs_raid_array[index].tolerated_failures;
}
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
- int readonly = 0;
int miss_ndevs = 0;
int i;
+ bool ret = true;
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
- return 1;
+ return false;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
&map->stripes[i].dev->dev_state)) {
- readonly = 1;
+ ret = false;
goto end;
}
}
/*
- * If the number of missing devices is larger than max errors,
- * we can not write the data into that chunk successfully, so
- * set it readonly.
+ * If the number of missing devices is larger than max errors, we can
+ * not write the data into that chunk successfully.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
- readonly = 1;
+ ret = false;
end:
free_extent_map(em);
- return readonly;
+ return ret;
}
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
{
int i;
int again = 1;
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
/* Swap if parity is on a smaller index */
- if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
- swap(bbio->stripes[i], bbio->stripes[i + 1]);
- swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
+ if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+ swap(bioc->stripes[i], bioc->stripes[i + 1]);
+ swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
again = 1;
}
}
}
}
-static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ int total_stripes,
+ int real_stripes)
{
- struct btrfs_bio *bbio = kzalloc(
- /* the size of the btrfs_bio */
- sizeof(struct btrfs_bio) +
- /* plus the variable array for the stripes */
- sizeof(struct btrfs_bio_stripe) * (total_stripes) +
- /* plus the variable array for the tgt dev */
+ struct btrfs_io_context *bioc = kzalloc(
+ /* The size of btrfs_io_context */
+ sizeof(struct btrfs_io_context) +
+ /* Plus the variable array for the stripes */
+ sizeof(struct btrfs_io_stripe) * (total_stripes) +
+ /* Plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
- * plus the raid_map, which includes both the tgt dev
- * and the stripes
+ * Plus the raid_map, which includes both the tgt dev
+ * and the stripes.
*/
sizeof(u64) * (total_stripes),
GFP_NOFS|__GFP_NOFAIL);
- atomic_set(&bbio->error, 0);
- refcount_set(&bbio->refs, 1);
+ atomic_set(&bioc->error, 0);
+ refcount_set(&bioc->refs, 1);
- bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
- bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+ bioc->fs_info = fs_info;
+ bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+ bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
- return bbio;
+ return bioc;
}
-void btrfs_get_bbio(struct btrfs_bio *bbio)
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
- WARN_ON(!refcount_read(&bbio->refs));
- refcount_inc(&bbio->refs);
+ WARN_ON(!refcount_read(&bioc->refs));
+ refcount_inc(&bioc->refs);
}
-void btrfs_put_bbio(struct btrfs_bio *bbio)
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
- if (!bbio)
+ if (!bioc)
return;
- if (refcount_dec_and_test(&bbio->refs))
- kfree(bbio);
+ if (refcount_dec_and_test(&bioc->refs))
+ kfree(bioc);
}
/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
struct extent_map *em;
struct map_lookup *map;
- struct btrfs_bio *bbio;
+ struct btrfs_io_context *bioc;
u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
int ret = 0;
int i;
- /* discard always return a bbio */
- ASSERT(bbio_ret);
+ /* Discard always returns a bioc. */
+ ASSERT(bioc_ret);
em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
&stripe_index);
}
- bbio = alloc_btrfs_bio(num_stripes, 0);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical =
+ bioc->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
- bbio->stripes[i].length = stripes_per_dev *
+ bioc->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
- bbio->stripes[i].length +=
- map->stripe_len;
+ bioc->stripes[i].length += map->stripe_len;
/*
* Special for the first stripe and
 * the last stripe:
 *
 * |-------|...|-------|
 *     |----------|
 *    off     end_off
*/
if (i < sub_stripes)
- bbio->stripes[i].length -=
- stripe_offset;
+ bioc->stripes[i].length -= stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
- bbio->stripes[i].length -=
- stripe_end_offset;
+ bioc->stripes[i].length -= stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else {
- bbio->stripes[i].length = length;
+ bioc->stripes[i].length = length;
}
stripe_index++;
}
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
out:
free_extent_map(em);
return ret;
u64 srcdev_devid, int *mirror_num,
u64 *physical)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
int num_stripes;
int index_srcdev = 0;
int found = 0;
int ret = 0;
ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
- logical, &length, &bbio, 0, 0);
+ logical, &length, &bioc, 0, 0);
if (ret) {
- ASSERT(bbio == NULL);
+ ASSERT(bioc == NULL);
return ret;
}
- num_stripes = bbio->num_stripes;
+ num_stripes = bioc->num_stripes;
if (*mirror_num > num_stripes) {
/*
* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
* that means that the requested area is not left of the left
* cursor
*/
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
return -EIO;
}
* pointer to the one of the target drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid != srcdev_devid)
+ if (bioc->stripes[i].dev->devid != srcdev_devid)
continue;
/*
* mirror with the lowest physical address
*/
if (found &&
- physical_of_found <= bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
ASSERT(found);
if (!found)
}
static void handle_ops_on_dev_replace(enum btrfs_map_op op,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
struct btrfs_dev_replace *dev_replace,
u64 logical,
int *num_stripes_ret, int *max_errors_ret)
{
- struct btrfs_bio *bbio = *bbio_ret;
+ struct btrfs_io_context *bioc = *bioc_ret;
u64 srcdev_devid = dev_replace->srcdev->devid;
int tgtdev_indexes = 0;
int num_stripes = *num_stripes_ret;
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
- struct btrfs_bio_stripe *new =
- bbio->stripes + index_where_to_add;
- struct btrfs_bio_stripe *old =
- bbio->stripes + i;
+ struct btrfs_io_stripe *new =
+ bioc->stripes + index_where_to_add;
+ struct btrfs_io_stripe *old =
+ bioc->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[i] = index_where_to_add;
+ bioc->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
* full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
- if (bbio->stripes[i].dev->devid == srcdev_devid) {
+ if (bioc->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it simple,
* only add the mirror with the lowest physical
* address
*/
if (found &&
- physical_of_found <=
- bbio->stripes[i].physical)
+ physical_of_found <= bioc->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
- physical_of_found = bbio->stripes[i].physical;
+ physical_of_found = bioc->stripes[i].physical;
}
}
if (found) {
- struct btrfs_bio_stripe *tgtdev_stripe =
- bbio->stripes + num_stripes;
+ struct btrfs_io_stripe *tgtdev_stripe =
+ bioc->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
- bbio->stripes[index_srcdev].length;
+ bioc->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
- bbio->tgtdev_map[index_srcdev] = num_stripes;
+ bioc->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
*num_stripes_ret = num_stripes;
*max_errors_ret = max_errors;
- bbio->num_tgtdevs = tgtdev_indexes;
- *bbio_ret = bbio;
+ bioc->num_tgtdevs = tgtdev_indexes;
+ *bioc_ret = bioc;
}
static bool need_full_stripe(enum btrfs_map_op op)
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret,
+ struct btrfs_io_context **bioc_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
- ASSERT(bbio_ret);
+ ASSERT(bioc_ret);
ASSERT(op != BTRFS_MAP_DISCARD);
em = btrfs_get_chunk_map(fs_info, logical, *length);
tgtdev_indexes = num_stripes;
}
- bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
- if (!bbio) {
+ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+ if (!bioc) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < num_stripes; i++) {
- bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+ bioc->stripes[i].physical = map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
- bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+ bioc->stripes[i].dev = map->stripes[stripe_index].dev;
stripe_index++;
}
- /* build raid_map */
+ /* Build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
(need_full_stripe(op) || mirror_num > 1)) {
u64 tmp;
/* Fill in the logical address of each stripe */
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
- bbio->raid_map[(i+rot) % num_stripes] =
+ bioc->raid_map[(i + rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
- bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- bbio->raid_map[(i+rot+1) % num_stripes] =
+ bioc->raid_map[(i + rot + 1) % num_stripes] =
RAID6_Q_STRIPE;
- sort_parity_stripes(bbio, num_stripes);
+ sort_parity_stripes(bioc, num_stripes);
}
if (need_full_stripe(op))
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
need_full_stripe(op)) {
- handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+ handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
&num_stripes, &max_errors);
}
- *bbio_ret = bbio;
- bbio->map_type = map->type;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+ *bioc_ret = bioc;
+ bioc->map_type = map->type;
+ bioc->num_stripes = num_stripes;
+ bioc->max_errors = max_errors;
+ bioc->mirror_num = mirror_num;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
- bbio->stripes[0].dev = dev_replace->tgtdev;
- bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
- bbio->mirror_num = map->num_stripes + 1;
+ bioc->stripes[0].dev = dev_replace->tgtdev;
+ bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+ bioc->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing) {
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num)
+ struct btrfs_io_context **bioc_ret, int mirror_num)
{
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- length, bbio_ret);
+ length, bioc_ret);
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret)
+ struct btrfs_io_context **bioc_ret)
{
- return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+ return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
}
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
{
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
+ bio->bi_private = bioc->private;
+ bio->bi_end_io = bioc->end_io;
bio_endio(bio);
- btrfs_put_bbio(bbio);
+ btrfs_put_bioc(bioc);
}
static void btrfs_end_bio(struct bio *bio)
{
- struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_io_context *bioc = bio->bi_private;
int is_orig_bio = 0;
if (bio->bi_status) {
- atomic_inc(&bbio->error);
+ atomic_inc(&bioc->error);
if (bio->bi_status == BLK_STS_IOERR ||
bio->bi_status == BLK_STS_TARGET) {
- struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+ struct btrfs_device *dev = btrfs_bio(bio)->device;
ASSERT(dev->bdev);
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
}
}
- if (bio == bbio->orig_bio)
+ if (bio == bioc->orig_bio)
is_orig_bio = 1;
- btrfs_bio_counter_dec(bbio->fs_info);
+ btrfs_bio_counter_dec(bioc->fs_info);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
- bio = bbio->orig_bio;
+ bio = bioc->orig_bio;
}
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
- if (atomic_read(&bbio->error) > bbio->max_errors) {
+ if (atomic_read(&bioc->error) > bioc->max_errors) {
bio->bi_status = BLK_STS_IOERR;
} else {
/*
bio->bi_status = BLK_STS_OK;
}
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
u64 physical, struct btrfs_device *dev)
{
- struct btrfs_fs_info *fs_info = bbio->fs_info;
+ struct btrfs_fs_info *fs_info = bioc->fs_info;
- bio->bi_private = bbio;
- btrfs_io_bio(bio)->device = dev;
+ bio->bi_private = bioc;
+ btrfs_bio(bio)->device = dev;
bio->bi_end_io = btrfs_end_bio;
bio->bi_iter.bi_sector = physical >> 9;
/*
btrfsic_submit_bio(bio);
}
-static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
{
- atomic_inc(&bbio->error);
- if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ atomic_inc(&bioc->error);
+ if (atomic_dec_and_test(&bioc->stripes_pending)) {
/* Should be the original bio. */
- WARN_ON(bio != bbio->orig_bio);
+ WARN_ON(bio != bioc->orig_bio);
- btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+ btrfs_bio(bio)->mirror_num = bioc->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
- if (atomic_read(&bbio->error) > bbio->max_errors)
+ if (atomic_read(&bioc->error) > bioc->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
- btrfs_end_bbio(bbio, bio);
+ btrfs_end_bioc(bioc, bio);
}
}
int ret;
int dev_nr;
int total_devs;
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
length = bio->bi_iter.bi_size;
map_length = length;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
- &map_length, &bbio, mirror_num, 1);
+ &map_length, &bioc, mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
return errno_to_blk_status(ret);
}
- total_devs = bbio->num_stripes;
- bbio->orig_bio = first_bio;
- bbio->private = first_bio->bi_private;
- bbio->end_io = first_bio->bi_end_io;
- bbio->fs_info = fs_info;
- atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+ total_devs = bioc->num_stripes;
+ bioc->orig_bio = first_bio;
+ bioc->private = first_bio->bi_private;
+ bioc->end_io = first_bio->bi_end_io;
+ atomic_set(&bioc->stripes_pending, bioc->num_stripes);
- if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
- ret = raid56_parity_write(fs_info, bio, bbio,
- map_length);
+ ret = raid56_parity_write(bio, bioc, map_length);
} else {
- ret = raid56_parity_recover(fs_info, bio, bbio,
- map_length, mirror_num, 1);
+ ret = raid56_parity_recover(bio, bioc, map_length,
+ mirror_num, 1);
}
btrfs_bio_counter_dec(fs_info);
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
- dev = bbio->stripes[dev_nr].dev;
+ dev = bioc->stripes[dev_nr].dev;
if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
&dev->dev_state) ||
(btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
- bbio_error(bbio, first_bio, logical);
+ bioc_error(bioc, first_bio, logical);
continue;
}
else
bio = first_bio;
- submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
+ submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
}
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_fs_devices *fs_devices)
+{
+ if (args->fsid == NULL)
+ return true;
+ if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+ return true;
+ return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+ const struct btrfs_device *device)
+{
+ ASSERT((args->devid != (u64)-1) || args->missing);
+
+ if ((args->devid != (u64)-1) && device->devid != args->devid)
+ return false;
+ if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+ return false;
+ if (!args->missing)
+ return true;
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+ !device->bdev)
+ return true;
+ return false;
+}
+
/*
 * Find a device specified by the lookup args (devid, uuid, fsid or missing)
 * in the list of @fs_devices, or return NULL.
 * If both devid and uuid are specified in @args, the match must be exact,
 * otherwise only devid is used.
*/
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid)
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args)
{
struct btrfs_device *device;
struct btrfs_fs_devices *seed_devs;
- if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ if (dev_args_match_fs_devices(args, fs_devices)) {
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
+ if (dev_args_match_device(args, device))
return device;
}
}
list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
- if (!fsid ||
- !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &seed_devs->devices,
- dev_list) {
- if (device->devid == devid &&
- (!uuid || memcmp(device->uuid, uuid,
- BTRFS_UUID_SIZE) == 0))
- return device;
- }
+ if (!dev_args_match_fs_devices(args, seed_devs))
+ continue;
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ if (dev_args_match_device(args, device))
+ return device;
}
}
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+ args.devid = devid;
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
- map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
- devid, uuid, NULL);
+ args.uuid = uuid;
+ map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!map->stripes[i].dev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
free_extent_map(em);
static int read_one_dev(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u8 fs_uuid[BTRFS_FSID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
- devid = btrfs_device_id(leaf, dev_item);
+ devid = args.devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_FSID_SIZE);
+ args.uuid = dev_uuid;
+ args.fsid = fs_uuid;
if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
fs_devices = open_seed_devices(fs_info, fs_uuid);
return PTR_ERR(fs_devices);
}
- device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
- fs_uuid);
+ device = btrfs_find_device(fs_info->fs_devices, &args);
if (!device) {
if (!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_report_missing_device(fs_info, devid,
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_get_dev_stats *stats)
{
+ BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
- dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
+ args.devid = stats->devid;
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
+ struct btrfs_dev_lookup_args args = { .devid = devid };
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
}
/* Make sure no dev extent is beyond device boundary */
- dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+ dev = btrfs_find_device(fs_info->fs_devices, &args);
if (!dev) {
btrfs_err(fs_info, "failed to find devid %llu", devid);
ret = -EUCLEAN;
bool fsid_change;
struct list_head fs_list;
+ /*
+ * Number of devices under this fsid, including missing and
+ * replace-target devices and excluding seed devices.
+ */
u64 num_devices;
+
+ /*
+ * The number of devices successfully opened, including the
+ * replace-target device and excluding seed devices.
+ */
u64 open_devices;
+
+ /* The number of devices that are on the chunk allocation list. */
u64 rw_devices;
+
+ /* Count of missing devices under this fsid, excluding seed devices. */
u64 missing_devices;
u64 total_rw_bytes;
+
+ /*
+ * Count of devices from btrfs_super_block::num_devices for this fsid,
+ * which includes seed devices and excludes the transient replace-target
+ * device.
+ */
u64 total_devices;
/* Highest generation number of seen devices */
u64 latest_generation;
- struct block_device *latest_bdev;
+ /*
+ * The device used at mount time, or the device with the highest
+ * generation after a removal or replace.
+ */
+ struct btrfs_device *latest_dev;
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
/ sizeof(struct btrfs_stripe) + 1)
/*
- * we need the mirror number and stripe index to be passed around
- * the call chain while we are processing end_io (especially errors).
- * Really, what we need is a btrfs_bio structure that has this info
- * and is properly sized with its stripe array, but we're not there
- * quite yet. We have our own btrfs bioset, and all of the bios
- * we allocate are actually btrfs_io_bios. We'll cram as much of
- * struct btrfs_bio as we can into this over time.
+ * Additional info to pass along with the bio.
+ *
+ * Mostly for btrfs specific features like csum and mirror_num.
*/
-struct btrfs_io_bio {
+struct btrfs_bio {
unsigned int mirror_num;
+
+ /* @device is for stripe IO submission. */
struct btrfs_device *device;
- u64 logical;
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
struct bvec_iter iter;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
- * bytes for entire btrfs_io_bio but relies on bio being last.
+ * bytes for the entire btrfs_bio but relies on bio being last.
*/
struct bio bio;
};
-static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
{
- return container_of(bio, struct btrfs_io_bio, bio);
+ return container_of(bio, struct btrfs_bio, bio);
}
-static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
- if (io_bio->csum != io_bio->csum_inline) {
- kfree(io_bio->csum);
- io_bio->csum = NULL;
+ if (bbio->csum != bbio->csum_inline) {
+ kfree(bbio->csum);
+ bbio->csum = NULL;
}
}
-struct btrfs_bio_stripe {
+struct btrfs_io_stripe {
struct btrfs_device *dev;
u64 physical;
u64 length; /* only used for discard mappings */
};
-struct btrfs_bio {
+/*
+ * Context for IO submission to device stripes.
+ *
+ * - Track the unfinished mirrors for mirror based profiles
+ * Mirror based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info
+ * Used by submit_stripe_bio() for mapping a logical bio
+ * to a physical device address.
+ *
+ * - Contain device replace info
+ * Used by handle_ops_on_dev_replace() to copy logical bios
+ * into the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
refcount_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
* so raid_map[0] is the start of our full stripe
*/
u64 *raid_map;
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
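
As a rough usage sketch (not part of this patch; the helper name walk_stripes() is made up for illustration), a caller typically obtains a btrfs_io_context from btrfs_map_block(), walks the per-device stripes, and drops its reference with btrfs_put_bioc() when done:

static int walk_stripes(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct btrfs_io_context *bioc = NULL;
	u64 length = len;
	int i, ret;

	/* Map the logical range; GET_READ_MIRRORS returns one stripe per mirror. */
	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			      &length, &bioc, 0);
	if (ret)
		return ret;

	for (i = 0; i < bioc->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &bioc->stripes[i];

		/* stripe->dev and stripe->physical give the on-disk location. */
		pr_debug("devid %llu physical %llu\n",
			 stripe->dev->devid, stripe->physical);
	}

	/* The io_context is refcounted; release our reference. */
	btrfs_put_bioc(bioc);
	return 0;
}
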
struct btrfs_device_info {
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */
- struct btrfs_bio_stripe stripes[];
+ struct btrfs_io_stripe stripes[];
};
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
+ (sizeof(struct btrfs_io_stripe) * (n)))
struct btrfs_balance_args;
struct btrfs_balance_progress;
struct btrfs_balance_progress stat;
};
+/*
+ * Search for a given device by the set parameters
+ */
+struct btrfs_dev_lookup_args {
+ u64 devid;
+ u8 *uuid;
+ u8 *fsid;
+ bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+ struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
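
A minimal sketch of how the lookup args are meant to be used (hypothetical helper, not part of this patch): initialize with the macro so devid starts at -1, fill in only the fields you want to match, and hand the args to btrfs_find_device():

static struct btrfs_device *find_by_devid_or_missing(struct btrfs_fs_info *fs_info,
						     u64 devid)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;

	/* Match by devid only; uuid/fsid stay NULL and are ignored. */
	args.devid = devid;
	device = btrfs_find_device(fs_info->fs_devices, &args);
	if (device)
		return device;

	/* Fall back to any device recorded in the metadata but not present. */
	args.devid = (u64)-1;
	args.missing = true;
	return btrfs_find_device(fs_info->fs_devices, &args);
}
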
enum btrfs_map_op {
BTRFS_MAP_READ,
BTRFS_MAP_WRITE,
}
}
-void btrfs_get_bbio(struct btrfs_bio *bbio);
-void btrfs_put_bbio(struct btrfs_bio *bbio);
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret, int mirror_num);
+ struct btrfs_io_context **bioc_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
- struct btrfs_bio **bbio_ret);
+ struct btrfs_io_context **bioc_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
enum btrfs_map_op op, u64 logical,
struct btrfs_io_geometry *io_geom);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
u64 devid,
const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+ struct btrfs_dev_lookup_args *args,
+ const char *path);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
void btrfs_free_device(struct btrfs_device *device);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
- const char *device_path, u64 devid,
+ struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode);
void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
- u64 devid, u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+ const struct btrfs_dev_lookup_args *args);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
int btrfs_balance(struct btrfs_fs_info *fs_info,
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
 * matches our target xattr, so let's check.
*/
ret = 0;
- btrfs_assert_tree_locked(path->nodes[0]);
+ btrfs_assert_tree_write_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
+#include <linux/atomic.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#define BTRFS_NR_SB_LOG_ZONES 2
/*
+ * Minimum number of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
+
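
For orientation: assuming BTRFS_SUPER_MIRROR_MAX is 3 (its value in current btrfs, not restated in this patch), the minimum works out to 8 active zones. A compile-time sketch of that arithmetic:

/* 3 superblock mirror zones + 3 (SYSTEM/META/DATA) + 1 tree-log + 1 relocation */
static_assert(BTRFS_MIN_ACTIVE_ZONES == 3 + 3 + 1 + 1);
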
+/*
* Maximum supported zone size. Currently, SMR disks have a zone size of
* 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
* expect the zone size to become larger than 8GiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
+#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+ return (zone->cond == BLK_ZONE_COND_FULL) ||
+ (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
struct blk_zone *zones = data;
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
+ int i;
- ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
- zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
-
- empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
- empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
- full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
- full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+ empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+ full[i] = sb_zone_is_full(&zones[i]);
+ }
/*
* Possible states of log buffer zones
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_zoned_device_info *zone_info = NULL;
struct block_device *bdev = device->bdev;
+ struct request_queue *queue = bdev_get_queue(bdev);
+ unsigned int max_active_zones;
+ unsigned int nactive;
sector_t nr_sectors;
sector_t sector = 0;
struct blk_zone *zones = NULL;
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
+ max_active_zones = queue_max_active_zones(queue);
+ if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+ btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+ rcu_str_deref(device->name), max_active_zones,
+ BTRFS_MIN_ACTIVE_ZONES);
+ ret = -EINVAL;
+ goto out;
+ }
+ zone_info->max_active_zones = max_active_zones;
+
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
goto out;
}
+ zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+ if (!zone_info->active_zones) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
if (!zones) {
ret = -ENOMEM;
}
 /* Get zone types */
+ nactive = 0;
while (sector < nr_sectors) {
nr_zones = BTRFS_REPORT_NR_ZONES;
ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
for (i = 0; i < nr_zones; i++) {
if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
__set_bit(nreported, zone_info->seq_zones);
- if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+ switch (zones[i].cond) {
+ case BLK_ZONE_COND_EMPTY:
__set_bit(nreported, zone_info->empty_zones);
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ __set_bit(nreported, zone_info->active_zones);
+ nactive++;
+ break;
+ }
nreported++;
}
sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
goto out;
}
+ if (max_active_zones) {
+ if (nactive > max_active_zones) {
+ btrfs_err_in_rcu(device->fs_info,
+ "zoned: %u active zones on %s exceeds max_active_zones %u",
+ nactive, rcu_str_deref(device->name),
+ max_active_zones);
+ ret = -EIO;
+ goto out;
+ }
+ atomic_set(&zone_info->active_zones_left,
+ max_active_zones - nactive);
+ }
+
/* Validate superblock log */
nr_zones = BTRFS_NR_SB_LOG_ZONES;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
out:
kfree(zones);
out_free_zone_info:
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->empty_zones);
bitmap_free(zone_info->seq_zones);
kfree(zone_info);
if (!zone_info)
return;
+ bitmap_free(zone_info->active_zones);
bitmap_free(zone_info->seq_zones);
bitmap_free(zone_info->empty_zones);
kfree(zone_info);
/*
* stripe_size is always aligned to BTRFS_STRIPE_LEN in
- * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+ * btrfs_create_chunk(). Since we want stripe_len == zone_size,
* check the alignment here.
*/
if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
- ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+ ASSERT(sb_zone_is_full(reset));
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
reset->start, reset->len,
reset->wp = reset->start;
}
} else if (ret != -ENOENT) {
- /* For READ, we want the precious one */
+ /*
+ * For READ, we want the previous one. Move write pointer to
+ * the end of a zone, if it is at the head of a zone.
+ */
+ u64 zone_end = 0;
+
if (wp == zones[0].start << SECTOR_SHIFT)
- wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+ zone_end = zones[1].start + zones[1].capacity;
+ else if (wp == zones[1].start << SECTOR_SHIFT)
+ zone_end = zones[0].start + zones[0].capacity;
+ if (zone_end)
+ wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+ BTRFS_SUPER_INFO_SIZE);
+
wp -= BTRFS_SUPER_INFO_SIZE;
}
return true;
}
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
struct btrfs_zoned_device_info *zinfo = device->zone_info;
struct blk_zone *zone;
+ int i;
if (!is_sb_log_zone(zinfo, mirror))
- return;
+ return 0;
zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
- if (zone->cond != BLK_ZONE_COND_FULL) {
+ for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+ /* Advance to the next zone */
+ if (zone->cond == BLK_ZONE_COND_FULL) {
+ zone++;
+ continue;
+ }
+
if (zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+ zone->wp += SUPER_INFO_SECTORS;
+
+ if (sb_zone_is_full(zone)) {
+ /*
+ * No room left to write a new superblock. Since the
+ * superblock is written with REQ_SYNC, it is safe to
+ * finish the zone now.
+ *
+ * If the write pointer is exactly at the capacity,
+ * explicit ZONE_FINISH is not necessary.
+ */
+ if (zone->wp != zone->start + zone->capacity) {
+ int ret;
+
+ ret = blkdev_zone_mgmt(device->bdev,
+ REQ_OP_ZONE_FINISH, zone->start,
+ zone->len, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
- if (zone->wp == zone->start + zone->len)
+ zone->wp = zone->start + zone->len;
zone->cond = BLK_ZONE_COND_FULL;
-
- return;
+ }
+ return 0;
}
- zone++;
- ASSERT(zone->cond != BLK_ZONE_COND_FULL);
- if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->cond = BLK_ZONE_COND_IMP_OPEN;
-
- zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
-
- if (zone->wp == zone->start + zone->len)
- zone->cond = BLK_ZONE_COND_FULL;
+ /* All the zones are FULL. Should not reach here. */
+ ASSERT(0);
+ return -EIO;
}
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
return pos;
}
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return true;
+
+ if (!test_bit(zno, zone_info->active_zones)) {
+ /* Active zone left? */
+ if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+ return false;
+ if (test_and_set_bit(zno, zone_info->active_zones)) {
+ /* Someone already set the bit */
+ atomic_inc(&zone_info->active_zones_left);
+ }
+ }
+
+ return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+ struct btrfs_zoned_device_info *zone_info = device->zone_info;
+ unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+ /* We can use any number of zones */
+ if (zone_info->max_active_zones == 0)
+ return;
+
+ if (test_and_clear_bit(zno, zone_info->active_zones))
+ atomic_inc(&zone_info->active_zones_left);
+}
+
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
*bytes = length;
while (length) {
btrfs_dev_set_zone_empty(device, physical);
+ btrfs_dev_clear_active_zone(device, physical);
physical += device->zone_info->zone_size;
length -= device->zone_info->zone_size;
}
int i;
unsigned int nofs_flag;
u64 *alloc_offsets = NULL;
+ u64 *caps = NULL;
+ unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
map = em->map_lookup;
+ cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ if (!cache->physical_map) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
if (!alloc_offsets) {
- free_extent_map(em);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+ if (!caps) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+ if (!active) {
+ ret = -ENOMEM;
+ goto out;
}
for (i = 0; i < map->num_stripes; i++) {
goto out;
}
+ caps[i] = (zone.capacity << SECTOR_SHIFT);
+
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
alloc_offsets[i] = 0;
break;
case BLK_ZONE_COND_FULL:
- alloc_offsets[i] = fs_info->zone_size;
+ alloc_offsets[i] = caps[i];
break;
default:
/* Partially used zone */
alloc_offsets[i] =
((zone.wp - zone.start) << SECTOR_SHIFT);
+ __set_bit(i, active);
break;
}
+
+ /*
+ * Consider a zone as active if the device allows an unlimited
+ * number of active zones (max_active_zones == 0).
+ */
+ if (!device->zone_info->max_active_zones)
+ __set_bit(i, active);
}
if (num_sequential > 0)
* calculate_alloc_pointer() which takes extent buffer
* locks to avoid deadlock.
*/
+
+ /* Zone capacity is always the zone size in emulation mode */
+ cache->zone_capacity = cache->length;
if (new) {
cache->alloc_offset = 0;
goto out;
goto out;
}
cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = caps[0];
+ cache->zone_is_active = test_bit(0, active);
break;
case BTRFS_BLOCK_GROUP_DUP:
case BTRFS_BLOCK_GROUP_RAID1:
goto out;
}
+ if (cache->zone_is_active) {
+ btrfs_get_block_group(cache);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
out:
if (cache->alloc_offset > fs_info->zone_size) {
btrfs_err(fs_info,
ret = -EIO;
}
+ if (cache->alloc_offset > cache->zone_capacity) {
+ btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+ cache->alloc_offset, cache->zone_capacity,
+ cache->start);
+ ret = -EIO;
+ }
+
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
btrfs_err(fs_info,
if (!ret)
cache->meta_write_pointer = cache->alloc_offset + cache->start;
+ if (ret) {
+ kfree(cache->physical_map);
+ cache->physical_map = NULL;
+ }
+ bitmap_free(active);
+ kfree(caps);
kfree(alloc_offsets);
free_extent_map(em);
return;
WARN_ON(cache->bytes_super != 0);
- unusable = cache->alloc_offset - cache->used;
- free = cache->length - cache->alloc_offset;
+ unusable = (cache->alloc_offset - cache->used) +
+ (cache->length - cache->zone_capacity);
+ free = cache->zone_capacity - cache->alloc_offset;
/* We only need ->free_space in ALLOC_SEQ block groups */
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
cache->free_space_ctl->free_space = free;
cache->zone_unusable = unusable;
-
- /* Should not have any excluded extents. Just in case, though */
- btrfs_free_excluded_extents(cache);
}
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
if (!is_data_inode(&inode->vfs_inode))
return false;
+ /*
+ * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
+ * extent layout that the relocation code relies on.
+ * Furthermore, we have set aside our own block group from which only the
+ * relocation "process" can allocate, and we make sure only one process at
+ * a time can add pages to an extent that gets relocated, so it's safe to
+ * use regular REQ_OP_WRITE for this special case.
+ */
+ if (btrfs_is_data_reloc_root(inode->root))
+ return false;
+
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
struct blk_zone *zone)
{
- struct btrfs_bio *bbio = NULL;
+ struct btrfs_io_context *bioc = NULL;
u64 mapped_length = PAGE_SIZE;
unsigned int nofs_flag;
int nmirrors;
int i, ret;
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio);
- if (ret || !bbio || mapped_length < PAGE_SIZE) {
- btrfs_put_bbio(bbio);
+ &mapped_length, &bioc);
+ if (ret || !bioc || mapped_length < PAGE_SIZE) {
+ btrfs_put_bioc(bioc);
return -EIO;
}
- if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
return -EINVAL;
nofs_flag = memalloc_nofs_save();
- nmirrors = (int)bbio->num_stripes;
+ nmirrors = (int)bioc->num_stripes;
for (i = 0; i < nmirrors; i++) {
- u64 physical = bbio->stripes[i].physical;
- struct btrfs_device *dev = bbio->stripes[i].dev;
+ u64 physical = bioc->stripes[i].physical;
+ struct btrfs_device *dev = bioc->stripes[i].dev;
/* Missing device */
if (!dev->bdev)
return device;
}
+
+/**
+ * Activate block group and underlying device zones
+ *
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ bool ret;
+
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return true;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return true;
+
+ spin_lock(&block_group->lock);
+
+ if (block_group->zone_is_active) {
+ ret = true;
+ goto out_unlock;
+ }
+
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
+
+ spin_unlock(&block_group->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(list_empty(&block_group->active_bg_list));
+ list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ return true;
+
+out_unlock:
+ spin_unlock(&block_group->lock);
+ return ret;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+ int ret = 0;
+
+ if (!btrfs_is_zoned(fs_info))
+ return 0;
+
+ map = block_group->physical_map;
+ /* Currently support SINGLE profile only */
+ ASSERT(map->num_stripes == 1);
+
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (device->zone_info->max_active_zones == 0)
+ return 0;
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+
+ /* Check if we have unwritten allocated space */
+ if ((block_group->flags &
+ (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+ block_group->alloc_offset > block_group->meta_write_pointer) {
+ spin_unlock(&block_group->lock);
+ return -EAGAIN;
+ }
+ spin_unlock(&block_group->lock);
+
+ ret = btrfs_inc_block_group_ro(block_group, false);
+ if (ret)
+ return ret;
+
+ /* Ensure all writes in this block group finish */
+ btrfs_wait_block_group_reservations(block_group);
+ /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
+ btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+ block_group->length);
+
+ spin_lock(&block_group->lock);
+
+ /*
+ * Bail out if someone already deactivated the block group, or
+ * allocated space is left in the block group.
+ */
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return 0;
+ }
+
+ if (block_group->reserved) {
+ spin_unlock(&block_group->lock);
+ btrfs_dec_block_group_ro(block_group);
+ return -EAGAIN;
+ }
+
+ block_group->zone_is_active = 0;
+ block_group->alloc_offset = block_group->zone_capacity;
+ block_group->free_space_ctl->free_space = 0;
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
+ btrfs_dec_block_group_ro(block_group);
+
+ if (!ret) {
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+ }
+
+ return ret;
+}
+
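
Taken together, a hypothetical caller (not part of this patch) would pair the two helpers like this: activate a block group before allocating from it, and finish it once it is fully written so the device's active-zone budget is released:

static int use_zoned_bg(struct btrfs_block_group *bg)
{
	/* Fails when the device has no active-zone slots left. */
	if (!btrfs_zone_activate(bg))
		return -ENOSPC;

	/* ... allocate from and write to the block group ... */

	/* May return -EAGAIN while reserved or unwritten space remains. */
	return btrfs_zone_finish(bg);
}
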
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
+{
+ struct btrfs_device *device;
+ bool ret = false;
+
+ if (!btrfs_is_zoned(fs_devices->fs_info))
+ return true;
+
+ /* Non-single profiles are not supported yet */
+ if (raid_index != BTRFS_RAID_SINGLE)
+ return false;
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+ if (!device->bdev)
+ continue;
+
+ if (!zinfo->max_active_zones ||
+ atomic_read(&zinfo->active_zones_left)) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+ struct btrfs_block_group *block_group;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ ASSERT(block_group);
+
+ if (logical + length < block_group->start + block_group->zone_capacity)
+ goto out;
+
+ spin_lock(&block_group->lock);
+
+ if (!block_group->zone_is_active) {
+ spin_unlock(&block_group->lock);
+ goto out;
+ }
+
+ block_group->zone_is_active = 0;
+ /* We should have consumed all the free space */
+ ASSERT(block_group->alloc_offset == block_group->zone_capacity);
+ ASSERT(block_group->free_space_ctl->free_space == 0);
+ btrfs_clear_treelog_bg(block_group);
+ spin_unlock(&block_group->lock);
+
+ map = block_group->physical_map;
+ device = map->stripes[0].dev;
+ physical = map->stripes[0].physical;
+
+ if (!device->zone_info->max_active_zones)
+ goto out;
+
+ btrfs_dev_clear_active_zone(device, physical);
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ btrfs_put_block_group(block_group);
+
+out:
+ btrfs_put_block_group(block_group);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg == bg->start)
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
u64 zone_size;
u8 zone_size_shift;
u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
unsigned long *seq_zones;
unsigned long *empty_zones;
+ unsigned long *active_zones;
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
};
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
u64 *bytenr_ret);
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
u64 hole_end, u64 num_bytes);
u64 physical_start, u64 physical_pos);
struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
return 0;
}
-static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
-{ }
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+ return 0;
+}
static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
return ERR_PTR(-EOPNOTSUPP);
}
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+ return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+ return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+ int raid_index)
+{
+ return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
 * This does the actual work of updating an inode's time or version. Must have
 * called mnt_want_write() before calling this.
*/
-static int update_time(struct inode *inode, struct timespec64 *time, int flags)
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
{
if (inode->i_op->update_time)
return inode->i_op->update_time(inode, time, flags);
return generic_update_time(inode, time, flags);
}
+EXPORT_SYMBOL(inode_update_time);
/**
* atime_needs_update - update the access time
* of the fs read only, e.g. subvolumes in Btrfs.
*/
now = current_time(inode);
- update_time(inode, &now, S_ATIME);
+ inode_update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
sb_end_write(inode->i_sb);
if (__mnt_want_write_file(file))
return 0;
- ret = update_time(inode, &now, sync_it);
+ ret = inode_update_time(inode, &now, sync_it);
__mnt_drop_write_file(file);
return ret;
extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
+
static inline void file_accessed(struct file *file)
{
if (!(file->f_flags & O_NOATIME))
*/
#define BTRFS_SEND_FLAG_OMIT_END_CMD 0x4
+/*
+ * Read the protocol version from the structure's version field
+ */
+#define BTRFS_SEND_FLAG_VERSION 0x8
+
#define BTRFS_SEND_FLAG_MASK \
(BTRFS_SEND_FLAG_NO_FILE_DATA | \
BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \
- BTRFS_SEND_FLAG_OMIT_END_CMD)
+ BTRFS_SEND_FLAG_OMIT_END_CMD | \
+ BTRFS_SEND_FLAG_VERSION)
struct btrfs_ioctl_send_args {
__s64 send_fd; /* in */
__u64 __user *clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
- __u64 reserved[4]; /* in */
+ __u32 version; /* in */
+ __u8 reserved[28]; /* in */
};
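
A userspace sketch of the new flag (hypothetical, not from this patch; assumes the uapi header linux/btrfs.h and the existing BTRFS_IOC_SEND ioctl): set BTRFS_SEND_FLAG_VERSION in flags so the kernel honors the version field, which now occupies the first four of the formerly reserved bytes:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int start_send(int subvol_fd, int pipe_fd, __u32 proto_version)
{
	struct btrfs_ioctl_send_args args;

	memset(&args, 0, sizeof(args));
	args.send_fd = pipe_fd;			/* where the send stream is written */
	args.flags = BTRFS_SEND_FLAG_VERSION;	/* honor args.version below */
	args.version = proto_version;

	return ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
}
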
/*