Merge tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 1 Nov 2021 19:48:25 +0000 (12:48 -0700)
Pull btrfs updates from David Sterba:
 "The updates this time are more under the hood and enhancing existing
  features (subpage with compression and zoned namespaces).

  Performance related:

   - misc small inode logging improvements (+3% throughput, -11% latency
     on sample dbench workload)

   - more efficient directory logging: bulk item insertion, fewer tree
     searches and less locking

   - speed up bulk insertion of items into a b-tree, which is used when
     logging directories, when running delayed items for directories
     (fsync and transaction commits) and when running the slow path
     (full sync) of an fsync (bulk creation run time -4%, deletion -12%)

  Core:

   - continued subpage support
      - make defragmentation work
      - make compression write work

   - zoned mode
      - support ZNS (zoned namespaces); zone capacity is the number of
        usable blocks in each zone
      - add a dedicated block group (zoned) for relocation, to prevent
        out-of-order writes in some cases
      - greedy block group reclaim, picking the ones with the least
        usable space first

   - preparatory work for send protocol updates

   - error handling improvements

   - cleanups and refactoring

  Fixes:

   - lockdep warnings
      - in show_devname callback, on seeding device
      - device delete on loop device due to conversions to workqueues

   - fix deadlock between chunk allocation and chunk btree modifications

   - fix tracking of missing device count and status"

* tag 'for-5.16-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (140 commits)
  btrfs: remove root argument from check_item_in_log()
  btrfs: remove root argument from add_link()
  btrfs: remove root argument from btrfs_unlink_inode()
  btrfs: remove root argument from drop_one_dir_item()
  btrfs: clear MISSING device status bit in btrfs_close_one_device
  btrfs: call btrfs_check_rw_degradable only if there is a missing device
  btrfs: send: prepare for v2 protocol
  btrfs: fix comment about sector sizes supported in 64K systems
  btrfs: update device path inode time instead of bd_inode
  fs: export an inode_update_time helper
  btrfs: fix deadlock when defragging transparent huge pages
  btrfs: sysfs: convert scnprintf and snprintf to sysfs_emit
  btrfs: make btrfs_super_block size match BTRFS_SUPER_INFO_SIZE
  btrfs: update comments for chunk allocation -ENOSPC cases
  btrfs: fix deadlock between chunk allocation and chunk btree modifications
  btrfs: zoned: use greedy gc for auto reclaim
  btrfs: check-integrity: stop storing the block device name in btrfsic_dev_state
  btrfs: use btrfs_get_dev_args_from_path in dev removal ioctls
  btrfs: add a btrfs_get_dev_args_from_path helper
  btrfs: handle device lookup with btrfs_dev_lookup_args
  ...

53 files changed:
fs/btrfs/block-group.c
fs/btrfs/block-group.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/compression.c
fs/btrfs/compression.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/dev-replace.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.h
fs/btrfs/lzo.c
fs/btrfs/raid56.c
fs/btrfs/raid56.h
fs/btrfs/reada.c
fs/btrfs/ref-verify.c
fs/btrfs/reflink.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/send.h
fs/btrfs/space-info.c
fs/btrfs/subpage.c
fs/btrfs/subpage.h
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/tests/extent-buffer-tests.c
fs/btrfs/tests/extent-io-tests.c
fs/btrfs/tests/inode-tests.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/tree-log.h
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
fs/btrfs/zoned.c
fs/btrfs/zoned.h
fs/inode.c
include/linux/fs.h
include/uapi/linux/btrfs.h

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a3b830b..444e9c8 100644
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <linux/list_sort.h>
 #include "misc.h"
 #include "ctree.h"
 #include "block-group.h"
@@ -144,6 +145,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
                 */
                WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
                kfree(cache->free_space_ctl);
+               kfree(cache->physical_map);
                kfree(cache);
        }
 }
@@ -902,6 +904,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        spin_unlock(&cluster->refill_lock);
 
        btrfs_clear_treelog_bg(block_group);
+       btrfs_clear_data_reloc_bg(block_group);
 
        path = btrfs_alloc_path();
        if (!path) {
@@ -1484,6 +1487,21 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
        spin_unlock(&fs_info->unused_bgs_lock);
 }
 
+/*
+ * We want block groups with a low number of used bytes to be in the beginning
+ * of the list, so they will get reclaimed first.
+ */
+static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
+                          const struct list_head *b)
+{
+       const struct btrfs_block_group *bg1, *bg2;
+
+       bg1 = list_entry(a, struct btrfs_block_group, bg_list);
+       bg2 = list_entry(b, struct btrfs_block_group, bg_list);
+
+       return bg1->used > bg2->used;
+}
+
 void btrfs_reclaim_bgs_work(struct work_struct *work)
 {
        struct btrfs_fs_info *fs_info =
@@ -1508,6 +1526,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
        }
 
        spin_lock(&fs_info->unused_bgs_lock);
+       /*
+        * Sort happens under lock because we can't simply splice it and sort.
+        * The block groups might still be in use and reachable via bg_list,
+        * and their presence in the reclaim_bgs list must be preserved.
+        */
+       list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
        while (!list_empty(&fs_info->reclaim_bgs)) {
                u64 zone_unusable;
                int ret = 0;
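
  The comparator above only has to tell list_sort() whether two entries are out
  of order, so returning "bg1->used > bg2->used" is enough to order the reclaim
  list by used bytes, ascending. A minimal, self-contained userspace sketch of
  the same ordering (made-up block groups; qsort() stands in for the kernel's
  list_sort() and, unlike it, needs the full three-way comparison result):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical block group: identifier and used bytes. */
    struct bg {
            unsigned long long start;
            unsigned long long used;
    };

    /* Ascending by used bytes, the ordering reclaim_bgs_cmp() establishes. */
    static int cmp_used(const void *a, const void *b)
    {
            const struct bg *bg1 = a;
            const struct bg *bg2 = b;

            if (bg1->used < bg2->used)
                    return -1;
            return bg1->used > bg2->used;
    }

    int main(void)
    {
            struct bg bgs[] = {
                    { .start = 1, .used = 900ULL << 20 },
                    { .start = 2, .used =  10ULL << 20 },
                    { .start = 3, .used = 300ULL << 20 },
            };
            size_t i;

            qsort(bgs, sizeof(bgs) / sizeof(bgs[0]), sizeof(bgs[0]), cmp_used);

            /* The emptiest block groups come first and get reclaimed first. */
            for (i = 0; i < sizeof(bgs) / sizeof(bgs[0]); i++)
                    printf("bg %llu: used %llu MiB\n",
                           bgs[i].start, bgs[i].used >> 20);
            return 0;
    }
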
@@ -1895,6 +1919,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
        INIT_LIST_HEAD(&cache->discard_list);
        INIT_LIST_HEAD(&cache->dirty_list);
        INIT_LIST_HEAD(&cache->io_list);
+       INIT_LIST_HEAD(&cache->active_bg_list);
        btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
        atomic_set(&cache->frozen, 0);
        mutex_init(&cache->free_space_lock);
@@ -2035,6 +2060,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
         */
        if (btrfs_is_zoned(info)) {
                btrfs_calc_zone_unusable(cache);
+               /* Should not have any excluded extents. Just in case, though. */
+               btrfs_free_excluded_extents(cache);
        } else if (cache->length == cache->used) {
                cache->last_byte_to_unpin = (u64)-1;
                cache->cached = BTRFS_CACHE_FINISHED;
@@ -2062,15 +2089,18 @@ static int read_one_block_group(struct btrfs_fs_info *info,
        link_block_group(cache);
 
        set_avail_alloc_bits(info, cache->flags);
-       if (btrfs_chunk_readonly(info, cache->start)) {
+       if (btrfs_chunk_writeable(info, cache->start)) {
+               if (cache->used == 0) {
+                       ASSERT(list_empty(&cache->bg_list));
+                       if (btrfs_test_opt(info, DISCARD_ASYNC))
+                               btrfs_discard_queue_work(&info->discard_ctl, cache);
+                       else
+                               btrfs_mark_bg_unused(cache);
+               }
+       } else {
                inc_block_group_ro(cache, 1);
-       } else if (cache->used == 0) {
-               ASSERT(list_empty(&cache->bg_list));
-               if (btrfs_test_opt(info, DISCARD_ASYNC))
-                       btrfs_discard_queue_work(&info->discard_ctl, cache);
-               else
-                       btrfs_mark_bg_unused(cache);
        }
+
        return 0;
 error:
        btrfs_put_block_group(cache);
@@ -2438,6 +2468,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
                return ERR_PTR(ret);
        }
 
+       /*
+        * New block group is likely to be used soon. Try to activate it now.
+        * Failure is OK for now.
+        */
+       btrfs_zone_activate(cache);
+
        ret = exclude_super_stripes(cache);
        if (ret) {
                /* We may have excluded something, so call this just in case */
@@ -2479,7 +2515,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
         */
        trace_btrfs_add_block_group(fs_info, cache, 1);
        btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
-                               cache->bytes_super, 0, &cache->space_info);
+                               cache->bytes_super, cache->zone_unusable,
+                               &cache->space_info);
        btrfs_update_global_block_rsv(fs_info);
 
        link_block_group(cache);
@@ -2594,7 +2631,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
        if (!--cache->ro) {
                if (btrfs_is_zoned(cache->fs_info)) {
                        /* Migrate zone_unusable bytes back */
-                       cache->zone_unusable = cache->alloc_offset - cache->used;
+                       cache->zone_unusable =
+                               (cache->alloc_offset - cache->used) +
+                               (cache->length - cache->zone_capacity);
                        sinfo->bytes_zone_unusable += cache->zone_unusable;
                        sinfo->bytes_readonly -= cache->zone_unusable;
                }
@@ -3143,7 +3182,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 }
 
 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
-                            u64 bytenr, u64 num_bytes, int alloc)
+                            u64 bytenr, u64 num_bytes, bool alloc)
 {
        struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_block_group *cache = NULL;
@@ -3380,36 +3419,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
         */
        check_system_chunk(trans, flags);
 
-       bg = btrfs_alloc_chunk(trans, flags);
+       bg = btrfs_create_chunk(trans, flags);
        if (IS_ERR(bg)) {
                ret = PTR_ERR(bg);
                goto out;
        }
 
-       /*
-        * If this is a system chunk allocation then stop right here and do not
-        * add the chunk item to the chunk btree. This is to prevent a deadlock
-        * because this system chunk allocation can be triggered while COWing
-        * some extent buffer of the chunk btree and while holding a lock on a
-        * parent extent buffer, in which case attempting to insert the chunk
-        * item (or update the device item) would result in a deadlock on that
-        * parent extent buffer. In this case defer the chunk btree updates to
-        * the second phase of chunk allocation and keep our reservation until
-        * the second phase completes.
-        *
-        * This is a rare case and can only be triggered by the very few cases
-        * we have where we need to touch the chunk btree outside chunk allocation
-        * and chunk removal. These cases are basically adding a device, removing
-        * a device or resizing a device.
-        */
-       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               return 0;
-
        ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
        /*
         * Normally we are not expected to fail with -ENOSPC here, since we have
         * previously reserved space in the system space_info and allocated one
-        * new system chunk if necessary. However there are two exceptions:
+        * new system chunk if necessary. However there are three exceptions:
         *
         * 1) We may have enough free space in the system space_info but all the
         *    existing system block groups have a profile which can not be used
@@ -3435,13 +3455,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
         *    with enough free space got turned into RO mode by a running scrub,
         *    and in this case we have to allocate a new one and retry. We only
         *    need do this allocate and retry once, since we have a transaction
-        *    handle and scrub uses the commit root to search for block groups.
+        *    handle and scrub uses the commit root to search for block groups;
+        *
+        * 3) We had one system block group with enough free space when we called
+        *    check_system_chunk(), but after that, right before we tried to
+        *    allocate the last extent buffer we needed, a discard operation came
+        *    in and it temporarily removed the last free space entry from the
+        *    block group (discard removes a free space entry, discards it, and
+        *    then adds back the entry to the block group cache).
         */
        if (ret == -ENOSPC) {
                const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
                struct btrfs_block_group *sys_bg;
 
-               sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+               sys_bg = btrfs_create_chunk(trans, sys_flags);
                if (IS_ERR(sys_bg)) {
                        ret = PTR_ERR(sys_bg);
                        btrfs_abort_transaction(trans, ret);
@@ -3519,7 +3546,15 @@ out:
  *    properly, either intentionally or as a bug. One example where this is
  *    done intentionally is fsync, as it does not reserve any transaction units
  *    and ends up allocating a variable number of metadata extents for log
- *    tree extent buffers.
+ *    tree extent buffers;
+ *
+ * 4) The task has reserved enough transaction units / metadata space, but right
+ *    before it tries to allocate the last extent buffer it needs, a discard
+ *    operation comes in and, temporarily, removes the last free space entry from
+ *    the only metadata block group that had free space (discard starts by
+ *    removing a free space entry from a block group, then does the discard
+ *    operation and, once it's done, it adds back the free space entry to the
+ *    block group).
  *
  * We also need this 2 phases setup when adding a device to a filesystem with
  * a seed device - we must create new metadata and system chunks without adding
@@ -3537,14 +3572,14 @@ out:
  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
  * the system chunk array due to concurrent allocations") provides more details.
  *
- * For allocation of system chunks, we defer the updates and insertions into the
- * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
- * if the chunk allocation is triggered while COWing an extent buffer of the
- * chunk btree, we are holding a lock on the parent of that extent buffer and
- * doing the chunk btree updates and insertions can require locking that parent.
- * This is for the very few and rare cases where we update the chunk btree that
- * are not chunk allocation or chunk removal: adding a device, removing a device
- * or resizing a device.
+ * Allocation of system chunks does not happen through this function. A task that
+ * needs to update the chunk btree (the only btree that uses system chunks), must
+ * preallocate chunk space by calling either check_system_chunk() or
+ * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
+ * metadata chunk or when removing a chunk, while the latter is used before doing
+ * a modification to the chunk btree - use cases for the latter are adding,
+ * removing and resizing a device as well as relocation of a system chunk.
+ * See the comment below for more details.
  *
  * The reservation of system space, done through check_system_chunk(), as well
  * as all the updates and insertions into the chunk btree must be done while
@@ -3581,11 +3616,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
        if (trans->allocating_chunk)
                return -ENOSPC;
        /*
-        * If we are removing a chunk, don't re-enter or we would deadlock.
-        * System space reservation and system chunk allocation is done by the
-        * chunk remove operation (btrfs_remove_chunk()).
+        * Allocation of system chunks can not happen through this path, as we
+        * could end up in a deadlock if we are allocating a data or metadata
+        * chunk and there is another task modifying the chunk btree.
+        *
+        * This is because while we are holding the chunk mutex, we will attempt
+        * to add the new chunk item to the chunk btree or update an existing
+        * device item in the chunk btree, while the other task that is modifying
+        * the chunk btree is attempting to COW an extent buffer while holding a
+        * lock on it and on its parent - if the COW operation triggers a system
+        * chunk allocation, then we can deadlock because we are holding the
+        * chunk mutex and we may need to access that extent buffer or its parent
+        * in order to add the chunk item or update a device item.
+        *
+        * Tasks that want to modify the chunk tree should reserve system space
+        * before updating the chunk btree, by calling either
+        * btrfs_reserve_chunk_metadata() or check_system_chunk().
+        * It's possible that after a task reserves the space, it still ends up
+        * here - this happens in the cases described above at do_chunk_alloc().
+        * The task will have to either retry or fail.
         */
-       if (trans->removing_chunk)
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                return -ENOSPC;
 
        space_info = btrfs_find_space_info(fs_info, flags);
@@ -3684,17 +3735,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
        return num_dev;
 }
 
-/*
- * Reserve space in the system space for allocating or removing a chunk
- */
-void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+                               u64 bytes,
+                               u64 type)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_space_info *info;
        u64 left;
-       u64 thresh;
        int ret = 0;
-       u64 num_devs;
 
        /*
         * Needed because we can end up allocating a system chunk and for an
@@ -3707,19 +3755,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
        left = info->total_bytes - btrfs_space_info_used(info, true);
        spin_unlock(&info->lock);
 
-       num_devs = get_profile_num_devs(fs_info, type);
-
-       /* num_devs device items to update and 1 chunk item to add or remove */
-       thresh = btrfs_calc_metadata_size(fs_info, num_devs) +
-               btrfs_calc_insert_metadata_size(fs_info, 1);
-
-       if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+       if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
-                          left, thresh, type);
+                          left, bytes, type);
                btrfs_dump_space_info(fs_info, info, 0, 0);
        }
 
-       if (left < thresh) {
+       if (left < bytes) {
                u64 flags = btrfs_system_alloc_profile(fs_info);
                struct btrfs_block_group *bg;
 
@@ -3728,21 +3770,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
                 * needing it, as we might not need to COW all nodes/leafs from
                 * the paths we visit in the chunk tree (they were already COWed
                 * or created in the current transaction for example).
-                *
-                * Also, if our caller is allocating a system chunk, do not
-                * attempt to insert the chunk item in the chunk btree, as we
-                * could deadlock on an extent buffer since our caller may be
-                * COWing an extent buffer from the chunk btree.
                 */
-               bg = btrfs_alloc_chunk(trans, flags);
+               bg = btrfs_create_chunk(trans, flags);
                if (IS_ERR(bg)) {
                        ret = PTR_ERR(bg);
-               } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+               } else {
                        /*
                         * If we fail to add the chunk item here, we end up
                         * trying again at phase 2 of chunk allocation, at
                         * btrfs_create_pending_block_groups(). So ignore
-                        * any error here.
+                        * any error here. An ENOSPC here could happen, due to
+                        * the cases described at do_chunk_alloc() - the system
+                        * block group we just created was just turned into RO
+                        * mode by a scrub for example, or a running discard
+                        * temporarily removed its free space entries, etc.
                         */
                        btrfs_chunk_alloc_add_chunk_item(trans, bg);
                }
@@ -3751,12 +3792,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
        if (!ret) {
                ret = btrfs_block_rsv_add(fs_info->chunk_root,
                                          &fs_info->chunk_block_rsv,
-                                         thresh, BTRFS_RESERVE_NO_FLUSH);
+                                         bytes, BTRFS_RESERVE_NO_FLUSH);
                if (!ret)
-                       trans->chunk_bytes_reserved += thresh;
+                       trans->chunk_bytes_reserved += bytes;
        }
 }
 
+/*
+ * Reserve space in the system space for allocating or removing a chunk.
+ * The caller must be holding fs_info->chunk_mutex.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       const u64 num_devs = get_profile_num_devs(fs_info, type);
+       u64 bytes;
+
+       /* num_devs device items to update and 1 chunk item to add or remove. */
+       bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
+               btrfs_calc_insert_metadata_size(fs_info, 1);
+
+       reserve_chunk_space(trans, bytes, type);
+}
+
+/*
+ * Reserve space in the system space, if needed, for doing a modification to the
+ * chunk btree.
+ *
+ * @trans:             A transaction handle.
+ * @is_item_insertion: Indicate if the modification is for inserting a new item
+ *                     in the chunk btree or if it's for the deletion or update
+ *                     of an existing item.
+ *
+ * This is used in a context where we need to update the chunk btree outside
+ * block group allocation and removal, to avoid a deadlock with a concurrent
+ * task that is allocating a metadata or data block group and therefore needs to
+ * update the chunk btree while holding the chunk mutex. After the update to the
+ * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
+ *
+ */
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+                                 bool is_item_insertion)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       u64 bytes;
+
+       if (is_item_insertion)
+               bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
+       else
+               bytes = btrfs_calc_metadata_size(fs_info, 1);
+
+       mutex_lock(&fs_info->chunk_mutex);
+       reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
+       mutex_unlock(&fs_info->chunk_mutex);
+}
+
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
 {
        struct btrfs_block_group *block_group;
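
  The kernel-doc above describes the intended calling pattern for chunk btree
  modifications done outside chunk allocation and removal (adding, removing or
  resizing a device, relocating a system chunk). A sketch of that pattern with
  a hypothetical caller, assuming it already holds a transaction handle and
  using the existing btrfs_trans_release_chunk_metadata() helper mentioned
  above to drop the unused reservation afterwards:

    /*
     * Sketch only (hypothetical caller, not a real kernel function):
     * reserve system space, modify the chunk btree, then release the
     * part of the reservation that was not consumed.
     */
    static int example_modify_chunk_btree(struct btrfs_trans_handle *trans)
    {
            int ret = 0;

            /* Updating an existing item, so no insertion-sized reservation. */
            btrfs_reserve_chunk_metadata(trans, false);

            /* ... update a device item or chunk item in the chunk btree ... */

            btrfs_trans_release_chunk_metadata(trans);

            return ret;
    }
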
@@ -3833,6 +3923,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        }
        spin_unlock(&info->unused_bgs_lock);
 
+       spin_lock(&info->zone_active_bgs_lock);
+       while (!list_empty(&info->zone_active_bgs)) {
+               block_group = list_first_entry(&info->zone_active_bgs,
+                                              struct btrfs_block_group,
+                                              active_bg_list);
+               list_del_init(&block_group->active_bg_list);
+               btrfs_put_block_group(block_group);
+       }
+       spin_unlock(&info->zone_active_bgs_lock);
+
        spin_lock(&info->block_group_cache_lock);
        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
                block_group = rb_entry(n, struct btrfs_block_group,
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c72a71e..5878b7c 100644
@@ -98,6 +98,7 @@ struct btrfs_block_group {
        unsigned int to_copy:1;
        unsigned int relocating_repair:1;
        unsigned int chunk_item_inserted:1;
+       unsigned int zone_is_active:1;
 
        int disk_cache_state;
 
@@ -202,7 +203,10 @@ struct btrfs_block_group {
         */
        u64 alloc_offset;
        u64 zone_unusable;
+       u64 zone_capacity;
        u64 meta_write_pointer;
+       struct map_lookup *physical_map;
+       struct list_head active_bg_list;
 };
 
 static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
-                            u64 bytenr, u64 num_bytes, int alloc);
+                            u64 bytenr, u64 num_bytes, bool alloc);
 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
                             u64 ram_bytes, u64 num_bytes, int delalloc);
 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
@@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
                      enum btrfs_chunk_alloc_enum force);
 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
 void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
+void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
+                                 bool is_item_insertion);
 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 76ee145..ab2a4a5 100644
@@ -138,17 +138,34 @@ struct btrfs_inode {
        /* a local copy of root's last_log_commit */
        int last_log_commit;
 
-       /* total number of bytes pending delalloc, used by stat to calc the
-        * real block usage of the file
-        */
-       u64 delalloc_bytes;
-
-       /*
-        * Total number of bytes pending delalloc that fall within a file
-        * range that is either a hole or beyond EOF (and no prealloc extent
-        * exists in the range). This is always <= delalloc_bytes.
-        */
-       u64 new_delalloc_bytes;
+       union {
+               /*
+                * Total number of bytes pending delalloc, used by stat to
+                * calculate the real block usage of the file. This is used
+                * only for files.
+                */
+               u64 delalloc_bytes;
+               /*
+                * The offset of the last dir item key that was logged.
+                * This is used only for directories.
+                */
+               u64 last_dir_item_offset;
+       };
+
+       union {
+               /*
+                * Total number of bytes pending delalloc that fall within a file
+                * range that is either a hole or beyond EOF (and no prealloc extent
+                * exists in the range). This is always <= delalloc_bytes and this
+                * is used only for files.
+                */
+               u64 new_delalloc_bytes;
+               /*
+                * The offset of the last dir index key that was logged.
+                * This is used only for directories.
+                */
+               u64 last_dir_index_offset;
+       };
 
        /*
         * total number of bytes pending defrag, used by stat to check whether
@@ -339,7 +356,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 
 struct btrfs_dio_private {
        struct inode *inode;
-       u64 logical_offset;
+
+       /*
+        * Since DIO can use anonymous page, we cannot use page_offset() to
+        * grab the file offset, thus need a dedicated member for file offset.
+        */
+       u64 file_offset;
        u64 disk_bytenr;
        /* Used for bio::bi_size */
        u32 bytes;
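
  The unions added to struct btrfs_inode above overlay file-only fields (the
  delalloc byte counters) with directory-only fields (the last logged dir
  item/index offsets), so the structure does not grow. A minimal,
  self-contained userspace sketch of the same pattern, with hypothetical names
  rather than the kernel structure:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    /* Illustration only: mutually exclusive fields share storage. */
    struct example_inode {
            bool is_dir;
            union {
                    uint64_t delalloc_bytes;        /* regular files only */
                    uint64_t last_dir_item_offset;  /* directories only   */
            };
    };

    int main(void)
    {
            struct example_inode file = { .is_dir = false, .delalloc_bytes = 4096 };
            struct example_inode dir  = { .is_dir = true,  .last_dir_item_offset = 42 };

            /*
             * Both variants have the same size; each field is valid only
             * for the inode type it belongs to.
             */
            printf("sizeof(struct example_inode) = %zu\n",
                   sizeof(struct example_inode));
            printf("file: delalloc_bytes = %llu\n",
                   (unsigned long long)file.delalloc_bytes);
            printf("dir:  last_dir_item_offset = %llu\n",
                   (unsigned long long)dir.last_dir_item_offset);
            return 0;
    }
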
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 8681608..7e9f90f 100644
@@ -186,7 +186,6 @@ struct btrfsic_dev_state {
        struct list_head collision_resolving_node;      /* list node */
        struct btrfsic_block dummy_block_for_bio_bh_flush;
        u64 last_flush_gen;
-       char name[BDEVNAME_SIZE];
 };
 
 struct btrfsic_block_hashtable {
@@ -403,7 +402,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
        ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
        ds->bdev = NULL;
        ds->state = NULL;
-       ds->name[0] = '\0';
        INIT_LIST_HEAD(&ds->collision_resolving_node);
        ds->last_flush_gen = 0;
        btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
@@ -756,10 +754,10 @@ static int btrfsic_process_superblock_dev_mirror(
                superblock_tmp->mirror_num = 1 + superblock_mirror_num;
                if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
                        btrfs_info_in_rcu(fs_info,
-                               "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
+                       "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
                                     superblock_bdev,
                                     rcu_str_deref(device->name), dev_bytenr,
-                                    dev_state->name, dev_bytenr,
+                                    dev_state->bdev, dev_bytenr,
                                     superblock_mirror_num);
                list_add(&superblock_tmp->all_blocks_node,
                         &state->all_blocks_list);
@@ -938,9 +936,10 @@ continue_with_current_leaf_stack_frame:
                        if (disk_item_offset + sizeof(struct btrfs_item) >
                            sf->block_ctx->len) {
 leaf_item_out_of_bounce_error:
-                               pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+                               pr_info(
+               "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n",
                                       sf->block_ctx->start,
-                                      sf->block_ctx->dev->name);
+                                      sf->block_ctx->dev->bdev);
                                goto one_stack_frame_backwards;
                        }
                        btrfsic_read_from_block_data(sf->block_ctx,
@@ -1058,9 +1057,10 @@ continue_with_current_node_stack_frame:
                                          (uintptr_t)nodehdr;
                        if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
                            sf->block_ctx->len) {
-                               pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n",
+                               pr_info(
+               "btrfsic: node item out of bounce at logical %llu, dev %pg\n",
                                       sf->block_ctx->start,
-                                      sf->block_ctx->dev->name);
+                                      sf->block_ctx->dev->bdev);
                                goto one_stack_frame_backwards;
                        }
                        btrfsic_read_from_block_data(
@@ -1228,15 +1228,17 @@ static int btrfsic_create_link_to_next_block(
                        if (next_block->logical_bytenr != next_bytenr &&
                            !(!next_block->is_metadata &&
                              0 == next_block->logical_bytenr))
-                               pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
-                                      next_bytenr, next_block_ctx->dev->name,
+                               pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+                                      next_bytenr, next_block_ctx->dev->bdev,
                                       next_block_ctx->dev_bytenr, *mirror_nump,
                                       btrfsic_get_block_type(state,
                                                              next_block),
                                       next_block->logical_bytenr);
                        else
-                               pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
-                                      next_bytenr, next_block_ctx->dev->name,
+                               pr_info(
+               "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+                                      next_bytenr, next_block_ctx->dev->bdev,
                                       next_block_ctx->dev_bytenr, *mirror_nump,
                                       btrfsic_get_block_type(state,
                                                              next_block));
@@ -1324,8 +1326,8 @@ static int btrfsic_handle_extent_data(
        if (file_extent_item_offset +
            offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
            block_ctx->len) {
-               pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
-                      block_ctx->start, block_ctx->dev->name);
+               pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+                      block_ctx->start, block_ctx->dev->bdev);
                return -1;
        }
 
@@ -1344,8 +1346,8 @@ static int btrfsic_handle_extent_data(
 
        if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
            block_ctx->len) {
-               pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n",
-                      block_ctx->start, block_ctx->dev->name);
+               pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n",
+                      block_ctx->start, block_ctx->dev->bdev);
                return -1;
        }
        btrfsic_read_from_block_data(block_ctx, &file_extent_item,
@@ -1421,9 +1423,10 @@ static int btrfsic_handle_extent_data(
                                    next_block->logical_bytenr != next_bytenr &&
                                    !(!next_block->is_metadata &&
                                      0 == next_block->logical_bytenr)) {
-                                       pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n",
+                                       pr_info(
+"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n",
                                               next_bytenr,
-                                              next_block_ctx.dev->name,
+                                              next_block_ctx.dev->bdev,
                                               next_block_ctx.dev_bytenr,
                                               mirror_num,
                                               next_block->logical_bytenr);
@@ -1455,7 +1458,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
        struct btrfs_fs_info *fs_info = state->fs_info;
        int ret;
        u64 length;
-       struct btrfs_bio *multi = NULL;
+       struct btrfs_io_context *multi = NULL;
        struct btrfs_device *device;
 
        length = len;
@@ -1561,7 +1564,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
                struct bio *bio;
                unsigned int j;
 
-               bio = btrfs_io_bio_alloc(num_pages - i);
+               bio = btrfs_bio_alloc(num_pages - i);
                bio_set_dev(bio, block_ctx->dev->bdev);
                bio->bi_iter.bi_sector = dev_bytenr >> 9;
                bio->bi_opf = REQ_OP_READ;
@@ -1577,8 +1580,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
                        return -1;
                }
                if (submit_bio_wait(bio)) {
-                       pr_info("btrfsic: read error at logical %llu dev %s!\n",
-                              block_ctx->start, block_ctx->dev->name);
+                       pr_info("btrfsic: read error at logical %llu dev %pg!\n",
+                              block_ctx->start, block_ctx->dev->bdev);
                        bio_put(bio);
                        return -1;
                }
@@ -1602,33 +1605,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
        list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
                const struct btrfsic_block_link *l;
 
-               pr_info("%c-block @%llu (%s/%llu/%d)\n",
+               pr_info("%c-block @%llu (%pg/%llu/%d)\n",
                       btrfsic_get_block_type(state, b_all),
-                      b_all->logical_bytenr, b_all->dev_state->name,
+                      b_all->logical_bytenr, b_all->dev_state->bdev,
                       b_all->dev_bytenr, b_all->mirror_num);
 
                list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
-                       pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n",
+                       pr_info(
+               " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n",
                               btrfsic_get_block_type(state, b_all),
-                              b_all->logical_bytenr, b_all->dev_state->name,
+                              b_all->logical_bytenr, b_all->dev_state->bdev,
                               b_all->dev_bytenr, b_all->mirror_num,
                               l->ref_cnt,
                               btrfsic_get_block_type(state, l->block_ref_to),
                               l->block_ref_to->logical_bytenr,
-                              l->block_ref_to->dev_state->name,
+                              l->block_ref_to->dev_state->bdev,
                               l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num);
                }
 
                list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
-                       pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+                       pr_info(
+               " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
                               btrfsic_get_block_type(state, b_all),
-                              b_all->logical_bytenr, b_all->dev_state->name,
+                              b_all->logical_bytenr, b_all->dev_state->bdev,
                               b_all->dev_bytenr, b_all->mirror_num,
                               l->ref_cnt,
                               btrfsic_get_block_type(state, l->block_ref_from),
                               l->block_ref_from->logical_bytenr,
-                              l->block_ref_from->dev_state->name,
+                              l->block_ref_from->dev_state->bdev,
                               l->block_ref_from->dev_bytenr,
                               l->block_ref_from->mirror_num);
                }
@@ -1743,16 +1748,18 @@ again:
                                if (block->logical_bytenr != bytenr &&
                                    !(!block->is_metadata &&
                                      block->logical_bytenr == 0))
-                                       pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
-                                              bytenr, dev_state->name,
+                                       pr_info(
+"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n",
+                                              bytenr, dev_state->bdev,
                                               dev_bytenr,
                                               block->mirror_num,
                                               btrfsic_get_block_type(state,
                                                                      block),
                                               block->logical_bytenr);
                                else
-                                       pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
-                                              bytenr, dev_state->name,
+                                       pr_info(
+               "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+                                              bytenr, dev_state->bdev,
                                               dev_bytenr, block->mirror_num,
                                               btrfsic_get_block_type(state,
                                                                      block));
@@ -1767,8 +1774,9 @@ again:
                        processed_len = state->datablock_size;
                        bytenr = block->logical_bytenr;
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                               pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
-                                      bytenr, dev_state->name, dev_bytenr,
+                               pr_info(
+               "written block @%llu (%pg/%llu/%d) found in hash table, %c\n",
+                                      bytenr, dev_state->bdev, dev_bytenr,
                                       block->mirror_num,
                                       btrfsic_get_block_type(state, block));
                }
@@ -1778,9 +1786,10 @@ again:
                               list_empty(&block->ref_to_list) ? ' ' : '!',
                               list_empty(&block->ref_from_list) ? ' ' : '!');
                if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
-                       pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
+                       pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n",
                               btrfsic_get_block_type(state, block), bytenr,
-                              dev_state->name, dev_bytenr, block->mirror_num,
+                              dev_state->bdev, dev_bytenr, block->mirror_num,
                               block->generation,
                               btrfs_disk_key_objectid(&block->disk_key),
                               block->disk_key.type,
@@ -1792,9 +1801,10 @@ again:
                }
 
                if (!block->is_iodone && !block->never_written) {
-                       pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
+                       pr_info(
+"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n",
                               btrfsic_get_block_type(state, block), bytenr,
-                              dev_state->name, dev_bytenr, block->mirror_num,
+                              dev_state->bdev, dev_bytenr, block->mirror_num,
                               block->generation,
                               btrfs_stack_header_generation(
                                       (struct btrfs_header *)
@@ -1921,8 +1931,9 @@ again:
                if (!is_metadata) {
                        processed_len = state->datablock_size;
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                               pr_info("Written block (%s/%llu/?) !found in hash table, D.\n",
-                                      dev_state->name, dev_bytenr);
+                               pr_info(
+                       "written block (%pg/%llu/?) !found in hash table, D\n",
+                                      dev_state->bdev, dev_bytenr);
                        if (!state->include_extent_data) {
                                /* ignore that written D block */
                                goto continue_loop;
@@ -1939,8 +1950,9 @@ again:
                        btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
                                                       dev_bytenr);
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                               pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n",
-                                      bytenr, dev_state->name, dev_bytenr);
+                               pr_info(
+                       "written block @%llu (%pg/%llu/?) !found in hash table, M\n",
+                                      bytenr, dev_state->bdev, dev_bytenr);
                }
 
                block_ctx.dev = dev_state;
@@ -1995,9 +2007,9 @@ again:
                        block->next_in_same_bio = NULL;
                }
                if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                       pr_info("New written %c-block @%llu (%s/%llu/%d)\n",
+                       pr_info("new written %c-block @%llu (%pg/%llu/%d)\n",
                               is_metadata ? 'M' : 'D',
-                              block->logical_bytenr, block->dev_state->name,
+                              block->logical_bytenr, block->dev_state->bdev,
                               block->dev_bytenr, block->mirror_num);
                list_add(&block->all_blocks_node, &state->all_blocks_list);
                btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2041,10 +2053,10 @@ static void btrfsic_bio_end_io(struct bio *bp)
 
                if ((dev_state->state->print_mask &
                     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
-                       pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+                       pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n",
                               bp->bi_status,
                               btrfsic_get_block_type(dev_state->state, block),
-                              block->logical_bytenr, dev_state->name,
+                              block->logical_bytenr, dev_state->bdev,
                               block->dev_bytenr, block->mirror_num);
                next_block = block->next_in_same_bio;
                block->iodone_w_error = iodone_w_error;
@@ -2052,8 +2064,8 @@ static void btrfsic_bio_end_io(struct bio *bp)
                        dev_state->last_flush_gen++;
                        if ((dev_state->state->print_mask &
                             BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
-                               pr_info("bio_end_io() new %s flush_gen=%llu\n",
-                                      dev_state->name,
+                               pr_info("bio_end_io() new %pg flush_gen=%llu\n",
+                                      dev_state->bdev,
                                       dev_state->last_flush_gen);
                }
                if (block->submit_bio_bh_rw & REQ_FUA)
@@ -2078,17 +2090,19 @@ static int btrfsic_process_written_superblock(
        if (!(superblock->generation > state->max_superblock_generation ||
              0 == state->max_superblock_generation)) {
                if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-                       pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n",
+                       pr_info(
+       "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n",
                               superblock->logical_bytenr,
-                              superblock->dev_state->name,
+                              superblock->dev_state->bdev,
                               superblock->dev_bytenr, superblock->mirror_num,
                               btrfs_super_generation(super_hdr),
                               state->max_superblock_generation);
        } else {
                if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-                       pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n",
+                       pr_info(
+       "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n",
                               superblock->logical_bytenr,
-                              superblock->dev_state->name,
+                              superblock->dev_state->bdev,
                               superblock->dev_bytenr, superblock->mirror_num,
                               btrfs_super_generation(super_hdr),
                               state->max_superblock_generation);
@@ -2232,38 +2246,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
         */
        list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
                if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                       pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n",
+                       pr_info(
+               "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n",
                               recursion_level,
                               btrfsic_get_block_type(state, block),
-                              block->logical_bytenr, block->dev_state->name,
+                              block->logical_bytenr, block->dev_state->bdev,
                               block->dev_bytenr, block->mirror_num,
                               l->ref_cnt,
                               btrfsic_get_block_type(state, l->block_ref_to),
                               l->block_ref_to->logical_bytenr,
-                              l->block_ref_to->dev_state->name,
+                              l->block_ref_to->dev_state->bdev,
                               l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num);
                if (l->block_ref_to->never_written) {
-                       pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n",
+                       pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n",
                               btrfsic_get_block_type(state, l->block_ref_to),
                               l->block_ref_to->logical_bytenr,
-                              l->block_ref_to->dev_state->name,
+                              l->block_ref_to->dev_state->bdev,
                               l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num);
                        ret = -1;
                } else if (!l->block_ref_to->is_iodone) {
-                       pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n",
+                       pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n",
                               btrfsic_get_block_type(state, l->block_ref_to),
                               l->block_ref_to->logical_bytenr,
-                              l->block_ref_to->dev_state->name,
+                              l->block_ref_to->dev_state->bdev,
                               l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num);
                        ret = -1;
                } else if (l->block_ref_to->iodone_w_error) {
-                       pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n",
+                       pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n",
                               btrfsic_get_block_type(state, l->block_ref_to),
                               l->block_ref_to->logical_bytenr,
-                              l->block_ref_to->dev_state->name,
+                              l->block_ref_to->dev_state->bdev,
                               l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num);
                        ret = -1;
@@ -2273,10 +2291,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
                           l->parent_generation &&
                           BTRFSIC_GENERATION_UNKNOWN !=
                           l->block_ref_to->generation) {
-                       pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n",
+                       pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n",
                               btrfsic_get_block_type(state, l->block_ref_to),
                               l->block_ref_to->logical_bytenr,
-                              l->block_ref_to->dev_state->name,
+                              l->block_ref_to->dev_state->bdev,
                               l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num,
                               l->block_ref_to->generation,
@@ -2284,10 +2303,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
                        ret = -1;
                } else if (l->block_ref_to->flush_gen >
                           l->block_ref_to->dev_state->last_flush_gen) {
-                       pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
+                       pr_info(
+"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n",
                               btrfsic_get_block_type(state, l->block_ref_to),
                               l->block_ref_to->logical_bytenr,
-                              l->block_ref_to->dev_state->name,
+                              l->block_ref_to->dev_state->bdev,
                               l->block_ref_to->dev_bytenr,
                               l->block_ref_to->mirror_num, block->flush_gen,
                               l->block_ref_to->dev_state->last_flush_gen);
@@ -2324,15 +2344,16 @@ static int btrfsic_is_block_ref_by_superblock(
         */
        list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
                if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                       pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n",
+                       pr_info(
+       "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n",
                               recursion_level,
                               btrfsic_get_block_type(state, block),
-                              block->logical_bytenr, block->dev_state->name,
+                              block->logical_bytenr, block->dev_state->bdev,
                               block->dev_bytenr, block->mirror_num,
                               l->ref_cnt,
                               btrfsic_get_block_type(state, l->block_ref_from),
                               l->block_ref_from->logical_bytenr,
-                              l->block_ref_from->dev_state->name,
+                              l->block_ref_from->dev_state->bdev,
                               l->block_ref_from->dev_bytenr,
                               l->block_ref_from->mirror_num);
                if (l->block_ref_from->is_superblock &&
@@ -2354,30 +2375,30 @@ static int btrfsic_is_block_ref_by_superblock(
 static void btrfsic_print_add_link(const struct btrfsic_state *state,
                                   const struct btrfsic_block_link *l)
 {
-       pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+       pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
               l->ref_cnt,
               btrfsic_get_block_type(state, l->block_ref_from),
               l->block_ref_from->logical_bytenr,
-              l->block_ref_from->dev_state->name,
+              l->block_ref_from->dev_state->bdev,
               l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
               btrfsic_get_block_type(state, l->block_ref_to),
               l->block_ref_to->logical_bytenr,
-              l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+              l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
               l->block_ref_to->mirror_num);
 }
 
 static void btrfsic_print_rem_link(const struct btrfsic_state *state,
                                   const struct btrfsic_block_link *l)
 {
-       pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n",
+       pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n",
               l->ref_cnt,
               btrfsic_get_block_type(state, l->block_ref_from),
               l->block_ref_from->logical_bytenr,
-              l->block_ref_from->dev_state->name,
+              l->block_ref_from->dev_state->bdev,
               l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
               btrfsic_get_block_type(state, l->block_ref_to),
               l->block_ref_to->logical_bytenr,
-              l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
+              l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr,
               l->block_ref_to->mirror_num);
 }
 
@@ -2419,9 +2440,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
         * This algorithm is recursive because the amount of used stack space
         * is very small and the max recursion depth is limited.
         */
-       indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
+       indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)",
                             btrfsic_get_block_type(state, block),
-                            block->logical_bytenr, block->dev_state->name,
+                            block->logical_bytenr, block->dev_state->bdev,
                             block->dev_bytenr, block->mirror_num);
        if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
                printk("[...]\n");
@@ -2542,10 +2563,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
                block->never_written = never_written;
                block->mirror_num = mirror_num;
                if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-                       pr_info("New %s%c-block @%llu (%s/%llu/%d)\n",
+                       pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n",
                               additional_string,
                               btrfsic_get_block_type(state, block),
-                              block->logical_bytenr, dev_state->name,
+                              block->logical_bytenr, dev_state->bdev,
                               block->dev_bytenr, mirror_num);
                list_add(&block->all_blocks_node, &state->all_blocks_list);
                btrfsic_block_hashtable_add(block, &state->block_hashtable);
@@ -2592,8 +2613,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
        }
 
        if (WARN_ON(!match)) {
-               pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n",
-                      bytenr, dev_state->name, dev_bytenr);
+               pr_info(
+"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n",
+                      bytenr, dev_state->bdev, dev_bytenr);
                for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
                        ret = btrfsic_map_block(state, bytenr,
                                                state->metablock_size,
@@ -2601,8 +2623,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                        if (ret)
                                continue;
 
-                       pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n",
-                              bytenr, block_ctx.dev->name,
+                       pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n",
+                              bytenr, block_ctx.dev->bdev,
                               block_ctx.dev_bytenr, mirror_num);
                }
        }
@@ -2675,8 +2697,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
                        if ((dev_state->state->print_mask &
                             (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
                              BTRFSIC_PRINT_MASK_VERBOSE)))
-                               pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n",
-                                      dev_state->name);
+                               pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+                                      dev_state->bdev);
                } else {
                        struct btrfsic_block *const block =
                                &dev_state->dummy_block_for_bio_bh_flush;
@@ -2751,7 +2774,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
 
        list_for_each_entry(device, dev_head, dev_list) {
                struct btrfsic_dev_state *ds;
-               const char *p;
 
                if (!device->bdev || !device->name)
                        continue;
@@ -2763,10 +2785,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info,
                }
                ds->bdev = device->bdev;
                ds->state = state;
-               bdevname(ds->bdev, ds->name);
-               ds->name[BDEVNAME_SIZE - 1] = '\0';
-               p = kbasename(ds->name);
-               strlcpy(ds->name, p, sizeof(ds->name));
                btrfsic_dev_state_hashtable_add(ds,
                                                &btrfsic_dev_state_hashtable);
        }
@@ -2844,9 +2862,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices)
                if (b_all->is_iodone || b_all->never_written)
                        btrfsic_block_free(b_all);
                else
-                       pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n",
+                       pr_info(
+"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n",
                               btrfsic_get_block_type(state, b_all),
-                              b_all->logical_bytenr, b_all->dev_state->name,
+                              b_all->logical_bytenr, b_all->dev_state->bdev,
                               b_all->dev_bytenr, b_all->mirror_num);
        }
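
The check-integrity changes above drop the cached dev_state->name string in favour of the "%pg" printk specifier, which prints a block device's name straight from its struct block_device. A minimal sketch of that specifier (illustrative only; the helper and message below are not part of the patch):

    #include <linux/blkdev.h>
    #include <linux/printk.h>

    /* Hypothetical helper: report a byte offset on a block device via %pg. */
    static void example_report_bdev(struct block_device *bdev, u64 dev_bytenr)
    {
            /* %pg expands to the device name, e.g. "sda1" or "nvme0n1p2" */
            pr_info("btrfs example: offset %llu on device %pg\n", dev_bytenr, bdev);
    }
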
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6c7eb80..32da97c 100644
@@ -29,6 +29,7 @@
 #include "compression.h"
 #include "extent_io.h"
 #include "extent_map.h"
+#include "subpage.h"
 #include "zoned.h"
 
 static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -181,9 +182,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
                        if (memcmp(&csum, cb_sum, csum_size) != 0) {
                                btrfs_print_data_csum_error(inode, disk_start,
                                                csum, cb_sum, cb->mirror_num);
-                               if (btrfs_io_bio(bio)->device)
+                               if (btrfs_bio(bio)->device)
                                        btrfs_dev_stat_inc_and_print(
-                                               btrfs_io_bio(bio)->device,
+                                               btrfs_bio(bio)->device,
                                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
                                return -EIO;
                        }
@@ -194,6 +195,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
        return 0;
 }
 
+/*
+ * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
+ *
+ * Return true if there is no pending bio nor io.
+ * Return false otherwise.
+ */
+static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+       unsigned int bi_size = 0;
+       bool last_io = false;
+       struct bio_vec *bvec;
+       struct bvec_iter_all iter_all;
+
+       /*
+        * At endio time, bi_iter.bi_size doesn't represent the real bio size.
+        * Thus here we have to iterate through all segments to grab the
+        * correct bio size.
+        */
+       bio_for_each_segment_all(bvec, bio, iter_all)
+               bi_size += bvec->bv_len;
+
+       if (bio->bi_status)
+               cb->errors = 1;
+
+       ASSERT(bi_size && bi_size <= cb->compressed_len);
+       last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
+                                       &cb->pending_sectors);
+       /*
+        * Here we must wake up the possible error handler after all other
+        * operations on @cb finished, or we can race with
+        * finish_compressed_bio_*() which may free @cb.
+        */
+       wake_up_var(cb);
+
+       return last_io;
+}
+
+static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+{
+       unsigned int index;
+       struct page *page;
+
+       /* Release the compressed pages */
+       for (index = 0; index < cb->nr_pages; index++) {
+               page = cb->compressed_pages[index];
+               page->mapping = NULL;
+               put_page(page);
+       }
+
+       /* Do io completion on the original bio */
+       if (cb->errors) {
+               bio_io_error(cb->orig_bio);
+       } else {
+               struct bio_vec *bvec;
+               struct bvec_iter_all iter_all;
+
+               ASSERT(bio);
+               ASSERT(!bio->bi_status);
+               /*
+                * We have verified the checksum already, set page checked so
+                * the end_io handlers know about it
+                */
+               ASSERT(!bio_flagged(bio, BIO_CLONED));
+               bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
+                       u64 bvec_start = page_offset(bvec->bv_page) +
+                                        bvec->bv_offset;
+
+                       btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
+                                       bvec->bv_page, bvec_start,
+                                       bvec->bv_len);
+               }
+
+               bio_endio(cb->orig_bio);
+       }
+
+       /* Finally free the cb struct */
+       kfree(cb->compressed_pages);
+       kfree(cb);
+}
+
 /* when we finish reading compressed pages from the disk, we
  * decompress them and then run the bio end_io routines on the
  * decompressed pages (in the inode address space).
@@ -208,25 +290,17 @@ static void end_compressed_bio_read(struct bio *bio)
 {
        struct compressed_bio *cb = bio->bi_private;
        struct inode *inode;
-       struct page *page;
-       unsigned int index;
-       unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
+       unsigned int mirror = btrfs_bio(bio)->mirror_num;
        int ret = 0;
 
-       if (bio->bi_status)
-               cb->errors = 1;
-
-       /* if there are more bios still pending for this compressed
-        * extent, just exit
-        */
-       if (!refcount_dec_and_test(&cb->pending_bios))
+       if (!dec_and_test_compressed_bio(cb, bio))
                goto out;
 
        /*
         * Record the correct mirror_num in cb->orig_bio so that
         * read-repair can work properly.
         */
-       btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
+       btrfs_bio(cb->orig_bio)->mirror_num = mirror;
        cb->mirror_num = mirror;
 
        /*
@@ -250,36 +324,7 @@ static void end_compressed_bio_read(struct bio *bio)
 csum_failed:
        if (ret)
                cb->errors = 1;
-
-       /* release the compressed pages */
-       index = 0;
-       for (index = 0; index < cb->nr_pages; index++) {
-               page = cb->compressed_pages[index];
-               page->mapping = NULL;
-               put_page(page);
-       }
-
-       /* do io completion on the original bio */
-       if (cb->errors) {
-               bio_io_error(cb->orig_bio);
-       } else {
-               struct bio_vec *bvec;
-               struct bvec_iter_all iter_all;
-
-               /*
-                * we have verified the checksum already, set page
-                * checked so the end_io handlers know about it
-                */
-               ASSERT(!bio_flagged(bio, BIO_CLONED));
-               bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
-                       SetPageChecked(bvec->bv_page);
-
-               bio_endio(cb->orig_bio);
-       }
-
-       /* finally free the cb struct */
-       kfree(cb->compressed_pages);
-       kfree(cb);
+       finish_compressed_bio_read(cb, bio);
 out:
        bio_put(bio);
 }
@@ -291,6 +336,7 @@ out:
 static noinline void end_compressed_writeback(struct inode *inode,
                                              const struct compressed_bio *cb)
 {
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        unsigned long index = cb->start >> PAGE_SHIFT;
        unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
        struct page *pages[16];
@@ -313,7 +359,8 @@ static noinline void end_compressed_writeback(struct inode *inode,
                for (i = 0; i < ret; i++) {
                        if (cb->errors)
                                SetPageError(pages[i]);
-                       end_page_writeback(pages[i]);
+                       btrfs_page_clamp_clear_writeback(fs_info, pages[i],
+                                                        cb->start, cb->len);
                        put_page(pages[i]);
                }
                nr_pages -= ret;
@@ -322,60 +369,127 @@ static noinline void end_compressed_writeback(struct inode *inode,
        /* the inode may be gone now */
 }
 
-/*
- * do the cleanup once all the compressed pages hit the disk.
- * This will clear writeback on the file pages and free the compressed
- * pages.
- *
- * This also calls the writeback end hooks for the file pages so that
- * metadata and checksums can be updated in the file.
- */
-static void end_compressed_bio_write(struct bio *bio)
+static void finish_compressed_bio_write(struct compressed_bio *cb)
 {
-       struct compressed_bio *cb = bio->bi_private;
-       struct inode *inode;
-       struct page *page;
+       struct inode *inode = cb->inode;
        unsigned int index;
 
-       if (bio->bi_status)
-               cb->errors = 1;
-
-       /* if there are more bios still pending for this compressed
-        * extent, just exit
-        */
-       if (!refcount_dec_and_test(&cb->pending_bios))
-               goto out;
-
-       /* ok, we're the last bio for this extent, step one is to
-        * call back into the FS and do all the end_io operations
+       /*
+        * Ok, we're the last bio for this extent, step one is to call back
+        * into the FS and do all the end_io operations.
         */
-       inode = cb->inode;
-       btrfs_record_physical_zoned(inode, cb->start, bio);
        btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
                        cb->start, cb->start + cb->len - 1,
                        !cb->errors);
 
        end_compressed_writeback(inode, cb);
-       /* note, our inode could be gone now */
+       /* Note, our inode could be gone now */
 
        /*
-        * release the compressed pages, these came from alloc_page and
+        * Release the compressed pages, these came from alloc_page and
         * are not attached to the inode at all
         */
-       index = 0;
        for (index = 0; index < cb->nr_pages; index++) {
-               page = cb->compressed_pages[index];
+               struct page *page = cb->compressed_pages[index];
+
                page->mapping = NULL;
                put_page(page);
        }
 
-       /* finally free the cb struct */
+       /* Finally free the cb struct */
        kfree(cb->compressed_pages);
        kfree(cb);
+}
+
+/*
+ * Do the cleanup once all the compressed pages hit the disk.  This will clear
+ * writeback on the file pages and free the compressed pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that metadata
+ * and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio)
+{
+       struct compressed_bio *cb = bio->bi_private;
+
+       if (!dec_and_test_compressed_bio(cb, bio))
+               goto out;
+
+       btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+
+       finish_compressed_bio_write(cb);
 out:
        bio_put(bio);
 }
 
+static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
+                                         struct compressed_bio *cb,
+                                         struct bio *bio, int mirror_num)
+{
+       blk_status_t ret;
+
+       ASSERT(bio->bi_iter.bi_size);
+       ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+       if (ret)
+               return ret;
+       ret = btrfs_map_bio(fs_info, bio, mirror_num);
+       return ret;
+}
+
+/*
+ * Allocate a compressed_bio, which will be used to read/write on-disk
+ * (aka, compressed) data.
+ *
+ * @cb:                 The compressed_bio structure, which records all the needed
+ *                      information to bind the compressed data to the uncompressed
+ *                      page cache.
+ * @disk_bytenr:        The logical bytenr where the compressed data will be read
+ *                      from or written to.
+ * @endio_func:         The endio function to call after the IO for compressed data
+ *                      is finished.
+ * @next_stripe_start:  Return value of logical bytenr of where next stripe starts.
+ *                      Let the caller know to only fill the bio up to the stripe
+ *                      boundary.
+ */
+static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
+                                       unsigned int opf, bio_end_io_t endio_func,
+                                       u64 *next_stripe_start)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
+       struct btrfs_io_geometry geom;
+       struct extent_map *em;
+       struct bio *bio;
+       int ret;
+
+       bio = btrfs_bio_alloc(BIO_MAX_VECS);
+
+       bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
+       bio->bi_opf = opf;
+       bio->bi_private = cb;
+       bio->bi_end_io = endio_func;
+
+       em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
+       if (IS_ERR(em)) {
+               bio_put(bio);
+               return ERR_CAST(em);
+       }
+
+       if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+               bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);
+
+       ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
+       free_extent_map(em);
+       if (ret < 0) {
+               bio_put(bio);
+               return ERR_PTR(ret);
+       }
+       *next_stripe_start = disk_bytenr + geom.len;
+
+       return bio;
+}
+
 /*
  * worker function to build and submit bios for previously compressed pages.
  * The corresponding pages in the inode should be marked for writeback
@@ -396,20 +510,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct bio *bio = NULL;
        struct compressed_bio *cb;
-       unsigned long bytes_left;
-       int pg_index = 0;
-       struct page *page;
-       u64 first_byte = disk_start;
+       u64 cur_disk_bytenr = disk_start;
+       u64 next_stripe_start;
        blk_status_t ret;
        int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
        const bool use_append = btrfs_use_zone_append(inode, disk_start);
        const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
 
-       WARN_ON(!PAGE_ALIGNED(start));
+       ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
+              IS_ALIGNED(len, fs_info->sectorsize));
        cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
        if (!cb)
                return BLK_STS_RESOURCE;
-       refcount_set(&cb->pending_bios, 0);
+       refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
        cb->errors = 0;
        cb->inode = &inode->vfs_inode;
        cb->start = start;
@@ -420,118 +533,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
        cb->orig_bio = NULL;
        cb->nr_pages = nr_pages;
 
-       bio = btrfs_bio_alloc(first_byte);
-       bio->bi_opf = bio_op | write_flags;
-       bio->bi_private = cb;
-       bio->bi_end_io = end_compressed_bio_write;
-
-       if (use_append) {
-               struct btrfs_device *device;
-
-               device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
-               if (IS_ERR(device)) {
-                       kfree(cb);
-                       bio_put(bio);
-                       return BLK_STS_NOTSUPP;
+       while (cur_disk_bytenr < disk_start + compressed_len) {
+               u64 offset = cur_disk_bytenr - disk_start;
+               unsigned int index = offset >> PAGE_SHIFT;
+               unsigned int real_size;
+               unsigned int added;
+               struct page *page = compressed_pages[index];
+               bool submit = false;
+
+               /* Allocate new bio if submitted or not yet allocated */
+               if (!bio) {
+                       bio = alloc_compressed_bio(cb, cur_disk_bytenr,
+                               bio_op | write_flags, end_compressed_bio_write,
+                               &next_stripe_start);
+                       if (IS_ERR(bio)) {
+                               ret = errno_to_blk_status(PTR_ERR(bio));
+                               bio = NULL;
+                               goto finish_cb;
+                       }
                }
-
-               bio_set_dev(bio, device->bdev);
-       }
-
-       if (blkcg_css) {
-               bio->bi_opf |= REQ_CGROUP_PUNT;
-               kthread_associate_blkcg(blkcg_css);
-       }
-       refcount_set(&cb->pending_bios, 1);
-
-       /* create and submit bios for the compressed pages */
-       bytes_left = compressed_len;
-       for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
-               int submit = 0;
-               int len = 0;
-
-               page = compressed_pages[pg_index];
-               page->mapping = inode->vfs_inode.i_mapping;
-               if (bio->bi_iter.bi_size)
-                       submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
-                                                         0);
-
                /*
-                * Page can only be added to bio if the current bio fits in
-                * stripe.
+                * We should never reach next_stripe_start as we will submit
+                * the bio immediately when reaching the boundary.
                 */
-               if (!submit) {
-                       if (pg_index == 0 && use_append)
-                               len = bio_add_zone_append_page(bio, page,
-                                                              PAGE_SIZE, 0);
-                       else
-                               len = bio_add_page(bio, page, PAGE_SIZE, 0);
-               }
-
-               page->mapping = NULL;
-               if (submit || len < PAGE_SIZE) {
-                       /*
-                        * inc the count before we submit the bio so
-                        * we know the end IO handler won't happen before
-                        * we inc the count.  Otherwise, the cb might get
-                        * freed before we're done setting it up
-                        */
-                       refcount_inc(&cb->pending_bios);
-                       ret = btrfs_bio_wq_end_io(fs_info, bio,
-                                                 BTRFS_WQ_ENDIO_DATA);
-                       BUG_ON(ret); /* -ENOMEM */
+               ASSERT(cur_disk_bytenr != next_stripe_start);
 
+               /*
+                * We have various limits on the real write size:
+                * - stripe boundary
+                * - page boundary
+                * - compressed length boundary
+                */
+               real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
+               real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+               real_size = min_t(u64, real_size, compressed_len - offset);
+               ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
+
+               if (use_append)
+                       added = bio_add_zone_append_page(bio, page, real_size,
+                                       offset_in_page(offset));
+               else
+                       added = bio_add_page(bio, page, real_size,
+                                       offset_in_page(offset));
+               /* Reached zoned boundary */
+               if (added == 0)
+                       submit = true;
+
+               cur_disk_bytenr += added;
+               /* Reached stripe boundary */
+               if (cur_disk_bytenr == next_stripe_start)
+                       submit = true;
+
+               /* Finished the range */
+               if (cur_disk_bytenr == disk_start + compressed_len)
+                       submit = true;
+
+               if (submit) {
                        if (!skip_sum) {
                                ret = btrfs_csum_one_bio(inode, bio, start, 1);
-                               BUG_ON(ret); /* -ENOMEM */
-                       }
-
-                       ret = btrfs_map_bio(fs_info, bio, 0);
-                       if (ret) {
-                               bio->bi_status = ret;
-                               bio_endio(bio);
+                               if (ret)
+                                       goto finish_cb;
                        }
 
-                       bio = btrfs_bio_alloc(first_byte);
-                       bio->bi_opf = bio_op | write_flags;
-                       bio->bi_private = cb;
-                       bio->bi_end_io = end_compressed_bio_write;
-                       if (blkcg_css)
-                               bio->bi_opf |= REQ_CGROUP_PUNT;
-                       /*
-                        * Use bio_add_page() to ensure the bio has at least one
-                        * page.
-                        */
-                       bio_add_page(bio, page, PAGE_SIZE, 0);
+                       ret = submit_compressed_bio(fs_info, cb, bio, 0);
+                       if (ret)
+                               goto finish_cb;
+                       bio = NULL;
                }
-               if (bytes_left < PAGE_SIZE) {
-                       btrfs_info(fs_info,
-                                       "bytes left %lu compress len %u nr %u",
-                              bytes_left, cb->compressed_len, cb->nr_pages);
-               }
-               bytes_left -= PAGE_SIZE;
-               first_byte += PAGE_SIZE;
                cond_resched();
        }
+       if (blkcg_css)
+               kthread_associate_blkcg(NULL);
 
-       ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-       BUG_ON(ret); /* -ENOMEM */
-
-       if (!skip_sum) {
-               ret = btrfs_csum_one_bio(inode, bio, start, 1);
-               BUG_ON(ret); /* -ENOMEM */
-       }
+       return 0;
 
-       ret = btrfs_map_bio(fs_info, bio, 0);
-       if (ret) {
+finish_cb:
+       if (bio) {
                bio->bi_status = ret;
                bio_endio(bio);
        }
+       /* Last byte of @cb is submitted, endio will free @cb */
+       if (cur_disk_bytenr == disk_start + compressed_len)
+               return ret;
 
-       if (blkcg_css)
-               kthread_associate_blkcg(NULL);
-
-       return 0;
+       wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+                          (disk_start + compressed_len - cur_disk_bytenr) >>
+                          fs_info->sectorsize_bits);
+       /*
+        * Even with the previous bio ended, we should still have IO not yet
+        * submitted, thus we need to finish @cb manually.
+        */
+       ASSERT(refcount_read(&cb->pending_sectors));
+       /* Now we are the only one referring to @cb, so we can finish it safely. */
+       finish_compressed_bio_write(cb);
+       return ret;
 }
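
The submit loop above bounds each bio_add_page() call by three limits at once. A standalone sketch of that clamping, assuming the same local variables as btrfs_submit_compressed_write() (illustrative only, not part of the patch):

    #include <linux/minmax.h>
    #include <linux/mm.h>

    /* Hypothetical helper mirroring the clamping done inside the loop. */
    static u32 example_real_size(u64 cur_disk_bytenr, u64 next_stripe_start,
                                 u64 offset, u64 compressed_len)
    {
            u64 real_size;

            /* never cross the stripe boundary from alloc_compressed_bio() */
            real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
            /* never cross the current page of compressed_pages[] */
            real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
            /* never run past the end of the compressed extent */
            real_size = min_t(u64, real_size, compressed_len - offset);

            return real_size;
    }
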
 
 static u64 bio_end_offset(struct bio *bio)
@@ -541,25 +636,33 @@ static u64 bio_end_offset(struct bio *bio)
        return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
 }
 
+/*
+ * Add extra pages in the same compressed file extent so that we don't need to
+ * re-read the same extent again and again.
+ *
+ * NOTE: this won't work well for subpage, as for a subpage read we lock the
+ * full page and then submit a bio for each compressed/regular extent.
+ *
+ * This means, if several sectors in the same page point to the same on-disk
+ * compressed data, we will re-read the same extent many times and this
+ * function can only help for the next page.
+ */
 static noinline int add_ra_bio_pages(struct inode *inode,
                                     u64 compressed_end,
                                     struct compressed_bio *cb)
 {
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        unsigned long end_index;
-       unsigned long pg_index;
-       u64 last_offset;
+       u64 cur = bio_end_offset(cb->orig_bio);
        u64 isize = i_size_read(inode);
        int ret;
        struct page *page;
-       unsigned long nr_pages = 0;
        struct extent_map *em;
        struct address_space *mapping = inode->i_mapping;
        struct extent_map_tree *em_tree;
        struct extent_io_tree *tree;
-       u64 end;
-       int misses = 0;
+       int sectors_missed = 0;
 
-       last_offset = bio_end_offset(cb->orig_bio);
        em_tree = &BTRFS_I(inode)->extent_tree;
        tree = &BTRFS_I(inode)->io_tree;
 
@@ -578,18 +681,29 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
        end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
-       while (last_offset < compressed_end) {
-               pg_index = last_offset >> PAGE_SHIFT;
+       while (cur < compressed_end) {
+               u64 page_end;
+               u64 pg_index = cur >> PAGE_SHIFT;
+               u32 add_size;
 
                if (pg_index > end_index)
                        break;
 
                page = xa_load(&mapping->i_pages, pg_index);
                if (page && !xa_is_value(page)) {
-                       misses++;
-                       if (misses > 4)
+                       sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
+                                         fs_info->sectorsize_bits;
+
+                       /* Beyond threshold, no need to continue */
+                       if (sectors_missed > 4)
                                break;
-                       goto next;
+
+                       /*
+                        * Jump to the next page start as we already have a
+                        * page for the current offset.
+                        */
+                       cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+                       continue;
                }
 
                page = __page_cache_alloc(mapping_gfp_constraint(mapping,
@@ -599,14 +713,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
                if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
                        put_page(page);
-                       goto next;
+                       /* There is already a page, skip to page end */
+                       cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
+                       continue;
                }
 
-               /*
-                * at this point, we have a locked page in the page cache
-                * for these bytes in the file.  But, we have to make
-                * sure they map to this compressed extent on disk.
-                */
                ret = set_page_extent_mapped(page);
                if (ret < 0) {
                        unlock_page(page);
@@ -614,18 +725,22 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                        break;
                }
 
-               end = last_offset + PAGE_SIZE - 1;
-               lock_extent(tree, last_offset, end);
+               page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
+               lock_extent(tree, cur, page_end);
                read_lock(&em_tree->lock);
-               em = lookup_extent_mapping(em_tree, last_offset,
-                                          PAGE_SIZE);
+               em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
                read_unlock(&em_tree->lock);
 
-               if (!em || last_offset < em->start ||
-                   (last_offset + PAGE_SIZE > extent_map_end(em)) ||
+               /*
+                * At this point, we have a locked page in the page cache for
+                * these bytes in the file.  But, we have to make sure they map
+                * to this compressed extent on disk.
+                */
+               if (!em || cur < em->start ||
+                   (cur + fs_info->sectorsize > extent_map_end(em)) ||
                    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
                        free_extent_map(em);
-                       unlock_extent(tree, last_offset, end);
+                       unlock_extent(tree, cur, page_end);
                        unlock_page(page);
                        put_page(page);
                        break;
@@ -643,20 +758,23 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                        }
                }
 
-               ret = bio_add_page(cb->orig_bio, page,
-                                  PAGE_SIZE, 0);
-
-               if (ret == PAGE_SIZE) {
-                       nr_pages++;
-                       put_page(page);
-               } else {
-                       unlock_extent(tree, last_offset, end);
+               add_size = min(em->start + em->len, page_end + 1) - cur;
+               ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
+               if (ret != add_size) {
+                       unlock_extent(tree, cur, page_end);
                        unlock_page(page);
                        put_page(page);
                        break;
                }
-next:
-               last_offset += PAGE_SIZE;
+               /*
+                * If it's subpage, we also need to increase its
+                * subpage::readers number, as at endio we will decrease
+                * subpage::readers and unlock the page.
+                */
+               if (fs_info->sectorsize < PAGE_SIZE)
+                       btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+               put_page(page);
+               cur += add_size;
        }
        return 0;
 }
@@ -681,9 +799,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        unsigned int compressed_len;
        unsigned int nr_pages;
        unsigned int pg_index;
-       struct page *page;
-       struct bio *comp_bio;
-       u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
+       struct bio *comp_bio = NULL;
+       const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+       u64 cur_disk_byte = disk_bytenr;
+       u64 next_stripe_start;
        u64 file_offset;
        u64 em_len;
        u64 em_start;
@@ -710,7 +829,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        if (!cb)
                goto out;
 
-       refcount_set(&cb->pending_bios, 0);
+       refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
        cb->errors = 0;
        cb->inode = inode;
        cb->mirror_num = mirror_num;
@@ -750,86 +869,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        /* include any pages we added in add_ra-bio_pages */
        cb->len = bio->bi_iter.bi_size;
 
-       comp_bio = btrfs_bio_alloc(cur_disk_byte);
-       comp_bio->bi_opf = REQ_OP_READ;
-       comp_bio->bi_private = cb;
-       comp_bio->bi_end_io = end_compressed_bio_read;
-       refcount_set(&cb->pending_bios, 1);
-
-       for (pg_index = 0; pg_index < nr_pages; pg_index++) {
-               u32 pg_len = PAGE_SIZE;
-               int submit = 0;
+       while (cur_disk_byte < disk_bytenr + compressed_len) {
+               u64 offset = cur_disk_byte - disk_bytenr;
+               unsigned int index = offset >> PAGE_SHIFT;
+               unsigned int real_size;
+               unsigned int added;
+               struct page *page = cb->compressed_pages[index];
+               bool submit = false;
+
+               /* Allocate new bio if submitted or not yet allocated */
+               if (!comp_bio) {
+                       comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
+                                       REQ_OP_READ, end_compressed_bio_read,
+                                       &next_stripe_start);
+                       if (IS_ERR(comp_bio)) {
+                               ret = errno_to_blk_status(PTR_ERR(comp_bio));
+                               comp_bio = NULL;
+                               goto finish_cb;
+                       }
+               }
+               /*
+                * We should never reach next_stripe_start as we will submit
+                * comp_bio immediately when reaching the boundary.
+                */
+               ASSERT(cur_disk_byte != next_stripe_start);
+               /*
+                * We have various limits on the real read size:
+                * - stripe boundary
+                * - page boundary
+                * - compressed length boundary
+                */
+               real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
+               real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
+               real_size = min_t(u64, real_size, compressed_len - offset);
+               ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));
 
+               added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
                /*
-                * To handle subpage case, we need to make sure the bio only
-                * covers the range we need.
-                *
-                * If we're at the last page, truncate the length to only cover
-                * the remaining part.
+                * The maximum compressed extent is smaller than the bio size
+                * limit, thus bio_add_page() should always succeed.
                 */
-               if (pg_index == nr_pages - 1)
-                       pg_len = min_t(u32, PAGE_SIZE,
-                                       compressed_len - pg_index * PAGE_SIZE);
+               ASSERT(added == real_size);
+               cur_disk_byte += added;
 
-               page = cb->compressed_pages[pg_index];
-               page->mapping = inode->i_mapping;
-               page->index = em_start >> PAGE_SHIFT;
+               /* Reached stripe boundary, need to submit */
+               if (cur_disk_byte == next_stripe_start)
+                       submit = true;
 
-               if (comp_bio->bi_iter.bi_size)
-                       submit = btrfs_bio_fits_in_stripe(page, pg_len,
-                                                         comp_bio, 0);
+               /* Finished the range, need to submit */
+               if (cur_disk_byte == disk_bytenr + compressed_len)
+                       submit = true;
 
-               page->mapping = NULL;
-               if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
+               if (submit) {
                        unsigned int nr_sectors;
 
-                       ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
-                                                 BTRFS_WQ_ENDIO_DATA);
-                       BUG_ON(ret); /* -ENOMEM */
-
-                       /*
-                        * inc the count before we submit the bio so
-                        * we know the end IO handler won't happen before
-                        * we inc the count.  Otherwise, the cb might get
-                        * freed before we're done setting it up
-                        */
-                       refcount_inc(&cb->pending_bios);
-
                        ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-                       BUG_ON(ret); /* -ENOMEM */
+                       if (ret)
+                               goto finish_cb;
 
                        nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
                                                  fs_info->sectorsize);
                        sums += fs_info->csum_size * nr_sectors;
 
-                       ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
-                       if (ret) {
-                               comp_bio->bi_status = ret;
-                               bio_endio(comp_bio);
-                       }
-
-                       comp_bio = btrfs_bio_alloc(cur_disk_byte);
-                       comp_bio->bi_opf = REQ_OP_READ;
-                       comp_bio->bi_private = cb;
-                       comp_bio->bi_end_io = end_compressed_bio_read;
-
-                       bio_add_page(comp_bio, page, pg_len, 0);
+                       ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
+                       if (ret)
+                               goto finish_cb;
+                       comp_bio = NULL;
                }
-               cur_disk_byte += pg_len;
        }
-
-       ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
-       BUG_ON(ret); /* -ENOMEM */
-
-       ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
-       BUG_ON(ret); /* -ENOMEM */
-
-       ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
-       if (ret) {
-               comp_bio->bi_status = ret;
-               bio_endio(comp_bio);
-       }
-
        return 0;
 
 fail2:
@@ -844,6 +951,26 @@ fail1:
 out:
        free_extent_map(em);
        return ret;
+finish_cb:
+       if (comp_bio) {
+               comp_bio->bi_status = ret;
+               bio_endio(comp_bio);
+       }
+       /* All bytes of @cb are submitted, endio will free @cb */
+       if (cur_disk_byte == disk_bytenr + compressed_len)
+               return ret;
+
+       wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
+                          (disk_bytenr + compressed_len - cur_disk_byte) >>
+                          fs_info->sectorsize_bits);
+       /*
+        * Even with the previous bio ended, we should still have IO not yet
+        * submitted, thus we need to finish @cb manually.
+        */
+       ASSERT(refcount_read(&cb->pending_sectors));
+       /* Now we are the only one referring to @cb, so we can finish it safely. */
+       finish_compressed_bio_read(cb, NULL);
+       return ret;
 }
 
 /*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 399be0b..56eef08 100644
@@ -28,8 +28,8 @@ struct btrfs_inode;
 #define        BTRFS_ZLIB_DEFAULT_LEVEL                3
 
 struct compressed_bio {
-       /* number of bios pending for this compressed extent */
-       refcount_t pending_bios;
+       /* Number of sectors with unfinished IO (unsubmitted or unfinished) */
+       refcount_t pending_sectors;
 
        /* Number of compressed pages in the array */
        unsigned int nr_pages;
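
For orientation (not part of the patch), the new counter works in sector units rather than per-bio. Excerpting the relevant lines from the compression.c changes above, with illustrative arithmetic assuming a 4K sector size (sectorsize_bits = 12):

    /* The submit paths initialize the count from the compressed length: */
    refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
    /* e.g. a 128K compressed extent starts with 128K >> 12 = 32 pending sectors */

    /* dec_and_test_compressed_bio() drops each finished bio's sectors: */
    last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
                                    &cb->pending_sectors);
    /* e.g. a 16K bio subtracts 16K >> 12 = 4; only the last bio returns true */
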
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 66290b2..c3983bd 100644
@@ -396,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        if (*cow_ret == buf)
                unlock_orig = 1;
 
-       btrfs_assert_tree_locked(buf);
+       btrfs_assert_tree_write_locked(buf);
 
        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
@@ -2488,7 +2488,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
        int ret;
 
        BUG_ON(!path->nodes[level]);
-       btrfs_assert_tree_locked(path->nodes[level]);
+       btrfs_assert_tree_write_locked(path->nodes[level]);
        lower = path->nodes[level];
        nritems = btrfs_header_nritems(lower);
        BUG_ON(slot > nritems);
@@ -2828,7 +2828,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        if (slot >= btrfs_header_nritems(upper) - 1)
                return 1;
 
-       btrfs_assert_tree_locked(path->nodes[1]);
+       btrfs_assert_tree_write_locked(path->nodes[1]);
 
        right = btrfs_read_node_slot(upper, slot + 1);
        /*
@@ -3066,7 +3066,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        if (right_nritems == 0)
                return 1;
 
-       btrfs_assert_tree_locked(path->nodes[1]);
+       btrfs_assert_tree_write_locked(path->nodes[1]);
 
        left = btrfs_read_node_slot(path->nodes[1], slot - 1);
        /*
@@ -3582,40 +3582,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 }
 
 /*
- * This function duplicate a item, giving 'new_key' to the new item.
- * It guarantees both items live in the same tree leaf and the new item
- * is contiguous with the original item.
- *
- * This allows us to split file extent in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root,
-                        struct btrfs_path *path,
-                        const struct btrfs_key *new_key)
-{
-       struct extent_buffer *leaf;
-       int ret;
-       u32 item_size;
-
-       leaf = path->nodes[0];
-       item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-       ret = setup_leaf_for_split(trans, root, path,
-                                  item_size + sizeof(struct btrfs_item));
-       if (ret)
-               return ret;
-
-       path->slots[0]++;
-       setup_items_for_insert(root, path, new_key, &item_size, 1);
-       leaf = path->nodes[0];
-       memcpy_extent_buffer(leaf,
-                            btrfs_item_ptr_offset(leaf, path->slots[0]),
-                            btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
-                            item_size);
-       return 0;
-}
-
-/*
  * make the item pointed to by the path smaller.  new_size indicates
  * how small to make it, and from_end tells us if we just chop bytes
  * off the end of the item or if we shift the item to chop bytes off
@@ -3786,13 +3752,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
  *
  * @root:      root we are inserting items to
  * @path:      points to the leaf/slot where we are going to insert new items
- * @cpu_key:   array of keys for items to be inserted
- * @data_size: size of the body of each item we are going to insert
- * @nr:                size of @cpu_key/@data_size arrays
+ * @batch:      information about the batch of items to insert
  */
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
-                           const struct btrfs_key *cpu_key, u32 *data_size,
-                           int nr)
+static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+                                  const struct btrfs_item_batch *batch)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_item *item;
@@ -3804,14 +3767,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
        int slot;
        struct btrfs_map_token token;
        u32 total_size;
-       u32 total_data = 0;
-
-       for (i = 0; i < nr; i++)
-               total_data += data_size[i];
-       total_size = total_data + (nr * sizeof(struct btrfs_item));
 
+       /*
+        * Before anything else, update keys in the parent and other ancestors
+        * if needed, then release the write locks on them, so that other tasks
+        * can use them while we modify the leaf.
+        */
        if (path->slots[0] == 0) {
-               btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+               btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
                fixup_low_keys(path, &disk_key, 1);
        }
        btrfs_unlock_up_safe(path, 1);
@@ -3821,6 +3784,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 
        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);
+       total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
 
        if (btrfs_leaf_free_space(leaf) < total_size) {
                btrfs_print_leaf(leaf);
@@ -3850,31 +3814,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
                        item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(&token, item);
                        btrfs_set_token_item_offset(&token, item,
-                                                   ioff - total_data);
+                                                   ioff - batch->total_data_size);
                }
                /* shift the items */
-               memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+               memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
                              btrfs_item_nr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_item));
 
                /* shift the data */
                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
-                             data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
-                             data_end, old_data - data_end);
+                                     data_end - batch->total_data_size,
+                                     BTRFS_LEAF_DATA_OFFSET + data_end,
+                                     old_data - data_end);
                data_end = old_data;
        }
 
        /* setup the item for the new data */
-       for (i = 0; i < nr; i++) {
-               btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+       for (i = 0; i < batch->nr; i++) {
+               btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                item = btrfs_item_nr(slot + i);
-               data_end -= data_size[i];
+               data_end -= batch->data_sizes[i];
                btrfs_set_token_item_offset(&token, item, data_end);
-               btrfs_set_token_item_size(&token, item, data_size[i]);
+               btrfs_set_token_item_size(&token, item, batch->data_sizes[i]);
        }
 
-       btrfs_set_header_nritems(leaf, nritems + nr);
+       btrfs_set_header_nritems(leaf, nritems + batch->nr);
        btrfs_mark_buffer_dirty(leaf);
 
        if (btrfs_leaf_free_space(leaf) < 0) {
@@ -3884,26 +3849,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 }
 
 /*
+ * Insert a new item into a leaf.
+ *
+ * @root:      The root of the btree.
+ * @path:      A path pointing to the target leaf and slot.
+ * @key:       The key of the new item.
+ * @data_size: The size of the data associated with the new key.
+ */
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                const struct btrfs_key *key,
+                                u32 data_size)
+{
+       struct btrfs_item_batch batch;
+
+       batch.keys = key;
+       batch.data_sizes = &data_size;
+       batch.total_data_size = data_size;
+       batch.nr = 1;
+
+       setup_items_for_insert(root, path, &batch);
+}
+
+/*
  * Given a key and some data, insert items into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_path *path,
-                           const struct btrfs_key *cpu_key, u32 *data_size,
-                           int nr)
+                           const struct btrfs_item_batch *batch)
 {
        int ret = 0;
        int slot;
-       int i;
-       u32 total_size = 0;
-       u32 total_data = 0;
-
-       for (i = 0; i < nr; i++)
-               total_data += data_size[i];
+       u32 total_size;
 
-       total_size = total_data + (nr * sizeof(struct btrfs_item));
-       ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+       total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
+       ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
        if (ret == 0)
                return -EEXIST;
        if (ret < 0)
@@ -3912,7 +3894,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
        slot = path->slots[0];
        BUG_ON(slot < 0);
 
-       setup_items_for_insert(root, path, cpu_key, data_size, nr);
+       setup_items_for_insert(root, path, batch);
        return 0;
 }
 
@@ -3944,6 +3926,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 }
 
 /*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item is
+ * contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the leaf
+ * the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_path *path,
+                        const struct btrfs_key *new_key)
+{
+       struct extent_buffer *leaf;
+       int ret;
+       u32 item_size;
+
+       leaf = path->nodes[0];
+       item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+       ret = setup_leaf_for_split(trans, root, path,
+                                  item_size + sizeof(struct btrfs_item));
+       if (ret)
+               return ret;
+
+       path->slots[0]++;
+       btrfs_setup_item_for_insert(root, path, new_key, item_size);
+       leaf = path->nodes[0];
+       memcpy_extent_buffer(leaf,
+                            btrfs_item_ptr_offset(leaf, path->slots[0]),
+                            btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+                            item_size);
+       return 0;
+}
+
+/*
  * delete the pointer from a given node.
  *
  * the tree should have been previously balanced so the deletion does not
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c0cebcf..7553e9d 100644
@@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep;
 extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
 struct btrfs_ordered_sum;
 struct btrfs_ref;
+struct btrfs_bio;
 
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
@@ -217,6 +218,9 @@ struct btrfs_root_backup {
        u8 unused_8[10];
 } __attribute__ ((__packed__));
 
+#define BTRFS_SUPER_INFO_OFFSET                        SZ_64K
+#define BTRFS_SUPER_INFO_SIZE                  4096
+
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -269,7 +273,11 @@ struct btrfs_super_block {
        __le64 reserved[28];
        u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
        struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+       /* Padded to 4096 bytes */
+       u8 padding[565];
 } __attribute__ ((__packed__));
+static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
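
As a quick sanity check on the numbers (not from the patch): with BTRFS_SUPER_INFO_SIZE = 4096 and 565 bytes of trailing padding, the static_assert above implies the preceding fields of the packed struct btrfs_super_block occupy exactly 4096 - 565 = 3531 bytes.
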
 
 /*
  * Compat flags that we support.  If any incompat flags are set other than the
@@ -899,6 +907,7 @@ struct btrfs_fs_info {
        struct btrfs_workqueue *scrub_workers;
        struct btrfs_workqueue *scrub_wr_completion_workers;
        struct btrfs_workqueue *scrub_parity_workers;
+       struct btrfs_subpage_info *subpage_info;
 
        struct btrfs_discard_ctl discard_ctl;
 
@@ -1017,6 +1026,16 @@ struct btrfs_fs_info {
        spinlock_t treelog_bg_lock;
        u64 treelog_bg;
 
+       /*
+        * Start of the dedicated data relocation block group, protected by
+        * relocation_bg_lock.
+        */
+       spinlock_t relocation_bg_lock;
+       u64 data_reloc_bg;
+
+       spinlock_t zone_active_bgs_lock;
+       struct list_head zone_active_bgs;
+
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
        spinlock_t ref_verify_lock;
        struct rb_root block_tree;
@@ -2885,16 +2904,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
        return btrfs_del_items(trans, root, path, path->slots[0], 1);
 }
 
-void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
-                           const struct btrfs_key *cpu_key, u32 *data_size,
-                           int nr);
+/*
+ * Describes a batch of items to insert in a btree. This is used by
+ * btrfs_insert_empty_items().
+ */
+struct btrfs_item_batch {
+       /*
+        * Pointer to an array containing the keys of the items to insert (in
+        * sorted order).
+        */
+       const struct btrfs_key *keys;
+       /* Pointer to an array containing the data size for each item to insert. */
+       const u32 *data_sizes;
+       /*
+        * The sum of data sizes for all items. The caller can compute this while
+        * setting up the data_sizes array, so it ends up being more efficient
+        * than having btrfs_insert_empty_items() or setup_item_for_insert()
+        * doing it, as it would avoid an extra loop over a potentially large
+        * array, and in the case of setup_item_for_insert(), we would be doing
+        * it while holding a write lock on a leaf and often on upper level nodes
+        * too, unnecessarily increasing the size of a critical section.
+        */
+       u32 total_data_size;
+       /* Size of the keys and data_sizes arrays (number of items in the batch). */
+       int nr;
+};
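/*
 * A minimal usage sketch (editorial, not part of the patch) for a two-item
 * batch; 'keys', 'sizes' and the two size values are illustrative only:
 *
 *	struct btrfs_item_batch batch;
 *	struct btrfs_key keys[2];	(filled by the caller, in sorted order)
 *	u32 sizes[2] = { first_size, second_size };
 *
 *	batch.keys = keys;
 *	batch.data_sizes = sizes;
 *	batch.total_data_size = sizes[0] + sizes[1];
 *	batch.nr = 2;
 *	ret = btrfs_insert_empty_items(trans, root, path, &batch);
 */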
+
+void btrfs_setup_item_for_insert(struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                const struct btrfs_key *key,
+                                u32 data_size);
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *key, void *data, u32 data_size);
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path,
-                            const struct btrfs_key *cpu_key, u32 *data_size,
-                            int nr);
+                            const struct btrfs_item_batch *batch);
 
 static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
@@ -2902,7 +2947,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
                                          const struct btrfs_key *key,
                                          u32 data_size)
 {
-       return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+       struct btrfs_item_batch batch;
+
+       batch.keys = key;
+       batch.data_sizes = &data_size;
+       batch.total_data_size = data_size;
+       batch.nr = 1;
+
+       return btrfs_insert_empty_items(trans, root, path, &batch);
 }
 
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
@@ -3129,8 +3181,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
 /* inode.c */
 blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
                                   int mirror_num, unsigned long bio_flags);
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
-                                   struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+                                   u32 bio_offset, struct page *page,
+                                   u64 start, u64 end);
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
                                           u64 start, u64 len);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3142,7 +3195,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root,
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-                      struct btrfs_root *root,
                       struct btrfs_inode *dir, struct btrfs_inode *inode,
                       const char *name, int name_len);
 int btrfs_add_link(struct btrfs_trans_handle *trans,
@@ -3174,8 +3226,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
                                 struct extent_state *other);
 void btrfs_split_delalloc_extent(struct inode *inode,
                                 struct extent_state *orig, u64 split);
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
-                            unsigned long bio_flags);
 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
@@ -3242,9 +3292,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns,
 int btrfs_ioctl_get_supported_features(void __user *arg);
 void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int __pure btrfs_is_empty_uuid(u8 *uuid);
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
                      struct btrfs_ioctl_defrag_range_args *range,
-                     u64 newer_than, unsigned long max_pages);
+                     u64 newer_than, unsigned long max_to_defrag);
 void btrfs_get_block_group_info(struct list_head *groups_list,
                                struct btrfs_ioctl_space_info *space);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
@@ -3563,6 +3613,9 @@ do {                                                              \
                          (errno), fmt, ##args);                \
 } while (0)
 
+#define BTRFS_FS_ERROR(fs_info)        (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
+                                                  &(fs_info)->fs_state)))
+
 __printf(5, 6)
 __cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
@@ -3842,6 +3895,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
        return fs_info->zoned != 0;
 }
 
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
+{
+       return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
+}
+
 /*
  * We use page status Private2 to indicate there is an ordered extent with
  * unfinished IO.
index 1e08eb2..e164766 100644 (file)
@@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                                     struct btrfs_path *path,
                                     struct btrfs_delayed_item *first_item)
 {
-       LIST_HEAD(batch);
+       LIST_HEAD(item_list);
        struct btrfs_delayed_item *curr;
        struct btrfs_delayed_item *next;
        const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
+       struct btrfs_item_batch batch;
        int total_size;
-       int nitems;
        char *ins_data = NULL;
-       struct btrfs_key *ins_keys;
-       u32 *ins_sizes;
        int ret;
 
-       list_add_tail(&first_item->tree_list, &batch);
-       nitems = 1;
+       list_add_tail(&first_item->tree_list, &item_list);
+       batch.total_data_size = first_item->data_len;
+       batch.nr = 1;
        total_size = first_item->data_len + sizeof(struct btrfs_item);
        curr = first_item;
 
@@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                if (total_size + next_size > max_size)
                        break;
 
-               list_add_tail(&next->tree_list, &batch);
-               nitems++;
+               list_add_tail(&next->tree_list, &item_list);
+               batch.nr++;
                total_size += next_size;
+               batch.total_data_size += next->data_len;
                curr = next;
        }
 
-       if (nitems == 1) {
-               ins_keys = &first_item->key;
-               ins_sizes = &first_item->data_len;
+       if (batch.nr == 1) {
+               batch.keys = &first_item->key;
+               batch.data_sizes = &first_item->data_len;
        } else {
+               struct btrfs_key *ins_keys;
+               u32 *ins_sizes;
                int i = 0;
 
-               ins_data = kmalloc(nitems * sizeof(u32) +
-                                  nitems * sizeof(struct btrfs_key), GFP_NOFS);
+               ins_data = kmalloc(batch.nr * sizeof(u32) +
+                                  batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
                if (!ins_data) {
                        ret = -ENOMEM;
                        goto out;
                }
                ins_sizes = (u32 *)ins_data;
-               ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32));
-               list_for_each_entry(curr, &batch, tree_list) {
+               ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
+               batch.keys = ins_keys;
+               batch.data_sizes = ins_sizes;
+               list_for_each_entry(curr, &item_list, tree_list) {
                        ins_keys[i] = curr->key;
                        ins_sizes[i] = curr->data_len;
                        i++;
                }
        }
 
-       ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
-                                      nitems);
+       ret = btrfs_insert_empty_items(trans, root, path, &batch);
        if (ret)
                goto out;
 
-       list_for_each_entry(curr, &batch, tree_list) {
+       list_for_each_entry(curr, &item_list, tree_list) {
                char *data_ptr;
 
                data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
@@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
         */
        btrfs_release_path(path);
 
-       list_for_each_entry_safe(curr, next, &batch, tree_list) {
+       list_for_each_entry_safe(curr, next, &item_list, tree_list) {
                list_del(&curr->tree_list);
                btrfs_delayed_item_release_metadata(root, curr);
                btrfs_release_delayed_item(curr);
index ca848b1..cca7e85 100644 (file)
@@ -906,7 +906,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
        u64 parent = generic_ref->parent;
        u8 ref_type;
 
-       is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID);
+       is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
 
        ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
        BUG_ON(extent_op && extent_op->is_data);
@@ -921,8 +921,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
        }
 
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-           is_fstree(generic_ref->real_root) &&
-           is_fstree(generic_ref->tree_ref.root) &&
            !generic_ref->skip_qgroup) {
                record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
@@ -938,14 +936,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
                ref_type = BTRFS_TREE_BLOCK_REF_KEY;
 
        init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
-                               generic_ref->tree_ref.root, action, ref_type);
-       ref->root = generic_ref->tree_ref.root;
+                               generic_ref->tree_ref.owning_root, action,
+                               ref_type);
+       ref->root = generic_ref->tree_ref.owning_root;
        ref->parent = parent;
        ref->level = level;
 
        init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
-                             generic_ref->tree_ref.root, 0, action, false,
-                             is_system);
+                             generic_ref->tree_ref.owning_root, 0, action,
+                             false, is_system);
        head_ref->extent_op = extent_op;
 
        delayed_refs = &trans->transaction->delayed_refs;
@@ -997,7 +996,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
        u64 bytenr = generic_ref->bytenr;
        u64 num_bytes = generic_ref->len;
        u64 parent = generic_ref->parent;
-       u64 ref_root = generic_ref->data_ref.ref_root;
+       u64 ref_root = generic_ref->data_ref.owning_root;
        u64 owner = generic_ref->data_ref.ino;
        u64 offset = generic_ref->data_ref.offset;
        u8 ref_type;
@@ -1026,8 +1025,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
        }
 
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
-           is_fstree(ref_root) &&
-           is_fstree(generic_ref->real_root) &&
            !generic_ref->skip_qgroup) {
                record = kzalloc(sizeof(*record), GFP_NOFS);
                if (!record) {
index e22fba2..91a3aab 100644 (file)
@@ -186,8 +186,8 @@ enum btrfs_ref_type {
 struct btrfs_data_ref {
        /* For EXTENT_DATA_REF */
 
-       /* Root which refers to this data extent */
-       u64 ref_root;
+       /* Original root this data extent belongs to */
+       u64 owning_root;
 
        /* Inode which refers to this data extent */
        u64 ino;
@@ -210,11 +210,11 @@ struct btrfs_tree_ref {
        int level;
 
        /*
-        * Root which refers to this tree block.
+        * Root which owns this tree block.
         *
         * For TREE_BLOCK_REF (skinny metadata, either inline or keyed)
         */
-       u64 root;
+       u64 owning_root;
 
        /* For non-skinny metadata, no special member needed */
 };
@@ -231,17 +231,10 @@ struct btrfs_ref {
         */
        bool skip_qgroup;
 
-       /*
-        * Optional. For which root is this modification.
-        * Mostly used for qgroup optimization.
-        *
-        * When unset, data/tree ref init code will populate it.
-        * In certain cases, we're modifying reference for a different root.
-        * E.g. COW fs tree blocks for balance.
-        * In that case, tree_ref::root will be fs tree, but we're doing this
-        * for reloc tree, then we should set @real_root to reloc tree.
-        */
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
+       /* Root through which this modification is being made. */
        u64 real_root;
+#endif
        u64 bytenr;
        u64 len;
 
@@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref,
 }
 
 static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref,
-                               int level, u64 root)
+                               int level, u64 root, u64 mod_root, bool skip_qgroup)
 {
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        /* If @real_root not set, use @root as fallback */
-       if (!generic_ref->real_root)
-               generic_ref->real_root = root;
+       generic_ref->real_root = mod_root ?: root;
+#endif
        generic_ref->tree_ref.level = level;
-       generic_ref->tree_ref.root = root;
+       generic_ref->tree_ref.owning_root = root;
        generic_ref->type = BTRFS_REF_METADATA;
+       if (skip_qgroup || !(is_fstree(root) &&
+                            (!mod_root || is_fstree(mod_root))))
+               generic_ref->skip_qgroup = true;
+       else
+               generic_ref->skip_qgroup = false;
 }
 
 static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref,
-                               u64 ref_root, u64 ino, u64 offset)
+                               u64 ref_root, u64 ino, u64 offset, u64 mod_root,
+                               bool skip_qgroup)
 {
+#ifdef CONFIG_BTRFS_FS_REF_VERIFY
        /* If @real_root not set, use @root as fallback */
-       if (!generic_ref->real_root)
-               generic_ref->real_root = ref_root;
-       generic_ref->data_ref.ref_root = ref_root;
+       generic_ref->real_root = mod_root ?: ref_root;
+#endif
+       generic_ref->data_ref.owning_root = ref_root;
        generic_ref->data_ref.ino = ino;
        generic_ref->data_ref.offset = offset;
        generic_ref->type = BTRFS_REF_DATA;
+       if (skip_qgroup || !(is_fstree(ref_root) &&
+                            (!mod_root || is_fstree(mod_root))))
+               generic_ref->skip_qgroup = true;
+       else
+               generic_ref->skip_qgroup = false;
 }
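/*
 * Editorial note: in both helpers above the skip_qgroup computation is
 * equivalent to the single expression below (shown only for clarity, using
 * the data ref names); qgroups are only accounted when the owning root is an
 * fs tree and, if a different root drives the modification, that one is too:
 *
 *	generic_ref->skip_qgroup = skip_qgroup || !is_fstree(ref_root) ||
 *				   (mod_root && !is_fstree(mod_root));
 */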
 
 static inline struct btrfs_delayed_extent_op *
index fbb8b44..c85a7d4 100644 (file)
@@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data);
 
 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
 {
+       struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
        struct btrfs_key key;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
@@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found:
                 * We don't have a replace item or it's corrupted.  If there is
                 * a replace target, fail the mount.
                 */
-               if (btrfs_find_device(fs_info->fs_devices,
-                                     BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+               if (btrfs_find_device(fs_info->fs_devices, &args)) {
                        btrfs_err(fs_info,
                        "found replace target device without a valid replace item");
                        ret = -EUCLEAN;
@@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found:
                 * We don't have an active replace item but if there is a
                 * replace target, fail the mount.
                 */
-               if (btrfs_find_device(fs_info->fs_devices,
-                                     BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
+               if (btrfs_find_device(fs_info->fs_devices, &args)) {
                        btrfs_err(fs_info,
                        "replace devid present without an active replace item");
                        ret = -EUCLEAN;
@@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found:
                break;
        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
-               dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
-                                               src_devid, NULL, NULL);
-               dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
-                                                       BTRFS_DEV_REPLACE_DEVID,
-                                                       NULL, NULL);
+               dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
+               args.devid = src_devid;
+               dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
+
                /*
                 * allow 'btrfs dev replace_cancel' if src/tgt device is
                 * missing
index 29e7598..59c3be8 100644 (file)
@@ -683,7 +683,7 @@ err:
        return ret;
 }
 
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
                                   struct page *page, u64 start, u64 end,
                                   int mirror)
 {
@@ -1036,7 +1036,7 @@ static int btree_set_page_dirty(struct page *page)
                BUG_ON(!eb);
                BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                BUG_ON(!atomic_read(&eb->refs));
-               btrfs_assert_tree_locked(eb);
+               btrfs_assert_tree_write_locked(eb);
                return __set_page_dirty_nobuffers(page);
        }
        ASSERT(PagePrivate(page) && page->private);
@@ -1061,7 +1061,7 @@ static int btree_set_page_dirty(struct page *page)
                ASSERT(eb);
                ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                ASSERT(atomic_read(&eb->refs));
-               btrfs_assert_tree_locked(eb);
+               btrfs_assert_tree_write_locked(eb);
                free_extent_buffer(eb);
 
                cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
@@ -1125,7 +1125,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
        struct btrfs_fs_info *fs_info = buf->fs_info;
        if (btrfs_header_generation(buf) ==
            fs_info->running_transaction->transid) {
-               btrfs_assert_tree_locked(buf);
+               btrfs_assert_tree_write_locked(buf);
 
                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
                        percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
@@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
                goto fail;
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
-           root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+           !btrfs_is_data_reloc_root(root)) {
                set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
                btrfs_check_and_init_root_item(&root->root_item);
        }
@@ -1644,6 +1644,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
        btrfs_extent_buffer_leak_debug_check(fs_info);
        kfree(fs_info->super_copy);
        kfree(fs_info->super_for_commit);
+       kfree(fs_info->subpage_info);
        kvfree(fs_info);
 }
 
@@ -1953,8 +1954,7 @@ sleep:
                wake_up_process(fs_info->cleaner_kthread);
                mutex_unlock(&fs_info->transaction_kthread_mutex);
 
-               if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
-                                     &fs_info->fs_state)))
+               if (BTRFS_FS_ERROR(fs_info))
                        btrfs_cleanup_transaction(fs_info);
                if (!kthread_should_stop() &&
                                (!btrfs_transaction_blocked(fs_info) ||
@@ -2592,8 +2592,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
 
        /*
         * For 4K page size, we only support 4K sector size.
-        * For 64K page size, we support read-write for 64K sector size, and
-        * read-only for 4K sector size.
+        * For 64K page size, we support 64K and 4K sector sizes.
         */
        if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
            (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
@@ -2883,6 +2882,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
        spin_lock_init(&fs_info->buffer_lock);
        spin_lock_init(&fs_info->unused_bgs_lock);
        spin_lock_init(&fs_info->treelog_bg_lock);
+       spin_lock_init(&fs_info->zone_active_bgs_lock);
+       spin_lock_init(&fs_info->relocation_bg_lock);
        rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->unused_bg_unpin_mutex);
        mutex_init(&fs_info->reclaim_bgs_lock);
@@ -2896,6 +2897,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        INIT_LIST_HEAD(&fs_info->unused_bgs);
        INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+       INIT_LIST_HEAD(&fs_info->zone_active_bgs);
 #ifdef CONFIG_BTRFS_DEBUG
        INIT_LIST_HEAD(&fs_info->allocated_roots);
        INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -3228,12 +3230,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
        mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
        btrfs_init_btree_inode(fs_info);
 
-       invalidate_bdev(fs_devices->latest_bdev);
+       invalidate_bdev(fs_devices->latest_dev->bdev);
 
        /*
         * Read super block and check the signature bytes only
         */
-       disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
+       disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
        if (IS_ERR(disk_super)) {
                err = PTR_ERR(disk_super);
                goto fail_alloc;
@@ -3392,12 +3394,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
                goto fail_alloc;
        }
 
-       if (sectorsize != PAGE_SIZE) {
+       if (sectorsize < PAGE_SIZE) {
+               struct btrfs_subpage_info *subpage_info;
+
                btrfs_warn(fs_info,
                "read-write for sector size %u with page size %lu is experimental",
                           sectorsize, PAGE_SIZE);
-       }
-       if (sectorsize != PAGE_SIZE) {
                if (btrfs_super_incompat_flags(fs_info->super_copy) &
                        BTRFS_FEATURE_INCOMPAT_RAID56) {
                        btrfs_err(fs_info,
@@ -3406,6 +3408,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
                        err = -EINVAL;
                        goto fail_alloc;
                }
+               subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
+               if (!subpage_info)
+                       goto fail_alloc;
+               btrfs_init_subpage_info(subpage_info, sectorsize);
+               fs_info->subpage_info = subpage_info;
        }
 
        ret = btrfs_init_workqueues(fs_info, fs_devices);
@@ -3465,7 +3472,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
         * below in btrfs_init_dev_replace().
         */
        btrfs_free_extra_devids(fs_devices);
-       if (!fs_devices->latest_bdev) {
+       if (!fs_devices->latest_dev->bdev) {
                btrfs_err(fs_info, "failed to read devices");
                goto fail_tree_roots;
        }
@@ -3556,7 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
                goto fail_sysfs;
        }
 
-       if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+       if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+           !btrfs_check_rw_degradable(fs_info, NULL)) {
                btrfs_warn(fs_info,
                "writable mount is not allowed due to too many missing devices");
                goto fail_sysfs;
@@ -3881,7 +3889,9 @@ static int write_dev_supers(struct btrfs_device *device,
                        bio->bi_opf |= REQ_FUA;
 
                btrfsic_submit_bio(bio);
-               btrfs_advance_sb_log(device, i);
+
+               if (btrfs_advance_sb_log(device, i))
+                       errors++;
        }
        return errors < i ? 0 : -1;
 }
@@ -4221,7 +4231,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
                drop_ref = true;
        spin_unlock(&fs_info->fs_roots_radix_lock);
 
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+       if (BTRFS_FS_ERROR(fs_info)) {
                ASSERT(root->log_root == NULL);
                if (root->reloc_root) {
                        btrfs_put_root(root->reloc_root);
@@ -4372,8 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
                        btrfs_err(fs_info, "commit super ret %d", ret);
        }
 
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
-           test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                btrfs_error_commit_super(fs_info);
 
        kthread_stop(fs_info->transaction_kthread);
@@ -4470,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
        if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
                return;
 #endif
-       btrfs_assert_tree_locked(buf);
+       btrfs_assert_tree_write_locked(buf);
        if (transid != fs_info->generation)
                WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
                        buf->start, transid, fs_info->generation);
index 0e7e952..a2b5db4 100644 (file)
@@ -6,9 +6,6 @@
 #ifndef BTRFS_DISK_IO_H
 #define BTRFS_DISK_IO_H
 
-#define BTRFS_SUPER_INFO_OFFSET SZ_64K
-#define BTRFS_SUPER_INFO_SIZE 4096
-
 #define BTRFS_SUPER_MIRROR_MAX  3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
@@ -81,7 +78,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
 void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
                                 struct btrfs_root *root);
-int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
+int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
                                   struct page *page, u64 start, u64 end,
                                   int mirror);
 blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
index 0ab456c..3fd736a 100644 (file)
@@ -1266,7 +1266,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
        return ret;
 }
 
-static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
+static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
 {
        struct btrfs_device *dev = stripe->dev;
        struct btrfs_fs_info *fs_info = dev->fs_info;
@@ -1313,22 +1313,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
        u64 discarded_bytes = 0;
        u64 end = bytenr + num_bytes;
        u64 cur = bytenr;
-       struct btrfs_bio *bbio = NULL;
-
+       struct btrfs_io_context *bioc = NULL;
 
        /*
-        * Avoid races with device replace and make sure our bbio has devices
+        * Avoid races with device replace and make sure our bioc has devices
         * associated to its stripes that don't go away while we are discarding.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        while (cur < end) {
-               struct btrfs_bio_stripe *stripe;
+               struct btrfs_io_stripe *stripe;
                int i;
 
                num_bytes = end - cur;
                /* Tell the block device(s) that the sectors can be discarded */
                ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
-                                     &num_bytes, &bbio, 0);
+                                     &num_bytes, &bioc, 0);
                /*
                 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
                 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
@@ -1337,8 +1336,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                if (ret < 0)
                        goto out;
 
-               stripe = bbio->stripes;
-               for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+               stripe = bioc->stripes;
+               for (i = 0; i < bioc->num_stripes; i++, stripe++) {
                        u64 bytes;
                        struct btrfs_device *device = stripe->dev;
 
@@ -1361,7 +1360,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                                 * And since there are two loops, explicitly
                                 * go to out to avoid confusion.
                                 */
-                               btrfs_put_bbio(bbio);
+                               btrfs_put_bioc(bioc);
                                goto out;
                        }
 
@@ -1372,7 +1371,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
                         */
                        ret = 0;
                }
-               btrfs_put_bbio(bbio);
+               btrfs_put_bioc(bioc);
                cur += num_bytes;
        }
 out:
@@ -1397,7 +1396,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
               generic_ref->action);
        BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
-              generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
+              generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
 
        if (generic_ref->type == BTRFS_REF_METADATA)
                ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
@@ -2376,7 +2375,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
 
 out:
        btrfs_free_path(path);
-       if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+       if (btrfs_is_data_reloc_root(root))
                WARN_ON(ret > 0);
        return ret;
 }
@@ -2438,10 +2437,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        btrfs_init_generic_ref(&generic_ref, action, bytenr,
                                               num_bytes, parent);
-                       generic_ref.real_root = root->root_key.objectid;
                        btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
-                                           key.offset);
-                       generic_ref.skip_qgroup = for_reloc;
+                                           key.offset, root->root_key.objectid,
+                                           for_reloc);
                        if (inc)
                                ret = btrfs_inc_extent_ref(trans, &generic_ref);
                        else
@@ -2453,9 +2451,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        num_bytes = fs_info->nodesize;
                        btrfs_init_generic_ref(&generic_ref, action, bytenr,
                                               num_bytes, parent);
-                       generic_ref.real_root = root->root_key.objectid;
-                       btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
-                       generic_ref.skip_qgroup = for_reloc;
+                       btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
+                                           root->root_key.objectid, for_reloc);
                        if (inc)
                                ret = btrfs_inc_extent_ref(trans, &generic_ref);
                        else
@@ -3196,7 +3193,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        goto out;
                }
 
-               ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
+               ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
@@ -3289,7 +3286,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
        btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
                               buf->start, buf->len, parent);
        btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
-                           root->root_key.objectid);
+                           root->root_key.objectid, 0, false);
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                btrfs_ref_tree_mod(fs_info, &generic_ref);
@@ -3373,9 +3370,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
         * tree, just update pinning info and exit early.
         */
        if ((ref->type == BTRFS_REF_METADATA &&
-            ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+            ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
            (ref->type == BTRFS_REF_DATA &&
-            ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
+            ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
                /* unlocks the pinned mutex */
                btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
                ret = 0;
@@ -3386,9 +3383,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
        }
 
        if (!((ref->type == BTRFS_REF_METADATA &&
-              ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+              ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
              (ref->type == BTRFS_REF_DATA &&
-              ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+              ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
                btrfs_ref_tree_mod(fs_info, ref);
 
        return ret;
@@ -3476,7 +3473,9 @@ enum btrfs_extent_allocation_policy {
  */
 struct find_free_extent_ctl {
        /* Basic allocation info */
+       u64 ram_bytes;
        u64 num_bytes;
+       u64 min_alloc_size;
        u64 empty_size;
        u64 flags;
        int delalloc;
@@ -3495,6 +3494,9 @@ struct find_free_extent_ctl {
        /* Allocation is called for tree-log */
        bool for_treelog;
 
+       /* Allocation is called for data relocation */
+       bool for_data_reloc;
+
        /* RAID index, converted from flags */
        int index;
 
@@ -3756,8 +3758,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
        u64 avail;
        u64 bytenr = block_group->start;
        u64 log_bytenr;
+       u64 data_reloc_bytenr;
        int ret = 0;
-       bool skip;
+       bool skip = false;
 
        ASSERT(btrfs_is_zoned(block_group->fs_info));
 
@@ -3767,19 +3770,49 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
         */
        spin_lock(&fs_info->treelog_bg_lock);
        log_bytenr = fs_info->treelog_bg;
-       skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
-                             (!ffe_ctl->for_treelog && bytenr == log_bytenr));
+       if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
+                          (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
+               skip = true;
        spin_unlock(&fs_info->treelog_bg_lock);
        if (skip)
                return 1;
 
+       /*
+        * Do not allow non-relocation blocks in the dedicated relocation block
+        * group, and vice versa.
+        */
+       spin_lock(&fs_info->relocation_bg_lock);
+       data_reloc_bytenr = fs_info->data_reloc_bg;
+       if (data_reloc_bytenr &&
+           ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
+            (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
+               skip = true;
+       spin_unlock(&fs_info->relocation_bg_lock);
+       if (skip)
+               return 1;
+       /* Check RO and no space case before trying to activate it */
+       spin_lock(&block_group->lock);
+       if (block_group->ro ||
+           block_group->alloc_offset == block_group->zone_capacity) {
+               spin_unlock(&block_group->lock);
+               return 1;
+       }
+       spin_unlock(&block_group->lock);
+
+       if (!btrfs_zone_activate(block_group))
+               return 1;
+
        spin_lock(&space_info->lock);
        spin_lock(&block_group->lock);
        spin_lock(&fs_info->treelog_bg_lock);
+       spin_lock(&fs_info->relocation_bg_lock);
 
        ASSERT(!ffe_ctl->for_treelog ||
               block_group->start == fs_info->treelog_bg ||
               fs_info->treelog_bg == 0);
+       ASSERT(!ffe_ctl->for_data_reloc ||
+              block_group->start == fs_info->data_reloc_bg ||
+              fs_info->data_reloc_bg == 0);
 
        if (block_group->ro) {
                ret = 1;
@@ -3796,7 +3829,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
                goto out;
        }
 
-       avail = block_group->length - block_group->alloc_offset;
+       /*
+        * Do not allow currently used block group to be the data relocation
+        * dedicated block group.
+        */
+       if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+           (block_group->used || block_group->reserved)) {
+               ret = 1;
+               goto out;
+       }
+
+       WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
+       avail = block_group->zone_capacity - block_group->alloc_offset;
        if (avail < num_bytes) {
                if (ffe_ctl->max_extent_size < avail) {
                        /*
@@ -3813,6 +3857,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
        if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
                fs_info->treelog_bg = block_group->start;
 
+       if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
+               fs_info->data_reloc_bg = block_group->start;
+
        ffe_ctl->found_offset = start + block_group->alloc_offset;
        block_group->alloc_offset += num_bytes;
        spin_lock(&ctl->tree_lock);
@@ -3829,6 +3876,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
 out:
        if (ret && ffe_ctl->for_treelog)
                fs_info->treelog_bg = 0;
+       if (ret && ffe_ctl->for_data_reloc)
+               fs_info->data_reloc_bg = 0;
+       spin_unlock(&fs_info->relocation_bg_lock);
        spin_unlock(&fs_info->treelog_bg_lock);
        spin_unlock(&block_group->lock);
        spin_unlock(&space_info->lock);
@@ -3932,18 +3982,30 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
            ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
                ffe_ctl->orig_have_caching_bg = true;
 
-       if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
-           ffe_ctl->have_caching_bg)
-               return 1;
-
-       if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
-               return 1;
-
        if (ins->objectid) {
                found_extent(ffe_ctl, ins);
                return 0;
        }
 
+       if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+           !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
+               /*
+                * If we have enough free space left in an already active block
+                * group and we can't activate any other zone now, retry the
+                * active ones with a smaller allocation size.  Returning early
+                * from here will tell btrfs_reserve_extent() to halve the
+                * size.
+                */
+               return -ENOSPC;
+       }
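/*
 * For context (a simplified editorial sketch of btrfs_reserve_extent()'s
 * existing -ENOSPC handling, which is not part of this hunk): unless the
 * minimum size was already tried, the caller shrinks the request and loops,
 * roughly:
 *
 *	num_bytes = min(num_bytes >> 1, ins->offset);
 *	num_bytes = round_down(num_bytes, fs_info->sectorsize);
 *	num_bytes = max(num_bytes, min_alloc_size);
 *	goto again;
 *
 * so returning -ENOSPC here, while max_extent_size is still at least
 * min_alloc_size, retries the already active zones with a halved size.
 */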
+
+       if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+               return 1;
+
+       ffe_ctl->index++;
+       if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
+               return 1;
+
        /*
         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
         *                      caching kthreads as we move along
@@ -4085,6 +4147,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
                                ffe_ctl->hint_byte = fs_info->treelog_bg;
                        spin_unlock(&fs_info->treelog_bg_lock);
                }
+               if (ffe_ctl->for_data_reloc) {
+                       spin_lock(&fs_info->relocation_bg_lock);
+                       if (fs_info->data_reloc_bg)
+                               ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+                       spin_unlock(&fs_info->relocation_bg_lock);
+               }
                return 0;
        default:
                BUG();
@@ -4117,65 +4185,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
  *    |- If not found, re-iterate all block groups
  */
 static noinline int find_free_extent(struct btrfs_root *root,
-                               u64 ram_bytes, u64 num_bytes, u64 empty_size,
-                               u64 hint_byte_orig, struct btrfs_key *ins,
-                               u64 flags, int delalloc)
+                                    struct btrfs_key *ins,
+                                    struct find_free_extent_ctl *ffe_ctl)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;
        int cache_block_group_error = 0;
        struct btrfs_block_group *block_group = NULL;
-       struct find_free_extent_ctl ffe_ctl = {0};
        struct btrfs_space_info *space_info;
        bool full_search = false;
-       bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
 
-       WARN_ON(num_bytes < fs_info->sectorsize);
-
-       ffe_ctl.num_bytes = num_bytes;
-       ffe_ctl.empty_size = empty_size;
-       ffe_ctl.flags = flags;
-       ffe_ctl.search_start = 0;
-       ffe_ctl.delalloc = delalloc;
-       ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
-       ffe_ctl.have_caching_bg = false;
-       ffe_ctl.orig_have_caching_bg = false;
-       ffe_ctl.found_offset = 0;
-       ffe_ctl.hint_byte = hint_byte_orig;
-       ffe_ctl.for_treelog = for_treelog;
-       ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
+       WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
 
+       ffe_ctl->search_start = 0;
        /* For clustered allocation */
-       ffe_ctl.retry_clustered = false;
-       ffe_ctl.retry_unclustered = false;
-       ffe_ctl.last_ptr = NULL;
-       ffe_ctl.use_cluster = true;
+       ffe_ctl->empty_cluster = 0;
+       ffe_ctl->last_ptr = NULL;
+       ffe_ctl->use_cluster = true;
+       ffe_ctl->have_caching_bg = false;
+       ffe_ctl->orig_have_caching_bg = false;
+       ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
+       ffe_ctl->loop = 0;
+       /* For clustered allocation */
+       ffe_ctl->retry_clustered = false;
+       ffe_ctl->retry_unclustered = false;
+       ffe_ctl->cached = 0;
+       ffe_ctl->max_extent_size = 0;
+       ffe_ctl->total_free_space = 0;
+       ffe_ctl->found_offset = 0;
+       ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
 
        if (btrfs_is_zoned(fs_info))
-               ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;
+               ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
 
        ins->type = BTRFS_EXTENT_ITEM_KEY;
        ins->objectid = 0;
        ins->offset = 0;
 
-       trace_find_free_extent(root, num_bytes, empty_size, flags);
+       trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
+                              ffe_ctl->flags);
 
-       space_info = btrfs_find_space_info(fs_info, flags);
+       space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
        if (!space_info) {
-               btrfs_err(fs_info, "No space info for %llu", flags);
+               btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
                return -ENOSPC;
        }
 
-       ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
+       ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
        if (ret < 0)
                return ret;
 
-       ffe_ctl.search_start = max(ffe_ctl.search_start,
-                                  first_logical_byte(fs_info, 0));
-       ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
-       if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
+       ffe_ctl->search_start = max(ffe_ctl->search_start,
+                                   first_logical_byte(fs_info, 0));
+       ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
+       if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
                block_group = btrfs_lookup_block_group(fs_info,
-                                                      ffe_ctl.search_start);
+                                                      ffe_ctl->search_start);
                /*
                 * we don't want to use the block group if it doesn't match our
                 * allocation bits, or if it's not cached.
@@ -4183,7 +4248,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
                 * However if we are re-searching with an ideal block group
                 * picked out then we don't care that the block group is cached.
                 */
-               if (block_group && block_group_bits(block_group, flags) &&
+               if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
                    block_group->cached != BTRFS_CACHE_NO) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
@@ -4197,9 +4262,10 @@ static noinline int find_free_extent(struct btrfs_root *root,
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
-                               ffe_ctl.index = btrfs_bg_flags_to_raid_index(
-                                               block_group->flags);
-                               btrfs_lock_block_group(block_group, delalloc);
+                               ffe_ctl->index = btrfs_bg_flags_to_raid_index(
+                                                       block_group->flags);
+                               btrfs_lock_block_group(block_group,
+                                                      ffe_ctl->delalloc);
                                goto have_block_group;
                        }
                } else if (block_group) {
@@ -4207,31 +4273,33 @@ static noinline int find_free_extent(struct btrfs_root *root,
                }
        }
 search:
-       ffe_ctl.have_caching_bg = false;
-       if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
-           ffe_ctl.index == 0)
+       ffe_ctl->have_caching_bg = false;
+       if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
+           ffe_ctl->index == 0)
                full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group,
-                           &space_info->block_groups[ffe_ctl.index], list) {
+                           &space_info->block_groups[ffe_ctl->index], list) {
                struct btrfs_block_group *bg_ret;
 
                /* If the block group is read-only, we can skip it entirely. */
                if (unlikely(block_group->ro)) {
-                       if (for_treelog)
+                       if (ffe_ctl->for_treelog)
                                btrfs_clear_treelog_bg(block_group);
+                       if (ffe_ctl->for_data_reloc)
+                               btrfs_clear_data_reloc_bg(block_group);
                        continue;
                }
 
-               btrfs_grab_block_group(block_group, delalloc);
-               ffe_ctl.search_start = block_group->start;
+               btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
+               ffe_ctl->search_start = block_group->start;
 
                /*
                 * this can happen if we end up cycling through all the
                 * raid types, but we want to make sure we only allocate
                 * for the proper type.
                 */
-               if (!block_group_bits(block_group, flags)) {
+               if (!block_group_bits(block_group, ffe_ctl->flags)) {
                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
                                BTRFS_BLOCK_GROUP_RAID1_MASK |
                                BTRFS_BLOCK_GROUP_RAID56_MASK |
@@ -4242,7 +4310,7 @@ search:
                         * doesn't provide them, bail.  This does allow us to
                         * fill raid0 from raid1.
                         */
-                       if ((flags & extra) && !(block_group->flags & extra))
+                       if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
                                goto loop;
 
                        /*
@@ -4250,14 +4318,14 @@ search:
                         * It's possible that we have MIXED_GROUP flag but no
                         * block group is mixed.  Just skip such block group.
                         */
-                       btrfs_release_block_group(block_group, delalloc);
+                       btrfs_release_block_group(block_group, ffe_ctl->delalloc);
                        continue;
                }
 
 have_block_group:
-               ffe_ctl.cached = btrfs_block_group_done(block_group);
-               if (unlikely(!ffe_ctl.cached)) {
-                       ffe_ctl.have_caching_bg = true;
+               ffe_ctl->cached = btrfs_block_group_done(block_group);
+               if (unlikely(!ffe_ctl->cached)) {
+                       ffe_ctl->have_caching_bg = true;
                        ret = btrfs_cache_block_group(block_group, 0);
 
                        /*
@@ -4280,10 +4348,11 @@ have_block_group:
                        goto loop;
 
                bg_ret = NULL;
-               ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
+               ret = do_allocation(block_group, ffe_ctl, &bg_ret);
                if (ret == 0) {
                        if (bg_ret && bg_ret != block_group) {
-                               btrfs_release_block_group(block_group, delalloc);
+                               btrfs_release_block_group(block_group,
+                                                         ffe_ctl->delalloc);
                                block_group = bg_ret;
                        }
                } else if (ret == -EAGAIN) {
@@ -4293,46 +4362,49 @@ have_block_group:
                }
 
                /* Checks */
-               ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
-                                            fs_info->stripesize);
+               ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
+                                                fs_info->stripesize);
 
                /* move on to the next group */
-               if (ffe_ctl.search_start + num_bytes >
+               if (ffe_ctl->search_start + ffe_ctl->num_bytes >
                    block_group->start + block_group->length) {
                        btrfs_add_free_space_unused(block_group,
-                                           ffe_ctl.found_offset, num_bytes);
+                                           ffe_ctl->found_offset,
+                                           ffe_ctl->num_bytes);
                        goto loop;
                }
 
-               if (ffe_ctl.found_offset < ffe_ctl.search_start)
+               if (ffe_ctl->found_offset < ffe_ctl->search_start)
                        btrfs_add_free_space_unused(block_group,
-                                       ffe_ctl.found_offset,
-                                       ffe_ctl.search_start - ffe_ctl.found_offset);
+                                       ffe_ctl->found_offset,
+                                       ffe_ctl->search_start - ffe_ctl->found_offset);
 
-               ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
-                               num_bytes, delalloc);
+               ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
+                                              ffe_ctl->num_bytes,
+                                              ffe_ctl->delalloc);
                if (ret == -EAGAIN) {
                        btrfs_add_free_space_unused(block_group,
-                                       ffe_ctl.found_offset, num_bytes);
+                                       ffe_ctl->found_offset,
+                                       ffe_ctl->num_bytes);
                        goto loop;
                }
                btrfs_inc_block_group_reservations(block_group);
 
                /* we are all good, let's return */
-               ins->objectid = ffe_ctl.search_start;
-               ins->offset = num_bytes;
+               ins->objectid = ffe_ctl->search_start;
+               ins->offset = ffe_ctl->num_bytes;
 
-               trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
-                                          num_bytes);
-               btrfs_release_block_group(block_group, delalloc);
+               trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
+                                          ffe_ctl->num_bytes);
+               btrfs_release_block_group(block_group, ffe_ctl->delalloc);
                break;
 loop:
-               release_block_group(block_group, &ffe_ctl, delalloc);
+               release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
                cond_resched();
        }
        up_read(&space_info->groups_sem);
 
-       ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
+       ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
        if (ret > 0)
                goto search;
 
@@ -4341,12 +4413,12 @@ loop:
                 * Use ffe_ctl->total_free_space as fallback if we can't find
                 * any contiguous hole.
                 */
-               if (!ffe_ctl.max_extent_size)
-                       ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
+               if (!ffe_ctl->max_extent_size)
+                       ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
                spin_lock(&space_info->lock);
-               space_info->max_extent_size = ffe_ctl.max_extent_size;
+               space_info->max_extent_size = ffe_ctl->max_extent_size;
                spin_unlock(&space_info->lock);
-               ins->offset = ffe_ctl.max_extent_size;
+               ins->offset = ffe_ctl->max_extent_size;
        } else if (ret == -ENOSPC) {
                ret = cache_block_group_error;
        }
@@ -4404,16 +4476,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         struct btrfs_key *ins, int is_data, int delalloc)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct find_free_extent_ctl ffe_ctl = {};
        bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
        bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+       bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
 
        flags = get_alloc_profile_by_root(root, is_data);
 again:
        WARN_ON(num_bytes < fs_info->sectorsize);
-       ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
-                              hint_byte, ins, flags, delalloc);
+
+       ffe_ctl.ram_bytes = ram_bytes;
+       ffe_ctl.num_bytes = num_bytes;
+       ffe_ctl.min_alloc_size = min_alloc_size;
+       ffe_ctl.empty_size = empty_size;
+       ffe_ctl.flags = flags;
+       ffe_ctl.delalloc = delalloc;
+       ffe_ctl.hint_byte = hint_byte;
+       ffe_ctl.for_treelog = for_treelog;
+       ffe_ctl.for_data_reloc = for_data_reloc;
+
+       ret = find_free_extent(root, ins, &ffe_ctl);
        if (!ret && !is_data) {
                btrfs_dec_block_group_reservations(fs_info, ins->objectid);
        } else if (ret == -ENOSPC) {
@@ -4431,8 +4515,8 @@ again:
 
                        sinfo = btrfs_find_space_info(fs_info, flags);
                        btrfs_err(fs_info,
-                       "allocation failed flags %llu, wanted %llu tree-log %d",
-                                 flags, num_bytes, for_treelog);
+       "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
+                                 flags, num_bytes, for_treelog, for_data_reloc);
                        if (sinfo)
                                btrfs_dump_space_info(fs_info, sinfo,
                                                      num_bytes, 1);
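
The hunk above shows btrfs_reserve_extent() packing its former scalar arguments into a find_free_extent_ctl before calling find_free_extent(). A rough userspace illustration of that pattern only (the struct, field set and policy below are invented for the sketch, not the kernel's definitions):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative control struct; field names only loosely mirror the diff above */
struct alloc_request {
        uint64_t ram_bytes;
        uint64_t num_bytes;
        uint64_t min_alloc_size;
        uint64_t hint_byte;
        uint64_t flags;
        bool for_treelog;
        bool for_data_reloc;
};

/* The callee reads everything from the request instead of many positional args */
static int do_allocate(const struct alloc_request *req, uint64_t *found_start)
{
        /* placeholder policy: just honor the hint */
        *found_start = req->hint_byte;
        return 0;
}

int main(void)
{
        struct alloc_request req = {
                .ram_bytes = 16384,
                .num_bytes = 16384,
                .min_alloc_size = 4096,
                .hint_byte = 1 << 20,
                .flags = 0x1,           /* e.g. a "data" profile bit */
                .for_treelog = false,
                .for_data_reloc = false,
        };
        uint64_t start;

        if (do_allocate(&req, &start) == 0)
                printf("allocated at %llu\n", (unsigned long long)start);
        return 0;
}
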
@@ -4543,7 +4627,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
+       ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                        ins->objectid, ins->offset);
@@ -4632,7 +4716,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                return ret;
 
        ret = btrfs_update_block_group(trans, extent_key.objectid,
-                                      fs_info->nodesize, 1);
+                                      fs_info->nodesize, true);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                        extent_key.objectid, extent_key.offset);
@@ -4655,7 +4739,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 
        btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
                               ins->objectid, ins->offset, 0);
-       btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+       btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
+                           offset, 0, false);
        btrfs_ref_tree_mod(root->fs_info, &generic_ref);
 
        return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
@@ -4847,8 +4932,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 
                btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
                                       ins.objectid, ins.offset, parent);
-               generic_ref.real_root = root->root_key.objectid;
-               btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+               btrfs_init_tree_ref(&generic_ref, level, root_objectid,
+                                   root->root_key.objectid, false);
                btrfs_ref_tree_mod(fs_info, &generic_ref);
                ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
                if (ret)
@@ -5265,7 +5350,8 @@ skip:
 
                btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
                                       fs_info->nodesize, parent);
-               btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+               btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
+                                   0, false);
                ret = btrfs_free_extent(trans, &ref);
                if (ret)
                        goto out_unlock;
@@ -5750,13 +5836,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
-       btrfs_assert_tree_locked(parent);
+       btrfs_assert_tree_write_locked(parent);
        parent_level = btrfs_header_level(parent);
        atomic_inc(&parent->refs);
        path->nodes[parent_level] = parent;
        path->slots[parent_level] = btrfs_header_nritems(parent);
 
-       btrfs_assert_tree_locked(node);
+       btrfs_assert_tree_write_locked(node);
        level = btrfs_header_level(node);
        path->nodes[level] = node;
        path->slots[level] = 0;
index aaddd72..4e03a6d 100644
@@ -241,7 +241,7 @@ int __init extent_io_init(void)
                return -ENOMEM;
 
        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
-                       offsetof(struct btrfs_io_bio, bio),
+                       offsetof(struct btrfs_bio, bio),
                        BIOSET_NEED_BVECS))
                goto free_buffer_cache;
 
@@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 
 /*
  * Find and lock a contiguous range of bytes in the file marked as delalloc, no
- * more than @max_bytes.  @Start and @end are used to return the range,
+ * more than @max_bytes.
  *
- * Return: true if we find something
- *         false if nothing was in the tree
+ * @start:     The original start bytenr to search.
+ *             Will store the extent range start bytenr.
+ * @end:       The original end bytenr of the search range
+ *             Will store the extent range end bytenr.
+ *
+ * Return true if we find a delalloc range which starts inside the original
+ * range, and @start/@end will store the delalloc range start/end.
+ *
+ * Return false if we can't find any delalloc range which starts inside the
+ * original range, and @start/@end will be the non-delalloc range start/end.
  */
 EXPORT_FOR_TESTS
 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
@@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
                                    u64 *end)
 {
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+       const u64 orig_start = *start;
+       const u64 orig_end = *end;
        u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
        u64 delalloc_start;
        u64 delalloc_end;
@@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
        int ret;
        int loops = 0;
 
+       /* Caller should pass a valid @end to indicate the search range end */
+       ASSERT(orig_end > orig_start);
+
+       /* The range should at least cover part of the page */
+       ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
+                orig_end <= page_offset(locked_page)));
 again:
        /* step one, find a bunch of delalloc bytes starting at start */
        delalloc_start = *start;
        delalloc_end = 0;
        found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
                                          max_bytes, &cached_state);
-       if (!found || delalloc_end <= *start) {
+       if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
                *start = delalloc_start;
-               *end = delalloc_end;
+
+               /* @delalloc_end can be -1, never go beyond @orig_end */
+               *end = min(delalloc_end, orig_end);
                free_extent_state(cached_state);
                return false;
        }
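
find_lock_delalloc_range() now documents @start/@end as both the search window and the result, and clamps the reported end so a (u64)-1 delalloc_end can never escape the caller's range. A minimal userspace sketch of that in/out convention (lookup_delalloc() and the constants are made up; the real helper also locks pages and extent state):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

typedef uint64_t u64;

/*
 * Stand-in lookup: reports a delalloc range starting at 128K with no known
 * end, the way a tree search can return (u64)-1 as an open-ended result.
 */
static bool lookup_delalloc(u64 *dstart, u64 *dend)
{
        *dstart = 131072;
        *dend = (u64)-1;
        return true;
}

/* @start/@end carry the search window in and the found (or clamped) range out */
static bool find_range(u64 *start, u64 *end)
{
        const u64 orig_start = *start;
        const u64 orig_end = *end;
        u64 dstart, dend;
        bool found = lookup_delalloc(&dstart, &dend);

        if (!found || dend <= orig_start || dstart > orig_end) {
                *start = dstart;
                /* dend can be (u64)-1, never report past the original end */
                *end = dend < orig_end ? dend : orig_end;
                return false;
        }
        *start = dstart;
        *end = dend;
        return true;
}

int main(void)
{
        u64 start = 0, end = 65535;     /* search the first 64K only */

        if (!find_range(&start, &end))
                printf("no delalloc inside window, next starts at %llu, end clamped to %llu\n",
                       (unsigned long long)start, (unsigned long long)end);
        return 0;
}
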
@@ -2282,15 +2300,15 @@ int free_io_failure(struct extent_io_tree *failure_tree,
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
-                     u64 length, u64 logical, struct page *page,
-                     unsigned int pg_offset, int mirror_num)
+static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+                            u64 length, u64 logical, struct page *page,
+                            unsigned int pg_offset, int mirror_num)
 {
        struct bio *bio;
        struct btrfs_device *dev;
        u64 map_length = 0;
        u64 sector;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        int ret;
 
        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
@@ -2299,12 +2317,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
        if (btrfs_is_zoned(fs_info))
                return btrfs_repair_one_zone(fs_info, logical);
 
-       bio = btrfs_io_bio_alloc(1);
+       bio = btrfs_bio_alloc(1);
        bio->bi_iter.bi_size = 0;
        map_length = length;
 
        /*
-        * Avoid races with device replace and make sure our bbio has devices
+        * Avoid races with device replace and make sure our bioc has devices
         * associated to its stripes that don't go away while we are doing the
         * read repair operation.
         */
@@ -2317,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                 * stripe's dev and sector.
                 */
                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
-                                     &map_length, &bbio, 0);
+                                     &map_length, &bioc, 0);
                if (ret) {
                        btrfs_bio_counter_dec(fs_info);
                        bio_put(bio);
                        return -EIO;
                }
-               ASSERT(bbio->mirror_num == 1);
+               ASSERT(bioc->mirror_num == 1);
        } else {
                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
-                                     &map_length, &bbio, mirror_num);
+                                     &map_length, &bioc, mirror_num);
                if (ret) {
                        btrfs_bio_counter_dec(fs_info);
                        bio_put(bio);
                        return -EIO;
                }
-               BUG_ON(mirror_num != bbio->mirror_num);
+               BUG_ON(mirror_num != bioc->mirror_num);
        }
 
-       sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
+       sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
        bio->bi_iter.bi_sector = sector;
-       dev = bbio->stripes[bbio->mirror_num - 1].dev;
-       btrfs_put_bbio(bbio);
+       dev = bioc->stripes[bioc->mirror_num - 1].dev;
+       btrfs_put_bioc(bioc);
        if (!dev || !dev->bdev ||
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                btrfs_bio_counter_dec(fs_info);
@@ -2618,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode,
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-       struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+       struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio);
        const int icsum = bio_offset >> fs_info->sectorsize_bits;
        struct bio *repair_bio;
-       struct btrfs_io_bio *repair_io_bio;
+       struct btrfs_bio *repair_bbio;
        blk_status_t status;
 
        btrfs_debug(fs_info,
@@ -2639,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode,
                return -EIO;
        }
 
-       repair_bio = btrfs_io_bio_alloc(1);
-       repair_io_bio = btrfs_io_bio(repair_bio);
+       repair_bio = btrfs_bio_alloc(1);
+       repair_bbio = btrfs_bio(repair_bio);
        repair_bio->bi_opf = REQ_OP_READ;
        repair_bio->bi_end_io = failed_bio->bi_end_io;
        repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
        repair_bio->bi_private = failed_bio->bi_private;
 
-       if (failed_io_bio->csum) {
+       if (failed_bbio->csum) {
                const u32 csum_size = fs_info->csum_size;
 
-               repair_io_bio->csum = repair_io_bio->csum_inline;
-               memcpy(repair_io_bio->csum,
-                      failed_io_bio->csum + csum_size * icsum, csum_size);
+               repair_bbio->csum = repair_bbio->csum_inline;
+               memcpy(repair_bbio->csum,
+                      failed_bbio->csum + csum_size * icsum, csum_size);
        }
 
        bio_add_page(repair_bio, page, failrec->len, pgoff);
-       repair_io_bio->logical = failrec->start;
-       repair_io_bio->iter = repair_bio->bi_iter;
+       repair_bbio->iter = repair_bio->bi_iter;
 
        btrfs_debug(btrfs_sb(inode->i_sb),
                    "repair read error: submitting new read to mirror %d",
@@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage(
 static void end_bio_extent_readpage(struct bio *bio)
 {
        struct bio_vec *bvec;
-       struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       struct btrfs_bio *bbio = btrfs_bio(bio);
        struct extent_io_tree *tree, *failure_tree;
        struct processed_extent processed = { 0 };
        /*
@@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio)
                btrfs_debug(fs_info,
                        "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
                        bio->bi_iter.bi_sector, bio->bi_status,
-                       io_bio->mirror_num);
+                       bbio->mirror_num);
                tree = &BTRFS_I(inode)->io_tree;
                failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
@@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio)
                end = start + bvec->bv_len - 1;
                len = bvec->bv_len;
 
-               mirror = io_bio->mirror_num;
+               mirror = bbio->mirror_num;
                if (likely(uptodate)) {
                        if (is_data_inode(inode)) {
-                               error_bitmap = btrfs_verify_data_csum(io_bio,
+                               error_bitmap = btrfs_verify_data_csum(bbio,
                                                bio_offset, page, start, end);
                                ret = error_bitmap;
                        } else {
-                               ret = btrfs_validate_metadata_buffer(io_bio,
+                               ret = btrfs_validate_metadata_buffer(bbio,
                                        page, start, end, mirror);
                        }
                        if (ret)
@@ -3106,7 +3123,7 @@ readpage_ok:
        }
        /* Release the last extent */
        endio_readpage_release_extent(&processed, NULL, 0, 0, false);
-       btrfs_io_bio_free_csum(io_bio);
+       btrfs_bio_free_csum(bbio);
        bio_put(bio);
 }
 
@@ -3115,53 +3132,43 @@ readpage_ok:
  * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
  * 'bio' because use of __GFP_ZERO is not supported.
  */
-static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
+static inline void btrfs_bio_init(struct btrfs_bio *bbio)
 {
-       memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+       memset(bbio, 0, offsetof(struct btrfs_bio, bio));
 }
 
 /*
- * The following helpers allocate a bio. As it's backed by a bioset, it'll
- * never fail.  We're returning a bio right now but you can call btrfs_io_bio
- * for the appropriate container_of magic
+ * Allocate a btrfs_bio, with @nr_iovecs as the maximum number of iovecs.
+ *
+ * The bio allocation is backed by bioset and does not fail.
  */
-struct bio *btrfs_bio_alloc(u64 first_byte)
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
 {
        struct bio *bio;
 
-       bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
-       bio->bi_iter.bi_sector = first_byte >> 9;
-       btrfs_io_bio_init(btrfs_io_bio(bio));
+       ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
+       bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
+       btrfs_bio_init(btrfs_bio(bio));
        return bio;
 }
 
 struct bio *btrfs_bio_clone(struct bio *bio)
 {
-       struct btrfs_io_bio *btrfs_bio;
+       struct btrfs_bio *bbio;
        struct bio *new;
 
        /* Bio allocation backed by a bioset does not fail */
        new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
-       btrfs_bio = btrfs_io_bio(new);
-       btrfs_io_bio_init(btrfs_bio);
-       btrfs_bio->iter = bio->bi_iter;
+       bbio = btrfs_bio(new);
+       btrfs_bio_init(bbio);
+       bbio->iter = bio->bi_iter;
        return new;
 }
 
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
-{
-       struct bio *bio;
-
-       /* Bio allocation backed by a bioset does not fail */
-       bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
-       btrfs_io_bio_init(btrfs_io_bio(bio));
-       return bio;
-}
-
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
 {
        struct bio *bio;
-       struct btrfs_io_bio *btrfs_bio;
+       struct btrfs_bio *bbio;
 
        ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
 
@@ -3169,11 +3176,11 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
        bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
        ASSERT(bio);
 
-       btrfs_bio = btrfs_io_bio(bio);
-       btrfs_io_bio_init(btrfs_bio);
+       bbio = btrfs_bio(bio);
+       btrfs_bio_init(bbio);
 
        bio_trim(bio, offset >> 9, size >> 9);
-       btrfs_bio->iter = bio->bi_iter;
+       bbio->iter = bio->bi_iter;
        return bio;
 }
 
@@ -3307,14 +3314,15 @@ static int alloc_new_bio(struct btrfs_inode *inode,
        struct bio *bio;
        int ret;
 
+       bio = btrfs_bio_alloc(BIO_MAX_VECS);
        /*
         * For compressed page range, its disk_bytenr is always @disk_bytenr
         * passed in, no matter if we have added any range into previous bio.
         */
        if (bio_flags & EXTENT_BIO_COMPRESSED)
-               bio = btrfs_bio_alloc(disk_bytenr);
+               bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
        else
-               bio = btrfs_bio_alloc(disk_bytenr + offset);
+               bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
        bio_ctrl->bio = bio;
        bio_ctrl->bio_flags = bio_flags;
        bio->bi_end_io = end_io_func;
@@ -3327,7 +3335,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
        if (wbc) {
                struct block_device *bdev;
 
-               bdev = fs_info->fs_devices->latest_bdev;
+               bdev = fs_info->fs_devices->latest_dev->bdev;
                bio_set_dev(bio, bdev);
                wbc_init_bio(wbc, bio);
        }
@@ -3341,7 +3349,7 @@ static int alloc_new_bio(struct btrfs_inode *inode,
                        goto error;
                }
 
-               btrfs_io_bio(bio)->device = device;
+               btrfs_bio(bio)->device = device;
        }
        return 0;
 error:
@@ -3599,6 +3607,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                bool force_bio_submit = false;
                u64 disk_bytenr;
 
+               ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
                if (cur >= last_byte) {
                        struct extent_state *cached = NULL;
 
@@ -3777,17 +3786,18 @@ static void update_nr_written(struct writeback_control *wbc,
  */
 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
                struct page *page, struct writeback_control *wbc,
-               u64 delalloc_start, unsigned long *nr_written)
+               unsigned long *nr_written)
 {
-       u64 page_end = delalloc_start + PAGE_SIZE - 1;
-       bool found;
+       const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
+       u64 delalloc_start = page_offset(page);
        u64 delalloc_to_write = 0;
-       u64 delalloc_end = 0;
        int ret;
        int page_started = 0;
 
+       while (delalloc_start < page_end) {
+               u64 delalloc_end = page_end;
+               bool found;
 
-       while (delalloc_end < page_end) {
                found = find_lock_delalloc_range(&inode->vfs_inode, page,
                                               &delalloc_start,
                                               &delalloc_end);
@@ -3854,12 +3864,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
                                 struct page *page, u64 *start, u64 *end)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+       struct btrfs_subpage_info *spi = fs_info->subpage_info;
        u64 orig_start = *start;
        /* Declare as unsigned long so we can use bitmap ops */
-       unsigned long dirty_bitmap;
        unsigned long flags;
-       int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits;
-       int range_start_bit = nbits;
+       int range_start_bit;
        int range_end_bit;
 
        /*
@@ -3872,13 +3881,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
                return;
        }
 
+       range_start_bit = spi->dirty_offset +
+                         (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
+
        /* We should have the page locked, but just in case */
        spin_lock_irqsave(&subpage->lock, flags);
-       dirty_bitmap = subpage->dirty_bitmap;
+       bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
+                              spi->dirty_offset + spi->bitmap_nr_bits);
        spin_unlock_irqrestore(&subpage->lock, flags);
 
-       bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit,
-                              BTRFS_SUBPAGE_BITMAP_SIZE);
+       range_start_bit -= spi->dirty_offset;
+       range_end_bit -= spi->dirty_offset;
+
        *start = page_offset(page) + range_start_bit * fs_info->sectorsize;
        *end = page_offset(page) + range_end_bit * fs_info->sectorsize;
 }
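
find_next_dirty_byte() now searches the shared subpage bitmap starting at spi->dirty_offset plus the sector index and shifts the result back. A rough userspace equivalent of the "next set region" search it relies on (plain C, not the kernel bitmap API; the bitmap value is an example):

#include <stdint.h>
#include <stdio.h>

/*
 * Find the next run of set bits in @bitmap starting at *rs (searching up to
 * @nbits); on return *rs is the first set bit and *re is one past the run.
 */
static void next_set_region(uint64_t bitmap, unsigned int *rs,
                            unsigned int *re, unsigned int nbits)
{
        unsigned int i = *rs;

        while (i < nbits && !((bitmap >> i) & 1))
                i++;
        *rs = i;
        while (i < nbits && ((bitmap >> i) & 1))
                i++;
        *re = i;
}

int main(void)
{
        /* bits 3-5 and 9 set */
        uint64_t dirty = 0x238;
        unsigned int rs = 0, re = 0;

        next_set_region(dirty, &rs, &re, 16);
        printf("first dirty run: bits [%u, %u)\n", rs, re);    /* [3, 6) */
        return 0;
}
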
@@ -4054,8 +4068,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                              struct extent_page_data *epd)
 {
        struct inode *inode = page->mapping->host;
-       u64 start = page_offset(page);
-       u64 page_end = start + PAGE_SIZE - 1;
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       const u64 page_start = page_offset(page);
+       const u64 page_end = page_start + PAGE_SIZE - 1;
        int ret;
        int nr = 0;
        size_t pg_offset;
@@ -4090,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
        }
 
        if (!epd->extent_locked) {
-               ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
-                                        &nr_written);
+               ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written);
                if (ret == 1)
                        return 0;
                if (ret)
@@ -4141,8 +4155,20 @@ done:
         * capable of that.
         */
        if (PageError(page))
-               end_extent_writepage(page, ret, start, page_end);
-       unlock_page(page);
+               end_extent_writepage(page, ret, page_start, page_end);
+       if (epd->extent_locked) {
+               /*
+                * If epd->extent_locked, it's from extent_write_locked_range(),
+                * the page can either be locked by lock_page() or
+                * process_one_page().
+                * Let btrfs_page_unlock_writer() handle both cases.
+                */
+               ASSERT(wbc);
+               btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
+                                        wbc->range_end + 1 - wbc->range_start);
+       } else {
+               unlock_page(page);
+       }
        ASSERT(ret <= 0);
        return ret;
 }
@@ -4155,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
 
 static void end_extent_buffer_writeback(struct extent_buffer *eb)
 {
+       if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags))
+               btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len);
+
        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
        smp_mb__after_atomic();
        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
@@ -4602,12 +4631,11 @@ static int submit_eb_subpage(struct page *page,
        int submitted = 0;
        u64 page_start = page_offset(page);
        int bit_start = 0;
-       const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
        int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
        int ret;
 
        /* Lock and write each dirty extent buffers in the range */
-       while (bit_start < nbits) {
+       while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
                struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
                struct extent_buffer *eb;
                unsigned long flags;
@@ -4623,7 +4651,8 @@ static int submit_eb_subpage(struct page *page,
                        break;
                }
                spin_lock_irqsave(&subpage->lock, flags);
-               if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+               if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
+                             subpage->bitmaps)) {
                        spin_unlock_irqrestore(&subpage->lock, flags);
                        spin_unlock(&page->mapping->private_lock);
                        bit_start++;
@@ -4756,8 +4785,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
                free_extent_buffer(eb);
                return ret;
        }
-       if (cache)
+       if (cache) {
+               /* Implies write in zoned mode */
                btrfs_put_block_group(cache);
+               /* Mark the last eb in a block group */
+               if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
+                       set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+       }
        ret = write_one_eb(eb, wbc, epd);
        free_extent_buffer(eb);
        if (ret < 0)
@@ -4873,7 +4907,7 @@ retry:
         *   extent io tree. Thus we don't want to submit such wild eb
         *   if the fs already has error.
         */
-       if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+       if (!BTRFS_FS_ERROR(fs_info)) {
                ret = flush_write_bio(&epd);
        } else {
                ret = -EROFS;
@@ -5069,23 +5103,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc)
        return ret;
 }
 
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
-                             int mode)
+/*
+ * Submit the pages in the range to bio for call sites whose delalloc range
+ * has already been run (i.e. the ordered extent was inserted) and all pages
+ * are still locked.
+ */
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 {
+       bool found_error = false;
+       int first_error = 0;
        int ret = 0;
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
-       unsigned long nr_pages = (end - start + PAGE_SIZE) >>
-               PAGE_SHIFT;
-
+       u64 cur = start;
+       unsigned long nr_pages;
+       const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
        struct extent_page_data epd = {
                .bio_ctrl = { 0 },
                .extent_locked = 1,
-               .sync_io = mode == WB_SYNC_ALL,
+               .sync_io = 1,
        };
        struct writeback_control wbc_writepages = {
-               .sync_mode      = mode,
-               .nr_to_write    = nr_pages * 2,
+               .sync_mode      = WB_SYNC_ALL,
                .range_start    = start,
                .range_end      = end + 1,
                /* We're called from an async helper function */
@@ -5093,33 +5132,51 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
                .no_cgroup_owner = 1,
        };
 
+       ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
+       nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
+                  PAGE_SHIFT;
+       wbc_writepages.nr_to_write = nr_pages * 2;
+
        wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
-       while (start <= end) {
-               page = find_get_page(mapping, start >> PAGE_SHIFT);
-               if (clear_page_dirty_for_io(page))
-                       ret = __extent_writepage(page, &wbc_writepages, &epd);
-               else {
-                       btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
-                                       page, start, start + PAGE_SIZE - 1, true);
-                       unlock_page(page);
+       while (cur <= end) {
+               u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+
+               page = find_get_page(mapping, cur >> PAGE_SHIFT);
+               /*
+                * All pages in the range are locked since
+                * btrfs_run_delalloc_range(), thus there is no way to clear
+                * the page dirty flag.
+                */
+               ASSERT(PageLocked(page));
+               ASSERT(PageDirty(page));
+               clear_page_dirty_for_io(page);
+               ret = __extent_writepage(page, &wbc_writepages, &epd);
+               ASSERT(ret <= 0);
+               if (ret < 0) {
+                       found_error = true;
+                       first_error = ret;
                }
                put_page(page);
-               start += PAGE_SIZE;
+               cur = cur_end + 1;
        }
 
-       ASSERT(ret <= 0);
-       if (ret == 0)
+       if (!found_error)
                ret = flush_write_bio(&epd);
        else
                end_write_bio(&epd, ret);
 
        wbc_detach_inode(&wbc_writepages);
+       if (found_error)
+               return first_error;
        return ret;
 }
 
 int extent_writepages(struct address_space *mapping,
                      struct writeback_control *wbc)
 {
+       struct inode *inode = mapping->host;
+       const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root);
+       const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info);
        int ret = 0;
        struct extent_page_data epd = {
                .bio_ctrl = { 0 },
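
extent_write_locked_range() above walks the range one page at a time, computing each step from the page boundary and remembering an error to return while still submitting the remaining pages. A compact userspace sketch of that loop shape (write_one_page() and the failure injected on the second page are stand-ins):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

static uint64_t round_down_u64(uint64_t v, uint64_t align)
{
        return v - (v % align);
}

/* Stand-in for submitting one locked page; fail on the second page */
static int write_one_page(uint64_t page_start)
{
        return page_start == PAGE_SIZE ? -5 /* -EIO */ : 0;
}

int main(void)
{
        uint64_t start = 0, end = 3 * PAGE_SIZE - 1;
        uint64_t cur = start;
        bool found_error = false;
        int first_error = 0;

        while (cur <= end) {
                /* end of the page containing cur, clamped to the range end */
                uint64_t page_start = round_down_u64(cur, PAGE_SIZE);
                uint64_t cur_end = page_start + PAGE_SIZE - 1;
                int ret;

                if (cur_end > end)
                        cur_end = end;
                ret = write_one_page(page_start);
                if (ret < 0 && !found_error) {
                        found_error = true;
                        first_error = ret;
                }
                cur = cur_end + 1;
        }
        printf("first_error=%d\n", first_error);
        return 0;
}
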
@@ -5127,7 +5184,15 @@ int extent_writepages(struct address_space *mapping,
                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
        };
 
+       /*
+        * Allow only a single thread to do the reloc work in zoned mode to
+        * protect the write pointer updates.
+        */
+       if (data_reloc && zoned)
+               btrfs_inode_lock(inode, 0);
        ret = extent_write_cache_pages(mapping, wbc, &epd);
+       if (data_reloc && zoned)
+               btrfs_inode_unlock(inode, 0);
        ASSERT(ret <= 0);
        if (ret < 0) {
                end_write_bio(&epd, ret);
@@ -6137,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                 * page, but it may change in the future for 16K page size
                 * support, so we still preallocate the memory in the loop.
                 */
-               ret = btrfs_alloc_subpage(fs_info, &prealloc,
-                                         BTRFS_SUBPAGE_METADATA);
-               if (ret < 0) {
-                       unlock_page(p);
-                       put_page(p);
-                       exists = ERR_PTR(ret);
-                       goto free_eb;
+               if (fs_info->sectorsize < PAGE_SIZE) {
+                       prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+                       if (IS_ERR(prealloc)) {
+                               ret = PTR_ERR(prealloc);
+                               unlock_page(p);
+                               put_page(p);
+                               exists = ERR_PTR(ret);
+                               goto free_eb;
+                       }
                }
 
                spin_lock(&mapping->private_lock);
@@ -7167,32 +7234,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
        }
 }
 
+#define GANG_LOOKUP_SIZE       16
 static struct extent_buffer *get_next_extent_buffer(
                struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
 {
-       struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
+       struct extent_buffer *gang[GANG_LOOKUP_SIZE];
        struct extent_buffer *found = NULL;
        u64 page_start = page_offset(page);
-       int ret;
-       int i;
+       u64 cur = page_start;
 
        ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
-       ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
        lockdep_assert_held(&fs_info->buffer_lock);
 
-       ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
-                       bytenr >> fs_info->sectorsize_bits,
-                       PAGE_SIZE / fs_info->nodesize);
-       for (i = 0; i < ret; i++) {
-               /* Already beyond page end */
-               if (gang[i]->start >= page_start + PAGE_SIZE)
-                       break;
-               /* Found one */
-               if (gang[i]->start >= bytenr) {
-                       found = gang[i];
-                       break;
+       while (cur < page_start + PAGE_SIZE) {
+               int ret;
+               int i;
+
+               ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+                               (void **)gang, cur >> fs_info->sectorsize_bits,
+                               min_t(unsigned int, GANG_LOOKUP_SIZE,
+                                     PAGE_SIZE / fs_info->nodesize));
+               if (ret == 0)
+                       goto out;
+               for (i = 0; i < ret; i++) {
+                       /* Already beyond page end */
+                       if (gang[i]->start >= page_start + PAGE_SIZE)
+                               goto out;
+                       /* Found one */
+                       if (gang[i]->start >= bytenr) {
+                               found = gang[i];
+                               goto out;
+                       }
                }
+               cur = gang[ret - 1]->start + gang[ret - 1]->len;
        }
+out:
        return found;
 }
 
index 53abdc2..0399cf8 100644
@@ -32,6 +32,7 @@ enum {
        /* write IO error */
        EXTENT_BUFFER_WRITE_ERR,
        EXTENT_BUFFER_NO_CHECK,
+       EXTENT_BUFFER_ZONE_FINISH,
 };
 
 /* these are flags for __process_pages_contig */
@@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                      struct btrfs_bio_ctrl *bio_ctrl,
                      unsigned int read_flags, u64 *prev_em_start);
 int extent_write_full_page(struct page *page, struct writeback_control *wbc);
-int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
-                             int mode);
+int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
 int extent_writepages(struct address_space *mapping,
                      struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
@@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                                  struct page *locked_page,
                                  u32 bits_to_clear, unsigned long page_ops);
-struct bio *btrfs_bio_alloc(u64 first_byte);
-struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
+struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
 struct bio *btrfs_bio_clone(struct bio *bio);
 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
 
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
-                     u64 length, u64 logical, struct page *page,
-                     unsigned int pg_offset, int mirror_num);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
 
index 4a8e02f..5a36add 100644
@@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
        int i;
 
        for (i = 0; i < map->num_stripes; i++) {
-               struct btrfs_bio_stripe *stripe = &map->stripes[i];
+               struct btrfs_io_stripe *stripe = &map->stripes[i];
                struct btrfs_device *device = stripe->dev;
 
                set_extent_bits_nowait(&device->alloc_state, stripe->physical,
@@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
        int i;
 
        for (i = 0; i < map->num_stripes; i++) {
-               struct btrfs_bio_stripe *stripe = &map->stripes[i];
+               struct btrfs_io_stripe *stripe = &map->stripes[i];
                struct btrfs_device *device = stripe->dev;
 
                __clear_extent_bit(&device->alloc_state, stripe->physical,
index 0b9401a..d1cbb64 100644
@@ -358,7 +358,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
  * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return
  *       checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If
  *       NULL, the checksum buffer is allocated and returned in
- *       btrfs_io_bio(bio)->csum instead.
+ *       btrfs_bio(bio)->csum instead.
  *
  * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
  */
@@ -397,19 +397,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
                return BLK_STS_RESOURCE;
 
        if (!dst) {
-               struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+               struct btrfs_bio *bbio = btrfs_bio(bio);
 
                if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
-                       btrfs_bio->csum = kmalloc_array(nblocks, csum_size,
-                                                       GFP_NOFS);
-                       if (!btrfs_bio->csum) {
+                       bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
+                       if (!bbio->csum) {
                                btrfs_free_path(path);
                                return BLK_STS_RESOURCE;
                        }
                } else {
-                       btrfs_bio->csum = btrfs_bio->csum_inline;
+                       bbio->csum = bbio->csum_inline;
                }
-               csum = btrfs_bio->csum;
+               csum = bbio->csum;
        } else {
                csum = dst;
        }
@@ -709,12 +708,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
                                index = 0;
                        }
 
-                       data = kmap_atomic(bvec.bv_page);
-                       crypto_shash_digest(shash, data + bvec.bv_offset
-                                           + (i * fs_info->sectorsize),
+                       data = bvec_kmap_local(&bvec);
+                       crypto_shash_digest(shash,
+                                           data + (i * fs_info->sectorsize),
                                            fs_info->sectorsize,
                                            sums->sums + index);
-                       kunmap_atomic(data);
+                       kunmap_local(data);
                        index += fs_info->csum_size;
                        offset += fs_info->sectorsize;
                        this_sum_bytes += fs_info->sectorsize;
index a176236..9a3db13 100644
@@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
+                            struct page **pages, size_t num_pages,
+                            u64 pos, u64 copied)
 {
        size_t i;
+       u64 block_start = round_down(pos, fs_info->sectorsize);
+       u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
+
+       ASSERT(block_len <= U32_MAX);
        for (i = 0; i < num_pages; i++) {
                /* page checked is some magic around finding pages that
                 * have been modified without going through btrfs_set_page_dirty
@@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
                 * accessed as prepare_pages should have marked them accessed
                 * in prepare_pages via find_or_create_page()
                 */
-               ClearPageChecked(pages[i]);
+               btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
+                                              block_len);
                unlock_page(pages[i]);
                put_page(pages[i]);
        }
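
btrfs_drop_pages() now derives the sector-aligned span of the write so the per-block checked bits can be cleared precisely. The rounding is an ordinary round_down/round_up pair; a tiny userspace check of the arithmetic (the 4096-byte sectorsize and the offsets are example values):

#include <stdint.h>
#include <stdio.h>

static uint64_t round_down_u64(uint64_t v, uint64_t a) { return v - (v % a); }
static uint64_t round_up_u64(uint64_t v, uint64_t a)
{
        return round_down_u64(v + a - 1, a);
}

int main(void)
{
        const uint64_t sectorsize = 4096;       /* example value */
        uint64_t pos = 5000, copied = 3000;     /* write of 3000 bytes at 5000 */

        uint64_t block_start = round_down_u64(pos, sectorsize);
        uint64_t block_len = round_up_u64(pos + copied, sectorsize) - block_start;

        /* write [5000, 8000) fits inside sectors [4096, 8192): length 4096 */
        printf("block_start=%llu block_len=%llu\n",
               (unsigned long long)block_start, (unsigned long long)block_len);
        return 0;
}
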
@@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
                struct page *p = pages[i];
 
                btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
-               ClearPageChecked(p);
+               btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
                btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
        }
 
@@ -869,7 +876,8 @@ next_slot:
                                btrfs_init_data_ref(&ref,
                                                root->root_key.objectid,
                                                new_key.objectid,
-                                               args->start - extent_offset);
+                                               args->start - extent_offset,
+                                               0, false);
                                ret = btrfs_inc_extent_ref(trans, &ref);
                                BUG_ON(ret); /* -ENOMEM */
                        }
@@ -955,7 +963,8 @@ delete_extent_item:
                                btrfs_init_data_ref(&ref,
                                                root->root_key.objectid,
                                                key.objectid,
-                                               key.offset - extent_offset);
+                                               key.offset - extent_offset, 0,
+                                               false);
                                ret = btrfs_free_extent(trans, &ref);
                                BUG_ON(ret); /* -ENOMEM */
                                args->bytes_found += extent_end - key.offset;
@@ -1020,8 +1029,7 @@ delete_extent_item:
                        if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
                                path->slots[0]++;
                }
-               setup_items_for_insert(root, path, &key,
-                                      &args->extent_item_size, 1);
+               btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size);
                args->extent_inserted = true;
        }
 
@@ -1232,7 +1240,7 @@ again:
                btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
                                       num_bytes, 0);
                btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
-                                   orig_offset);
+                                   orig_offset, 0, false);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -1257,7 +1265,8 @@ again:
        other_end = 0;
        btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
                               num_bytes, 0);
-       btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
+       btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
+                           0, false);
        if (extent_mergeable(leaf, path->slots[0] + 1,
                             ino, bytenr, orig_offset,
                             &other_start, &other_end)) {
@@ -1844,7 +1853,7 @@ again:
 
                btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
                if (ret) {
-                       btrfs_drop_pages(pages, num_pages);
+                       btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
                        break;
                }
 
@@ -1852,7 +1861,7 @@ again:
                if (only_release_metadata)
                        btrfs_check_nocow_unlock(BTRFS_I(inode));
 
-               btrfs_drop_pages(pages, num_pages);
+               btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
 
                cond_resched();
 
@@ -2012,7 +2021,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
         * have opened a file as writable, we have to stop this write operation
         * to ensure consistency.
         */
-       if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
+       if (BTRFS_FS_ERROR(inode->root->fs_info))
                return -EROFS;
 
        if (!(iocb->ki_flags & IOCB_DIRECT) &&
@@ -2620,7 +2629,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
                                       extent_info->disk_len, 0);
                ref_offset = extent_info->file_offset - extent_info->data_offset;
                btrfs_init_data_ref(&ref, root->root_key.objectid,
-                                   btrfs_ino(inode), ref_offset);
+                                   btrfs_ino(inode), ref_offset, 0, false);
                ret = btrfs_inc_extent_ref(trans, &ref);
        }
 
index da0eee7..f3fee88 100644
@@ -22,6 +22,7 @@
 #include "delalloc-space.h"
 #include "block-group.h"
 #include "discard.h"
+#include "subpage.h"
 
 #define BITS_PER_BITMAP                (PAGE_SIZE * 8UL)
 #define MAX_CACHE_BYTES_PER_GIG        SZ_64K
@@ -411,7 +412,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 
        for (i = 0; i < io_ctl->num_pages; i++) {
                if (io_ctl->pages[i]) {
-                       ClearPageChecked(io_ctl->pages[i]);
+                       btrfs_page_clear_checked(io_ctl->fs_info,
+                                       io_ctl->pages[i],
+                                       page_offset(io_ctl->pages[i]),
+                                       PAGE_SIZE);
                        unlock_page(io_ctl->pages[i]);
                        put_page(io_ctl->pages[i]);
                }
@@ -2539,10 +2543,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
        u64 offset = bytenr - block_group->start;
        u64 to_free, to_unusable;
        const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
+       bool initial = (size == block_group->length);
+       u64 reclaimable_unusable;
+
+       WARN_ON(!initial && offset + size > block_group->zone_capacity);
 
        spin_lock(&ctl->tree_lock);
        if (!used)
                to_free = size;
+       else if (initial)
+               to_free = block_group->zone_capacity;
        else if (offset >= block_group->alloc_offset)
                to_free = size;
        else if (offset + size <= block_group->alloc_offset)
@@ -2565,12 +2575,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
                spin_unlock(&block_group->lock);
        }
 
+       reclaimable_unusable = block_group->zone_unusable -
+                              (block_group->length - block_group->zone_capacity);
        /* All the region is now unusable. Mark it as unused and reclaim */
        if (block_group->zone_unusable == block_group->length) {
                btrfs_mark_bg_unused(block_group);
        } else if (bg_reclaim_threshold &&
-                  block_group->zone_unusable >=
-                  div_factor_fine(block_group->length, bg_reclaim_threshold)) {
+                  reclaimable_unusable >=
+                  div_factor_fine(block_group->zone_capacity,
+                                  bg_reclaim_threshold)) {
                btrfs_mark_bg_to_reclaim(block_group);
        }
 
@@ -2754,8 +2767,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
         * out the free space after the allocation offset.
         */
        if (btrfs_is_zoned(fs_info)) {
-               btrfs_info(fs_info, "free space %llu",
-                          block_group->length - block_group->alloc_offset);
+               btrfs_info(fs_info, "free space %llu active %d",
+                          block_group->zone_capacity - block_group->alloc_offset,
+                          block_group->zone_is_active);
                return;
        }
 
index 954b53a..b8c911a 100644
@@ -457,11 +457,10 @@ struct async_chunk {
        struct list_head extents;
        struct cgroup_subsys_state *blkcg_css;
        struct btrfs_work work;
-       atomic_t *pending;
+       struct async_cow *async_cow;
 };
 
 struct async_cow {
-       /* Number of chunks in flight; must be first in the structure */
        atomic_t num_chunks;
        struct async_chunk chunks[];
 };
@@ -492,9 +491,6 @@ static noinline int add_async_extent(struct async_chunk *cow,
  */
 static inline bool inode_can_compress(struct btrfs_inode *inode)
 {
-       /* Subpage doesn't support compression yet */
-       if (inode->root->fs_info->sectorsize < PAGE_SIZE)
-               return false;
        if (inode->flags & BTRFS_INODE_NODATACOW ||
            inode->flags & BTRFS_INODE_NODATASUM)
                return false;
@@ -516,6 +512,38 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
                        btrfs_ino(inode));
                return 0;
        }
+       /*
+        * Special check for subpage.
+        *
+        * We lock the full page then run each delalloc range in the page, thus
+        * for the following case, we will hit some subpage specific corner case:
+        *
+        * 0            32K             64K
+        * |    |///////|       |///////|
+        *              \- A            \- B
+        *
+        * In above case, both range A and range B will try to unlock the full
+        * page [0, 64K), causing the one finished later will have page
+        * unlocked already, triggering various page lock requirement BUG_ON()s.
+        *
+        * So here we add an artificial limit that subpage compression can only
+        * happen if the range is fully page aligned.
+        *
+        * In theory we only need to ensure the first page is fully covered, but
+        * the trailing partial page will be locked until the full compression
+        * finishes, delaying the write of other ranges.
+        *
+        * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first
+        * to prevent any submitted async extent from unlocking the full page.
+        * By this, we can ensure for subpage case that only the last async_cow
+        * will unlock the full page.
+        */
+       if (fs_info->sectorsize < PAGE_SIZE) {
+               if (!IS_ALIGNED(start, PAGE_SIZE) ||
+                   !IS_ALIGNED(end + 1, PAGE_SIZE))
+                       return 0;
+       }
+
        /* force compress */
        if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
                return 1;
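
The subpage gate above refuses compression unless the whole delalloc range is page aligned, so two ranges sharing one page can never both try to unlock it. The test itself is just two alignment checks; a trivial userspace version (the 4K-sector-in-64K-page numbers are example values):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define IS_ALIGNED(x, a)        (((x) & ((a) - 1)) == 0)

static bool subpage_can_compress(uint64_t start, uint64_t end,
                                 uint64_t sectorsize, uint64_t page_size)
{
        /* with a full-page sectorsize there is nothing to worry about */
        if (sectorsize >= page_size)
                return true;
        /* both the start and the byte after the end must sit on a page edge */
        return IS_ALIGNED(start, page_size) && IS_ALIGNED(end + 1, page_size);
}

int main(void)
{
        const uint64_t SZ_4K = 4096, SZ_64K = 65536;

        /* [0, 64K) is allowed, [0, 32K) is not: it would share page 0 */
        printf("%d %d\n",
               subpage_can_compress(0, SZ_64K - 1, SZ_4K, SZ_64K),
               subpage_can_compress(0, SZ_64K / 2 - 1, SZ_4K, SZ_64K));
        return 0;
}
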
@@ -617,13 +645,24 @@ again:
        total_compressed = actual_end - start;
 
        /*
-        * skip compression for a small file range(<=blocksize) that
+        * Skip compression for a small file range (<= blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;
 
+       /*
+        * For the subpage case, we require full page alignment for the sector
+        * aligned range.
+        * Thus we must also check against @actual_end, not just @end.
+        */
+       if (blocksize < PAGE_SIZE) {
+               if (!IS_ALIGNED(start, PAGE_SIZE) ||
+                   !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+                       goto cleanup_and_bail_uncompressed;
+       }
+
        total_compressed = min_t(unsigned long, total_compressed,
                        BTRFS_MAX_UNCOMPRESSED);
        total_in = 0;
@@ -761,7 +800,7 @@ cont:
                 * win, compare the page count read with the blocks on disk,
                 * compression must free at least one sector size
                 */
-               total_in = ALIGN(total_in, PAGE_SIZE);
+               total_in = round_up(total_in, fs_info->sectorsize);
                if (total_compressed + blocksize <= total_in) {
                        compressed_extents++;
 
@@ -842,166 +881,148 @@ static void free_async_extent_pages(struct async_extent *async_extent)
        async_extent->pages = NULL;
 }
 
-/*
- * phase two of compressed writeback.  This is the ordered portion
- * of the code, which only gets called in the order the work was
- * queued.  We walk all the async extents created by compress_file_range
- * and send them down to the disk.
- */
-static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+static int submit_uncompressed_range(struct btrfs_inode *inode,
+                                    struct async_extent *async_extent,
+                                    struct page *locked_page)
 {
-       struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       struct async_extent *async_extent;
-       u64 alloc_hint = 0;
-       struct btrfs_key ins;
-       struct extent_map *em;
-       struct btrfs_root *root = inode->root;
-       struct extent_io_tree *io_tree = &inode->io_tree;
-       int ret = 0;
-
-again:
-       while (!list_empty(&async_chunk->extents)) {
-               async_extent = list_entry(async_chunk->extents.next,
-                                         struct async_extent, list);
-               list_del(&async_extent->list);
-
-retry:
-               lock_extent(io_tree, async_extent->start,
-                           async_extent->start + async_extent->ram_size - 1);
-               /* did the compression code fall back to uncompressed IO? */
-               if (!async_extent->pages) {
-                       int page_started = 0;
-                       unsigned long nr_written = 0;
+       u64 start = async_extent->start;
+       u64 end = async_extent->start + async_extent->ram_size - 1;
+       unsigned long nr_written = 0;
+       int page_started = 0;
+       int ret;
 
-                       /* allocate blocks */
-                       ret = cow_file_range(inode, async_chunk->locked_page,
-                                            async_extent->start,
-                                            async_extent->start +
-                                            async_extent->ram_size - 1,
-                                            &page_started, &nr_written, 0);
+       /*
+        * Call cow_file_range() to run the delalloc range directly, since we
+        * won't go to NOCOW or async path again.
+        *
+        * Also we call cow_file_range() with @unlock_page == 0, so that we
+        * can directly submit them without interruption.
+        */
+       ret = cow_file_range(inode, locked_page, start, end, &page_started,
+                            &nr_written, 0);
+       /* Inline extent inserted, page gets unlocked and everything is done */
+       if (page_started) {
+               ret = 0;
+               goto out;
+       }
+       if (ret < 0) {
+               if (locked_page)
+                       unlock_page(locked_page);
+               goto out;
+       }
 
-                       /* JDM XXX */
+       ret = extent_write_locked_range(&inode->vfs_inode, start, end);
+       /* All pages will be unlocked, including @locked_page */
+out:
+       kfree(async_extent);
+       return ret;
+}
 
-                       /*
-                        * if page_started, cow_file_range inserted an
-                        * inline extent and took care of all the unlocking
-                        * and IO for us.  Otherwise, we need to submit
-                        * all those pages down to the drive.
-                        */
-                       if (!page_started && !ret)
-                               extent_write_locked_range(&inode->vfs_inode,
-                                                 async_extent->start,
-                                                 async_extent->start +
-                                                 async_extent->ram_size - 1,
-                                                 WB_SYNC_ALL);
-                       else if (ret && async_chunk->locked_page)
-                               unlock_page(async_chunk->locked_page);
-                       kfree(async_extent);
-                       cond_resched();
-                       continue;
-               }
+static int submit_one_async_extent(struct btrfs_inode *inode,
+                                  struct async_chunk *async_chunk,
+                                  struct async_extent *async_extent,
+                                  u64 *alloc_hint)
+{
+       struct extent_io_tree *io_tree = &inode->io_tree;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_key ins;
+       struct page *locked_page = NULL;
+       struct extent_map *em;
+       int ret = 0;
+       u64 start = async_extent->start;
+       u64 end = async_extent->start + async_extent->ram_size - 1;
 
-               ret = btrfs_reserve_extent(root, async_extent->ram_size,
-                                          async_extent->compressed_size,
-                                          async_extent->compressed_size,
-                                          0, alloc_hint, &ins, 1, 1);
-               if (ret) {
-                       free_async_extent_pages(async_extent);
+       /*
+        * If async_chunk->locked_page is in the async_extent range, we need to
+        * handle it.
+        */
+       if (async_chunk->locked_page) {
+               u64 locked_page_start = page_offset(async_chunk->locked_page);
+               u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
 
-                       if (ret == -ENOSPC) {
-                               unlock_extent(io_tree, async_extent->start,
-                                             async_extent->start +
-                                             async_extent->ram_size - 1);
+               if (!(start >= locked_page_end || end <= locked_page_start))
+                       locked_page = async_chunk->locked_page;
+       }
+       lock_extent(io_tree, start, end);
 
-                               /*
-                                * we need to redirty the pages if we decide to
-                                * fallback to uncompressed IO, otherwise we
-                                * will not submit these pages down to lower
-                                * layers.
-                                */
-                               extent_range_redirty_for_io(&inode->vfs_inode,
-                                               async_extent->start,
-                                               async_extent->start +
-                                               async_extent->ram_size - 1);
+       /* We have fallen back to uncompressed write */
+       if (!async_extent->pages)
+               return submit_uncompressed_range(inode, async_extent, locked_page);
 
-                               goto retry;
-                       }
-                       goto out_free;
-               }
+       ret = btrfs_reserve_extent(root, async_extent->ram_size,
+                                  async_extent->compressed_size,
+                                  async_extent->compressed_size,
+                                  0, *alloc_hint, &ins, 1, 1);
+       if (ret) {
+               free_async_extent_pages(async_extent);
                /*
-                * here we're doing allocation and writeback of the
-                * compressed pages
+                * Here we used to try again by going back to the
+                * non-compressed path for ENOSPC.  But if we can't reserve
+                * space even for the compressed size, there is no way it
+                * would work for the uncompressed size, which needs even
+                * more space.  So go directly to the error path.
                 */
-               em = create_io_em(inode, async_extent->start,
-                                 async_extent->ram_size, /* len */
-                                 async_extent->start, /* orig_start */
-                                 ins.objectid, /* block_start */
-                                 ins.offset, /* block_len */
-                                 ins.offset, /* orig_block_len */
-                                 async_extent->ram_size, /* ram_bytes */
-                                 async_extent->compress_type,
-                                 BTRFS_ORDERED_COMPRESSED);
-               if (IS_ERR(em))
-                       /* ret value is not necessary due to void function */
-                       goto out_free_reserve;
-               free_extent_map(em);
-
-               ret = btrfs_add_ordered_extent_compress(inode,
-                                               async_extent->start,
-                                               ins.objectid,
-                                               async_extent->ram_size,
-                                               ins.offset,
-                                               async_extent->compress_type);
-               if (ret) {
-                       btrfs_drop_extent_cache(inode, async_extent->start,
-                                               async_extent->start +
-                                               async_extent->ram_size - 1, 0);
-                       goto out_free_reserve;
-               }
-               btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+               goto out_free;
+       }
+
+       /* Here we're doing allocation and writeback of the compressed pages */
+       em = create_io_em(inode, start,
+                         async_extent->ram_size,       /* len */
+                         start,                        /* orig_start */
+                         ins.objectid,                 /* block_start */
+                         ins.offset,                   /* block_len */
+                         ins.offset,                   /* orig_block_len */
+                         async_extent->ram_size,       /* ram_bytes */
+                         async_extent->compress_type,
+                         BTRFS_ORDERED_COMPRESSED);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto out_free_reserve;
+       }
+       free_extent_map(em);
 
-               /*
-                * clear dirty, set writeback and unlock the pages.
-                */
-               extent_clear_unlock_delalloc(inode, async_extent->start,
-                               async_extent->start +
-                               async_extent->ram_size - 1,
-                               NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
-                               PAGE_UNLOCK | PAGE_START_WRITEBACK);
-               if (btrfs_submit_compressed_write(inode, async_extent->start,
-                                   async_extent->ram_size,
-                                   ins.objectid,
-                                   ins.offset, async_extent->pages,
-                                   async_extent->nr_pages,
-                                   async_chunk->write_flags,
-                                   async_chunk->blkcg_css)) {
-                       struct page *p = async_extent->pages[0];
-                       const u64 start = async_extent->start;
-                       const u64 end = start + async_extent->ram_size - 1;
-
-                       p->mapping = inode->vfs_inode.i_mapping;
-                       btrfs_writepage_endio_finish_ordered(inode, p, start,
-                                                            end, false);
-
-                       p->mapping = NULL;
-                       extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
-                                                    PAGE_END_WRITEBACK |
-                                                    PAGE_SET_ERROR);
-                       free_async_extent_pages(async_extent);
-               }
-               alloc_hint = ins.objectid + ins.offset;
-               kfree(async_extent);
-               cond_resched();
+       ret = btrfs_add_ordered_extent_compress(inode, start,   /* file_offset */
+                                       ins.objectid,           /* disk_bytenr */
+                                       async_extent->ram_size, /* num_bytes */
+                                       ins.offset,             /* disk_num_bytes */
+                                       async_extent->compress_type);
+       if (ret) {
+               btrfs_drop_extent_cache(inode, start, end, 0);
+               goto out_free_reserve;
        }
-       return;
+       btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+       /* Clear dirty, set writeback and unlock the pages. */
+       extent_clear_unlock_delalloc(inode, start, end,
+                       NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+                       PAGE_UNLOCK | PAGE_START_WRITEBACK);
+       if (btrfs_submit_compressed_write(inode, start, /* file_offset */
+                           async_extent->ram_size,     /* num_bytes */
+                           ins.objectid,               /* disk_bytenr */
+                           ins.offset,                 /* compressed_len */
+                           async_extent->pages,        /* compressed_pages */
+                           async_extent->nr_pages,
+                           async_chunk->write_flags,
+                           async_chunk->blkcg_css)) {
+               const u64 start = async_extent->start;
+               const u64 end = start + async_extent->ram_size - 1;
+
+               btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);
+
+               extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+                                            PAGE_END_WRITEBACK | PAGE_SET_ERROR);
+               free_async_extent_pages(async_extent);
+       }
+       *alloc_hint = ins.objectid + ins.offset;
+       kfree(async_extent);
+       return ret;
+
 out_free_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 out_free:
-       extent_clear_unlock_delalloc(inode, async_extent->start,
-                                    async_extent->start +
-                                    async_extent->ram_size - 1,
+       extent_clear_unlock_delalloc(inode, start, end,
                                     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
                                     EXTENT_DELALLOC_NEW |
                                     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
@@ -1009,7 +1030,39 @@ out_free:
                                     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
        free_async_extent_pages(async_extent);
        kfree(async_extent);
-       goto again;
+       return ret;
+}
+
+/*
+ * Phase two of compressed writeback.  This is the ordered portion of the code,
+ * which only gets called in the order the work was queued.  We walk all the
+ * async extents created by compress_file_range and send them down to the disk.
+ */
+static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
+{
+       struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct async_extent *async_extent;
+       u64 alloc_hint = 0;
+       int ret = 0;
+
+       while (!list_empty(&async_chunk->extents)) {
+               u64 extent_start;
+               u64 ram_size;
+
+               async_extent = list_entry(async_chunk->extents.next,
+                                         struct async_extent, list);
+               list_del(&async_extent->list);
+               extent_start = async_extent->start;
+               ram_size = async_extent->ram_size;
+
+               ret = submit_one_async_extent(inode, async_chunk, async_extent,
+                                             &alloc_hint);
+               btrfs_debug(fs_info,
+"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
+                           inode->root->root_key.objectid,
+                           btrfs_ino(inode), extent_start, ram_size, ret);
+       }
 }
 
 static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
@@ -1152,7 +1205,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
         * fails during the stage where it updates the bytenr of file extent
         * items.
         */
-       if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+       if (btrfs_is_data_reloc_root(root))
                min_alloc_size = num_bytes;
        else
                min_alloc_size = fs_info->sectorsize;
@@ -1188,8 +1241,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
                if (ret)
                        goto out_drop_extent_cache;
 
-               if (root->root_key.objectid ==
-                   BTRFS_DATA_RELOC_TREE_OBJECTID) {
+               if (btrfs_is_data_reloc_root(root)) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
                        /*
@@ -1327,18 +1379,17 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 static noinline void async_cow_free(struct btrfs_work *work)
 {
        struct async_chunk *async_chunk;
+       struct async_cow *async_cow;
 
        async_chunk = container_of(work, struct async_chunk, work);
        if (async_chunk->inode)
                btrfs_add_delayed_iput(async_chunk->inode);
        if (async_chunk->blkcg_css)
                css_put(async_chunk->blkcg_css);
-       /*
-        * Since the pointer to 'pending' is at the beginning of the array of
-        * async_chunk's, freeing it ensures the whole array has been freed.
-        */
-       if (atomic_dec_and_test(async_chunk->pending))
-               kvfree(async_chunk->pending);
+
+       async_cow = async_chunk->async_cow;
+       if (atomic_dec_and_test(&async_cow->num_chunks))
+               kvfree(async_cow);
 }
 
 static int cow_file_range_async(struct btrfs_inode *inode,
@@ -1399,7 +1450,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
                 * lightweight reference for the callback lifetime
                 */
                ihold(&inode->vfs_inode);
-               async_chunk[i].pending = &ctx->num_chunks;
+               async_chunk[i].async_cow = ctx;
                async_chunk[i].inode = &inode->vfs_inode;
                async_chunk[i].start = start;
                async_chunk[i].end = cur_end;
@@ -1472,7 +1523,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
 
        __set_page_dirty_nobuffers(locked_page);
        account_page_redirty(locked_page);
-       extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL);
+       extent_write_locked_range(&inode->vfs_inode, start, end);
        *page_started = 1;
 
        return 0;
@@ -1505,8 +1556,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
                           int *page_started, unsigned long *nr_written)
 {
        const bool is_space_ino = btrfs_is_free_space_inode(inode);
-       const bool is_reloc_ino = (inode->root->root_key.objectid ==
-                                  BTRFS_DATA_RELOC_TREE_OBJECTID);
+       const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
        const u64 range_bytes = end + 1 - start;
        struct extent_io_tree *io_tree = &inode->io_tree;
        u64 range_start = start;
@@ -1868,8 +1918,7 @@ out_check:
                        btrfs_dec_nocow_writers(fs_info, disk_bytenr);
                nocow = false;
 
-               if (root->root_key.objectid ==
-                   BTRFS_DATA_RELOC_TREE_OBJECTID)
+               if (btrfs_is_data_reloc_root(root))
                        /*
                         * Error handled later, as we must prevent
                         * extent_clear_unlock_delalloc() in error handler
@@ -1948,8 +1997,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
        int ret;
        const bool zoned = btrfs_is_zoned(inode->root->fs_info);
 
+       /*
+        * The range must cover part of the @locked_page, or the returned
+        * @page_started can confuse the caller.
+        */
+       ASSERT(!(end <= page_offset(locked_page) ||
+                start >= page_offset(locked_page) + PAGE_SIZE));
+
        if (should_nocow(inode, start, end)) {
-               ASSERT(!zoned);
+               /*
+                * Normally on a zoned device we're only doing COW writes, but
+                * in case of relocation on a zoned filesystem we have taken
+                * precautions to only write sequentially.  It's safe to use
+                * run_delalloc_nocow() here, like for regular preallocated
+                * inodes.
+                */
+               ASSERT(!zoned ||
+                      (zoned && btrfs_is_data_reloc_root(inode->root)));
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, nr_written);
        } else if (!inode_can_compress(inode) ||
@@ -2208,7 +2272,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
                if (btrfs_is_testing(fs_info))
                        return;
 
-               if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
+               if (!btrfs_is_data_reloc_root(root) &&
                    do_list && !(state->state & EXTENT_NORESERVE) &&
                    (*bits & EXTENT_CLEAR_DATA_RESV))
                        btrfs_free_reserved_data_space_noquota(fs_info, len);
@@ -2236,48 +2300,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
 }
 
 /*
- * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
- * in a chunk's stripe. This function ensures that bios do not span a
- * stripe/chunk
- *
- * @page - The page we are about to add to the bio
- * @size - size we want to add to the bio
- * @bio - bio we want to ensure is smaller than a stripe
- * @bio_flags - flags of the bio
- *
- * return 1 if page cannot be added to the bio
- * return 0 if page can be added to the bio
- * return error otherwise
- */
-int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
-                            unsigned long bio_flags)
-{
-       struct inode *inode = page->mapping->host;
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       u64 logical = bio->bi_iter.bi_sector << 9;
-       u32 bio_len = bio->bi_iter.bi_size;
-       struct extent_map *em;
-       int ret = 0;
-       struct btrfs_io_geometry geom;
-
-       if (bio_flags & EXTENT_BIO_COMPRESSED)
-               return 0;
-
-       em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
-       if (IS_ERR(em))
-               return PTR_ERR(em);
-       ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom);
-       if (ret < 0)
-               goto out;
-
-       if (geom.len < bio_len + size)
-               ret = 1;
-out:
-       free_extent_map(em);
-       return ret;
-}
-
-/*
  * in order to insert checksums into the metadata in large chunks,
  * we wait until bio submission time.   All the pages in the bio are
  * checksummed and sums are attached onto the ordered extent record.
@@ -2533,7 +2555,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
                goto mapit;
        } else if (async && !skip_sum) {
                /* csum items have already been cloned */
-               if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+               if (btrfs_is_data_reloc_root(root))
                        goto mapit;
                /* we're doing a write, do the async checksumming */
                ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
@@ -2766,7 +2788,7 @@ out_page:
                clear_page_dirty_for_io(page);
                SetPageError(page);
        }
-       ClearPageChecked(page);
+       btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
        unlock_page(page);
        put_page(page);
        kfree(fixup);
@@ -2821,7 +2843,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
         * page->mapping outside of the page lock.
         */
        ihold(inode);
-       SetPageChecked(page);
+       btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
        get_page(page);
        btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
        fixup->page = page;
@@ -3012,8 +3034,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                goto out;
        }
 
-       if (ordered_extent->bdev)
+       /* A valid bdev implies a write on a sequential zone */
+       if (ordered_extent->bdev) {
                btrfs_rewrite_logical_zoned(ordered_extent);
+               btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
+                                       ordered_extent->disk_num_bytes);
+       }
 
        btrfs_free_io_failure_record(inode, start, end);
 
@@ -3210,7 +3236,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
  *
  * The length of such check is always one sector size.
  */
-static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
                           u32 bio_offset, struct page *page, u32 pgoff,
                           u64 start)
 {
@@ -3226,7 +3252,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
        ASSERT(pgoff + len <= PAGE_SIZE);
 
        offset_sectors = bio_offset >> fs_info->sectorsize_bits;
-       csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size;
+       csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
 
        kaddr = kmap_atomic(page);
        shash->tfm = fs_info->csum_shash;
@@ -3240,9 +3266,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
        return 0;
 zeroit:
        btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
-                                   io_bio->mirror_num);
-       if (io_bio->device)
-               btrfs_dev_stat_inc_and_print(io_bio->device,
+                                   bbio->mirror_num);
+       if (bbio->device)
+               btrfs_dev_stat_inc_and_print(bbio->device,
                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
        memset(kaddr + pgoff, 1, len);
        flush_dcache_page(page);
@@ -3262,33 +3288,29 @@ zeroit:
  * Return a bitmap where bit set means a csum mismatch, and bit not set means
  * csum match.
  */
-unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
-                                   struct page *page, u64 start, u64 end)
+unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
+                                   u32 bio_offset, struct page *page,
+                                   u64 start, u64 end)
 {
        struct inode *inode = page->mapping->host;
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        const u32 sectorsize = root->fs_info->sectorsize;
        u32 pg_off;
        unsigned int result = 0;
 
-       if (PageChecked(page)) {
-               ClearPageChecked(page);
+       if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
+               btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
                return 0;
        }
 
        /*
-        * For subpage case, above PageChecked is not safe as it's not subpage
-        * compatible.
-        * But for now only cow fixup and compressed read utilize PageChecked
-        * flag, while in this context we can easily use io_bio->csum to
-        * determine if we really need to do csum verification.
-        *
-        * So for now, just exit if io_bio->csum is NULL, as it means it's
-        * compressed read, and its compressed data csum has already been
-        * verified.
+        * This only happens for NODATASUM or compressed read.
+        * Normally this should be covered by the above check for compressed read
+        * or the next check for NODATASUM.  Just do a quicker exit here.
         */
-       if (io_bio->csum == NULL)
+       if (bbio->csum == NULL)
                return 0;
 
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
@@ -3305,7 +3327,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
                u64 file_offset = pg_off + page_offset(page);
                int ret;
 
-               if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+               if (btrfs_is_data_reloc_root(root) &&
                    test_range_bit(io_tree, file_offset,
                                   file_offset + sectorsize - 1,
                                   EXTENT_NODATASUM, 1, NULL)) {
@@ -3315,7 +3337,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
                                          EXTENT_NODATASUM);
                        continue;
                }
-               ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+               ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
                                      page_offset(page) + pg_off);
                if (ret < 0) {
                        const int nr_bit = (pg_off - offset_in_page(start)) >>
@@ -4006,7 +4028,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
         * without delay
         */
        if (!btrfs_is_free_space_inode(inode)
-           && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+           && !btrfs_is_data_reloc_root(root)
            && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
                btrfs_update_root_times(trans, root);
 
@@ -4036,11 +4058,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
  * also drops the back refs in the inode to the directory
  */
 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
                                struct btrfs_inode *dir,
                                struct btrfs_inode *inode,
                                const char *name, int name_len)
 {
+       struct btrfs_root *root = dir->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        int ret = 0;
@@ -4100,19 +4122,9 @@ skip_backref:
                goto err;
        }
 
-       ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
-                       dir_ino);
-       if (ret != 0 && ret != -ENOENT) {
-               btrfs_abort_transaction(trans, ret);
-               goto err;
-       }
-
-       ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
-                       index);
-       if (ret == -ENOENT)
-               ret = 0;
-       else if (ret)
-               btrfs_abort_transaction(trans, ret);
+       btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+                                  dir_ino);
+       btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
 
        /*
         * If we have a pending delayed iput we could end up with the final iput
@@ -4140,15 +4152,14 @@ out:
 }
 
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
-                      struct btrfs_root *root,
                       struct btrfs_inode *dir, struct btrfs_inode *inode,
                       const char *name, int name_len)
 {
        int ret;
-       ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+       ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
        if (!ret) {
                drop_nlink(&inode->vfs_inode);
-               ret = btrfs_update_inode(trans, root, inode);
+               ret = btrfs_update_inode(trans, inode->root, inode);
        }
        return ret;
 }
@@ -4177,7 +4188,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-       struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
        struct inode *inode = d_inode(dentry);
        int ret;
@@ -4189,7 +4199,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
                        0);
 
-       ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+       ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
                        BTRFS_I(d_inode(dentry)), dentry->d_name.name,
                        dentry->d_name.len);
        if (ret)
@@ -4203,7 +4213,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 out:
        btrfs_end_transaction(trans);
-       btrfs_btree_balance_dirty(root->fs_info);
+       btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
        return ret;
 }
 
@@ -4370,7 +4380,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
        struct inode *inode;
        u64 objectid = 0;
 
-       if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (!BTRFS_FS_ERROR(fs_info))
                WARN_ON(btrfs_root_refs(&root->root_item) != 0);
 
        spin_lock(&root->inode_lock);
@@ -4554,7 +4564,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = d_inode(dentry);
        int err = 0;
-       struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
        u64 last_unlink_trans;
 
@@ -4579,7 +4588,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
 
        /* now the directory is empty */
-       err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+       err = btrfs_unlink_inode(trans, BTRFS_I(dir),
                        BTRFS_I(d_inode(dentry)), dentry->d_name.name,
                        dentry->d_name.len);
        if (!err) {
@@ -4600,7 +4609,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        }
 out:
        btrfs_end_transaction(trans);
-       btrfs_btree_balance_dirty(root->fs_info);
+       btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
 
        return err;
 }
@@ -4909,9 +4918,9 @@ delete:
 
                        btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
                                        extent_start, extent_num_bytes, 0);
-                       ref.real_root = root->root_key.objectid;
                        btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
-                                       ino, extent_offset);
+                                       ino, extent_offset,
+                                       root->root_key.objectid, false);
                        ret = btrfs_free_extent(trans, &ref);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
@@ -5107,7 +5116,8 @@ again:
                                     len);
                flush_dcache_page(page);
        }
-       ClearPageChecked(page);
+       btrfs_page_clear_checked(fs_info, page, block_start,
+                                block_end + 1 - block_start);
        btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
        unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
 
@@ -6437,7 +6447,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        struct btrfs_inode_ref *ref;
        struct btrfs_key key[2];
        u32 sizes[2];
-       int nitems = name ? 2 : 1;
+       struct btrfs_item_batch batch;
        unsigned long ptr;
        unsigned int nofs_flag;
        int ret;
@@ -6529,7 +6539,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                goto fail;
        }
 
-       ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
+       batch.keys = &key[0];
+       batch.data_sizes = &sizes[0];
+       batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
+       batch.nr = name ? 2 : 1;
+       ret = btrfs_insert_empty_items(trans, root, path, &batch);
        if (ret != 0)
                goto fail_unlock;
 
@@ -7963,7 +7977,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
                iomap->type = IOMAP_MAPPED;
        }
        iomap->offset = start;
-       iomap->bdev = fs_info->fs_devices->latest_bdev;
+       iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
        iomap->length = len;
 
        if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
@@ -8040,13 +8054,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
 
        if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
                __endio_write_update_ordered(BTRFS_I(dip->inode),
-                                            dip->logical_offset,
+                                            dip->file_offset,
                                             dip->bytes,
                                             !dip->dio_bio->bi_status);
        } else {
                unlock_extent(&BTRFS_I(dip->inode)->io_tree,
-                             dip->logical_offset,
-                             dip->logical_offset + dip->bytes - 1);
+                             dip->file_offset,
+                             dip->file_offset + dip->bytes - 1);
        }
 
        bio_endio(dip->dio_bio);
@@ -8074,10 +8088,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
        return ret;
 }
 
-static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
-                                            struct btrfs_io_bio *io_bio,
+static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
+                                            struct btrfs_bio *bbio,
                                             const bool uptodate)
 {
+       struct inode *inode = dip->inode;
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
        const u32 sectorsize = fs_info->sectorsize;
        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
@@ -8085,11 +8100,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
        const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
        struct bio_vec bvec;
        struct bvec_iter iter;
-       u64 start = io_bio->logical;
+       const u64 orig_file_offset = dip->file_offset;
+       u64 start = orig_file_offset;
        u32 bio_offset = 0;
        blk_status_t err = BLK_STS_OK;
 
-       __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) {
+       __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
                unsigned int i, nr_sectors, pgoff;
 
                nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
@@ -8097,7 +8113,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
                for (i = 0; i < nr_sectors; i++) {
                        ASSERT(pgoff < PAGE_SIZE);
                        if (uptodate &&
-                           (!csum || !check_data_csum(inode, io_bio,
+                           (!csum || !check_data_csum(inode, bbio,
                                                       bio_offset, bvec.bv_page,
                                                       pgoff, start))) {
                                clean_io_failure(fs_info, failure_tree, io_tree,
@@ -8107,12 +8123,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
                        } else {
                                int ret;
 
-                               ASSERT((start - io_bio->logical) < UINT_MAX);
+                               ASSERT((start - orig_file_offset) < UINT_MAX);
                                ret = btrfs_repair_one_sector(inode,
-                                               &io_bio->bio,
-                                               start - io_bio->logical,
+                                               &bbio->bio,
+                                               start - orig_file_offset,
                                                bvec.bv_page, pgoff,
-                                               start, io_bio->mirror_num,
+                                               start, bbio->mirror_num,
                                                submit_dio_repair_bio);
                                if (ret)
                                        err = errno_to_blk_status(ret);
@@ -8153,15 +8169,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
                           bio->bi_opf, bio->bi_iter.bi_sector,
                           bio->bi_iter.bi_size, err);
 
-       if (bio_op(bio) == REQ_OP_READ) {
-               err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio),
-                                              !err);
-       }
+       if (bio_op(bio) == REQ_OP_READ)
+               err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
 
        if (err)
                dip->dio_bio->bi_status = err;
 
-       btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio);
+       btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
 
        bio_put(bio);
        btrfs_dio_private_put(dip);
@@ -8203,10 +8217,10 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
        } else {
                u64 csum_offset;
 
-               csum_offset = file_offset - dip->logical_offset;
+               csum_offset = file_offset - dip->file_offset;
                csum_offset >>= fs_info->sectorsize_bits;
                csum_offset *= fs_info->csum_size;
-               btrfs_io_bio(bio)->csum = dip->csums + csum_offset;
+               btrfs_bio(bio)->csum = dip->csums + csum_offset;
        }
 map:
        ret = btrfs_map_bio(fs_info, bio, 0);
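/*
 * Illustrative aside, not part of the patch: a worked example of the
 * csum_offset arithmetic above, assuming a 4 KiB sectorsize
 * (sectorsize_bits == 12) and crc32c checksums (csum_size == 4); both
 * values are assumptions for illustration only.
 *
 *   file_offset - dip->file_offset = 8192   bytes into the direct IO
 *   8192 >> 12                     = 2      sectors precede this bio
 *   2 * 4                          = 8      byte offset into dip->csums
 *
 * i.e. this bio's checksums start at the third 4-byte csum slot.
 */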
@@ -8241,7 +8255,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
                return NULL;
 
        dip->inode = inode;
-       dip->logical_offset = file_offset;
+       dip->file_offset = file_offset;
        dip->bytes = dio_bio->bi_iter.bi_size;
        dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
        dip->dio_bio = dio_bio;
@@ -8322,7 +8336,6 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
                bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
                bio->bi_private = dip;
                bio->bi_end_io = btrfs_end_dio_bio;
-               btrfs_io_bio(bio)->logical = file_offset;
 
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                        status = extract_ordered_extent(BTRFS_I(inode), bio,
@@ -8696,9 +8709,9 @@ next:
         * did something wrong.
         */
        ASSERT(!PageOrdered(page));
+       btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
        if (!inode_evicting)
                __btrfs_releasepage(page, GFP_NOFS);
-       ClearPageChecked(page);
        clear_page_extent_mapped(page);
 }
 
@@ -8842,7 +8855,7 @@ again:
                memzero_page(page, zero_start, PAGE_SIZE - zero_start);
                flush_dcache_page(page);
        }
-       ClearPageChecked(page);
+       btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
        btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
        btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
 
@@ -9152,8 +9165,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
        WARN_ON(inode->block_rsv.reserved);
        WARN_ON(inode->block_rsv.size);
        WARN_ON(inode->outstanding_extents);
-       WARN_ON(inode->delalloc_bytes);
-       WARN_ON(inode->new_delalloc_bytes);
+       if (!S_ISDIR(vfs_inode->i_mode)) {
+               WARN_ON(inode->delalloc_bytes);
+               WARN_ON(inode->new_delalloc_bytes);
+       }
        WARN_ON(inode->csum_bytes);
        WARN_ON(inode->defrag_bytes);
 
@@ -9450,7 +9465,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
        } else { /* src is an inode */
-               ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+               ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
                                           BTRFS_I(old_dentry->d_inode),
                                           old_dentry->d_name.name,
                                           old_dentry->d_name.len);
@@ -9466,7 +9481,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
        if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
                ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
        } else { /* dest is an inode */
-               ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+               ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
                                           BTRFS_I(new_dentry->d_inode),
                                           new_dentry->d_name.name,
                                           new_dentry->d_name.len);
@@ -9741,7 +9756,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
                 */
                btrfs_pin_log_trans(root);
                log_pinned = true;
-               ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
+               ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
                                        BTRFS_I(d_inode(old_dentry)),
                                        old_dentry->d_name.name,
                                        old_dentry->d_name.len);
@@ -9761,7 +9776,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
                        ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
                        BUG_ON(new_inode->i_nlink == 0);
                } else {
-                       ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
+                       ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
                                                 BTRFS_I(d_inode(new_dentry)),
                                                 new_dentry->d_name.name,
                                                 new_dentry->d_name.len);
@@ -9979,7 +9994,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte
        };
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                return -EROFS;
 
        return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
@@ -9998,7 +10013,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
        struct list_head splice;
        int ret;
 
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                return -EROFS;
 
        INIT_LIST_HEAD(&splice);
index 36ff713..02ff085 100644 (file)
@@ -48,6 +48,7 @@
 #include "space-info.h"
 #include "delalloc-space.h"
 #include "block-group.h"
+#include "subpage.h"
 
 #ifdef CONFIG_64BIT
 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 {
        compat_uptr_t clone_sources;    /* in */
        __u64 parent_root;              /* in */
        __u64 flags;                    /* in */
-       __u64 reserved[4];              /* in */
+       __u32 version;                  /* in */
+       __u8  reserved[28];             /* in */
 } __attribute__ ((__packed__));
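
/*
 * Illustrative aside, not part of the patch: splitting the reserved area
 * above keeps the packed UAPI layout the same size.  A build-time check
 * along these lines could express that (hypothetical, shown only to make
 * the arithmetic explicit):
 *
 *   BUILD_BUG_ON(sizeof(__u32) + 28 * sizeof(__u8) != 4 * sizeof(__u64));
 *
 * i.e. 4 + 28 == 32 bytes, so the struct size and the offsets of the
 * preceding members are unchanged for 32-bit callers.
 */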
 
 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
@@ -985,129 +987,32 @@ out:
        return ret;
 }
 
-/*
- * When we're defragging a range, we don't want to kick it off again
- * if it is really just waiting for delalloc to send it down.
- * If we find a nice big extent or delalloc range for the bytes in the
- * file you want to defrag, we return 0 to let you know to skip this
- * part of the file
- */
-static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
-{
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct extent_map *em = NULL;
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-       u64 end;
-
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
-       read_unlock(&em_tree->lock);
-
-       if (em) {
-               end = extent_map_end(em);
-               free_extent_map(em);
-               if (end - offset > thresh)
-                       return 0;
-       }
-       /* if we already have a nice delalloc here, just stop */
-       thresh /= 2;
-       end = count_range_bits(io_tree, &offset, offset + thresh,
-                              thresh, EXTENT_DELALLOC, 1);
-       if (end >= thresh)
-               return 0;
-       return 1;
-}
-
-/*
- * helper function to walk through a file and find extents
- * newer than a specific transid, and smaller than thresh.
- *
- * This is used by the defragging code to find new and small
- * extents
- */
-static int find_new_extents(struct btrfs_root *root,
-                           struct inode *inode, u64 newer_than,
-                           u64 *off, u32 thresh)
-{
-       struct btrfs_path *path;
-       struct btrfs_key min_key;
-       struct extent_buffer *leaf;
-       struct btrfs_file_extent_item *extent;
-       int type;
-       int ret;
-       u64 ino = btrfs_ino(BTRFS_I(inode));
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       min_key.objectid = ino;
-       min_key.type = BTRFS_EXTENT_DATA_KEY;
-       min_key.offset = *off;
-
-       while (1) {
-               ret = btrfs_search_forward(root, &min_key, path, newer_than);
-               if (ret != 0)
-                       goto none;
-process_slot:
-               if (min_key.objectid != ino)
-                       goto none;
-               if (min_key.type != BTRFS_EXTENT_DATA_KEY)
-                       goto none;
-
-               leaf = path->nodes[0];
-               extent = btrfs_item_ptr(leaf, path->slots[0],
-                                       struct btrfs_file_extent_item);
-
-               type = btrfs_file_extent_type(leaf, extent);
-               if (type == BTRFS_FILE_EXTENT_REG &&
-                   btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
-                   check_defrag_in_cache(inode, min_key.offset, thresh)) {
-                       *off = min_key.offset;
-                       btrfs_free_path(path);
-                       return 0;
-               }
-
-               path->slots[0]++;
-               if (path->slots[0] < btrfs_header_nritems(leaf)) {
-                       btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
-                       goto process_slot;
-               }
-
-               if (min_key.offset == (u64)-1)
-                       goto none;
-
-               min_key.offset++;
-               btrfs_release_path(path);
-       }
-none:
-       btrfs_free_path(path);
-       return -ENOENT;
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+                                              bool locked)
 {
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_map *em;
-       u64 len = PAGE_SIZE;
+       const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
 
        /*
         * hopefully we have this extent in the tree already, try without
         * the full extent lock
         */
        read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, start, len);
+       em = lookup_extent_mapping(em_tree, start, sectorsize);
        read_unlock(&em_tree->lock);
 
        if (!em) {
                struct extent_state *cached = NULL;
-               u64 end = start + len - 1;
+               u64 end = start + sectorsize - 1;
 
                /* get the big lock and read metadata off disk */
-               lock_extent_bits(io_tree, start, end, &cached);
-               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
-               unlock_extent_cached(io_tree, start, end, &cached);
+               if (!locked)
+                       lock_extent_bits(io_tree, start, end, &cached);
+               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+               if (!locked)
+                       unlock_extent_cached(io_tree, start, end, &cached);
 
                if (IS_ERR(em))
                        return NULL;
@@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
        return em;
 }
 
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+                                    bool locked)
 {
        struct extent_map *next;
        bool ret = true;
@@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
        if (em->start + em->len >= i_size_read(inode))
                return false;
 
-       next = defrag_lookup_extent(inode, em->start + em->len);
+       next = defrag_lookup_extent(inode, em->start + em->len, locked);
        if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
                ret = false;
        else if ((em->block_start + em->block_len == next->block_start) &&
@@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
        return ret;
 }
 
-static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
-                              u64 *last_len, u64 *skip, u64 *defrag_end,
-                              int compress)
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page.
+ * - The page is uptodate.
+ *
+ * NOTE: The caller should also wait for page writeback after the cluster is
+ * prepared; we don't wait for writeback on each individual page here.
+ */
+static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
+                                           pgoff_t index)
 {
-       struct extent_map *em;
-       int ret = 1;
-       bool next_mergeable = true;
-       bool prev_mergeable = true;
+       struct address_space *mapping = inode->vfs_inode.i_mapping;
+       gfp_t mask = btrfs_alloc_write_mask(mapping);
+       u64 page_start = (u64)index << PAGE_SHIFT;
+       u64 page_end = page_start + PAGE_SIZE - 1;
+       struct extent_state *cached_state = NULL;
+       struct page *page;
+       int ret;
+
+again:
+       page = find_or_create_page(mapping, index, mask);
+       if (!page)
+               return ERR_PTR(-ENOMEM);
 
        /*
-        * make sure that once we start defragging an extent, we keep on
-        * defragging it
+        * Since we can defragment files opened read-only, we can encounter
+        * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
+        * can't do I/O using huge pages yet, so return an error for now.
+        * Filesystem transparent huge pages are typically only used for
+        * executables that explicitly enable them, so this isn't very
+        * restrictive.
         */
-       if (start < *defrag_end)
-               return 1;
+       if (PageCompound(page)) {
+               unlock_page(page);
+               put_page(page);
+               return ERR_PTR(-ETXTBSY);
+       }
 
-       *skip = 0;
+       ret = set_page_extent_mapped(page);
+       if (ret < 0) {
+               unlock_page(page);
+               put_page(page);
+               return ERR_PTR(ret);
+       }
 
-       em = defrag_lookup_extent(inode, start);
-       if (!em)
-               return 0;
+       /* Wait for any existing ordered extent in the range */
+       while (1) {
+               struct btrfs_ordered_extent *ordered;
 
-       /* this will cover holes, and inline extents */
-       if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-               ret = 0;
-               goto out;
-       }
+               lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+               ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
+               unlock_extent_cached(&inode->io_tree, page_start, page_end,
+                                    &cached_state);
+               if (!ordered)
+                       break;
 
-       if (!*defrag_end)
-               prev_mergeable = false;
+               unlock_page(page);
+               btrfs_start_ordered_extent(ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+               lock_page(page);
+               /*
+                * We unlocked the page above, so we need to check if it was
+                * released or not.
+                */
+               if (page->mapping != mapping || !PagePrivate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto again;
+               }
+       }
 
-       next_mergeable = defrag_check_next_extent(inode, em);
-       /*
-        * we hit a real extent, if it is big or the next extent is not a
-        * real extent, don't bother defragging it
-        */
-       if (!compress && (*last_len == 0 || *last_len >= thresh) &&
-           (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
-               ret = 0;
-out:
        /*
-        * last_len ends up being a counter of how many bytes we've defragged.
-        * every time we choose not to defrag an extent, we reset *last_len
-        * so that the next tiny extent will force a defrag.
-        *
-        * The end result of this is that tiny extents before a single big
-        * extent will force at least part of that big extent to be defragged.
+        * Now the page range has no ordered extent any more.  Read the page to
+        * make it uptodate.
         */
-       if (ret) {
-               *defrag_end = extent_map_end(em);
-       } else {
-               *last_len = 0;
-               *skip = extent_map_end(em);
-               *defrag_end = 0;
+       if (!PageUptodate(page)) {
+               btrfs_readpage(NULL, page);
+               lock_page(page);
+               if (page->mapping != mapping || !PagePrivate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto again;
+               }
+               if (!PageUptodate(page)) {
+                       unlock_page(page);
+                       put_page(page);
+                       return ERR_PTR(-EIO);
+               }
        }
-
-       free_extent_map(em);
-       return ret;
+       return page;
 }
 
+struct defrag_target_range {
+       struct list_head list;
+       u64 start;
+       u64 len;
+};
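
/*
 * Illustrative aside, not part of the patch: a minimal sketch of how a
 * list of defrag_target_range entries produced by a collector like the
 * one below is typically walked and then torn down.  The helper
 * process_range() is hypothetical and the real consumers are not shown
 * in this excerpt.
 */
static int walk_defrag_targets_sketch(struct list_head *target_list)
{
        struct defrag_target_range *entry;
        struct defrag_target_range *tmp;
        int ret = 0;

        /* Act on each collected [start, start + len) byte range */
        list_for_each_entry(entry, target_list, list) {
                ret = process_range(entry->start, entry->len); /* hypothetical */
                if (ret < 0)
                        break;
        }
        /* Always drain and free the list, even after an error */
        list_for_each_entry_safe(entry, tmp, target_list, list) {
                list_del_init(&entry->list);
                kfree(entry);
        }
        return ret;
}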
+
 /*
- * it doesn't do much good to defrag one or two pages
- * at a time.  This pulls in a nice chunk of pages
- * to COW and defrag.
- *
- * It also makes sure the delalloc code has enough
- * dirty data to avoid making new small extents as part
- * of the defrag
+ * Collect all valid target extents.
  *
- * It's a good idea to start RA on this range
- * before calling this.
+ * @start:         file offset to lookup
+ * @len:           length to lookup
+ * @extent_thresh: file extent size threshold, any extent size >= this value
+ *                 will be ignored
+ * @newer_than:    only defrag extents newer than this value
+ * @do_compress:   whether the defrag is doing compression
+ *                 if true, @extent_thresh will be ignored and all regular
+ *                 file extents meeting @newer_than will be targets.
+ * @locked:        if the extent range is already locked by the caller
+ * @target_list:   list of collected target file extents
  */
-static int cluster_pages_for_defrag(struct inode *inode,
-                                   struct page **pages,
-                                   unsigned long start_index,
-                                   unsigned long num_pages)
+static int defrag_collect_targets(struct btrfs_inode *inode,
+                                 u64 start, u64 len, u32 extent_thresh,
+                                 u64 newer_than, bool do_compress,
+                                 bool locked, struct list_head *target_list)
 {
-       unsigned long file_end;
-       u64 isize = i_size_read(inode);
-       u64 page_start;
-       u64 page_end;
-       u64 page_cnt;
-       u64 start = (u64)start_index << PAGE_SHIFT;
-       u64 search_start;
-       int ret;
-       int i;
-       int i_done;
-       struct btrfs_ordered_extent *ordered;
-       struct extent_state *cached_state = NULL;
-       struct extent_io_tree *tree;
-       struct extent_changeset *data_reserved = NULL;
-       gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+       u64 cur = start;
+       int ret = 0;
 
-       file_end = (isize - 1) >> PAGE_SHIFT;
-       if (!isize || start_index > file_end)
-               return 0;
+       while (cur < start + len) {
+               struct extent_map *em;
+               struct defrag_target_range *new;
+               bool next_mergeable = true;
+               u64 range_len;
 
-       page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
+               em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+               if (!em)
+                       break;
 
-       ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
-                       start, page_cnt << PAGE_SHIFT);
-       if (ret)
-               return ret;
-       i_done = 0;
-       tree = &BTRFS_I(inode)->io_tree;
+               /* Skip hole/inline/preallocated extents */
+               if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
+                   test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                       goto next;
 
-       /* step one, lock all the pages */
-       for (i = 0; i < page_cnt; i++) {
-               struct page *page;
-again:
-               page = find_or_create_page(inode->i_mapping,
-                                          start_index + i, mask);
-               if (!page)
-                       break;
+               /* Skip older extent */
+               if (em->generation < newer_than)
+                       goto next;
 
-               ret = set_page_extent_mapped(page);
-               if (ret < 0) {
-                       unlock_page(page);
-                       put_page(page);
-                       break;
+               /*
+                * For do_compress case, we want to compress all valid file
+                * extents, thus no @extent_thresh or mergeable check.
+                */
+               if (do_compress)
+                       goto add;
+
+               /* Skip extents that are too large */
+               if (em->len >= extent_thresh)
+                       goto next;
+
+               next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
+                                                         locked);
+               if (!next_mergeable) {
+                       struct defrag_target_range *last;
+
+                       /* Empty target list, no way to merge with last entry */
+                       if (list_empty(target_list))
+                               goto next;
+                       last = list_entry(target_list->prev,
+                                         struct defrag_target_range, list);
+                       /* Not mergeable with last entry */
+                       if (last->start + last->len != cur)
+                               goto next;
+
+                       /* Mergeable, fall through to add it to @target_list. */
                }
 
-               page_start = page_offset(page);
-               page_end = page_start + PAGE_SIZE - 1;
-               while (1) {
-                       lock_extent_bits(tree, page_start, page_end,
-                                        &cached_state);
-                       ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode),
-                                                             page_start);
-                       unlock_extent_cached(tree, page_start, page_end,
-                                            &cached_state);
-                       if (!ordered)
-                               break;
-
-                       unlock_page(page);
-                       btrfs_start_ordered_extent(ordered, 1);
-                       btrfs_put_ordered_extent(ordered);
-                       lock_page(page);
-                       /*
-                        * we unlocked the page above, so we need check if
-                        * it was released or not.
-                        */
-                       if (page->mapping != inode->i_mapping) {
-                               unlock_page(page);
-                               put_page(page);
-                               goto again;
+add:
+               range_len = min(extent_map_end(em), start + len) - cur;
+               /*
+                * This one is a good target, check if it can be merged into
+                * the last range of the target list.
+                */
+               if (!list_empty(target_list)) {
+                       struct defrag_target_range *last;
+
+                       last = list_entry(target_list->prev,
+                                         struct defrag_target_range, list);
+                       ASSERT(last->start + last->len <= cur);
+                       if (last->start + last->len == cur) {
+                               /* Mergeable, enlarge the last entry */
+                               last->len += range_len;
+                               goto next;
                        }
+                       /* Fall through to allocate a new entry */
                }
 
-               if (!PageUptodate(page)) {
-                       btrfs_readpage(NULL, page);
-                       lock_page(page);
-                       if (!PageUptodate(page)) {
-                               unlock_page(page);
-                               put_page(page);
-                               ret = -EIO;
-                               break;
-                       }
+               /* Allocate new defrag_target_range */
+               new = kmalloc(sizeof(*new), GFP_NOFS);
+               if (!new) {
+                       free_extent_map(em);
+                       ret = -ENOMEM;
+                       break;
                }
+               new->start = cur;
+               new->len = range_len;
+               list_add_tail(&new->list, target_list);
 
-               if (page->mapping != inode->i_mapping) {
-                       unlock_page(page);
-                       put_page(page);
-                       goto again;
+next:
+               cur = extent_map_end(em);
+               free_extent_map(em);
+       }
+       if (ret < 0) {
+               struct defrag_target_range *entry;
+               struct defrag_target_range *tmp;
+
+               list_for_each_entry_safe(entry, tmp, target_list, list) {
+                       list_del_init(&entry->list);
+                       kfree(entry);
                }
+       }
+       return ret;
+}
+
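A minimal userspace sketch of the merge rule implemented by defrag_collect_targets() above (illustrative only, not kernel code; the struct and helper names are made up for the example): a newly collected range is folded into the previous entry only when it starts exactly where the previous entry ends, otherwise a new entry is appended.

	/* Illustrative sketch of the target merging rule, not kernel code. */
	#include <stdio.h>

	struct range { unsigned long long start, len; };

	static int add_target(struct range *list, int count,
			      unsigned long long start, unsigned long long len)
	{
		/* Mergeable only if the new range starts right at the previous end */
		if (count && list[count - 1].start + list[count - 1].len == start) {
			list[count - 1].len += len;
			return count;
		}
		list[count].start = start;
		list[count].len = len;
		return count + 1;
	}

	int main(void)
	{
		struct range targets[8];
		int i, n = 0;

		n = add_target(targets, n, 0, 16384);		/* [0, 16K)           */
		n = add_target(targets, n, 16384, 8192);	/* merged -> [0, 24K) */
		n = add_target(targets, n, 65536, 4096);	/* gap, new entry     */
		for (i = 0; i < n; i++)
			printf("target %d: start=%llu len=%llu\n",
			       i, targets[i].start, targets[i].len);
		return 0;
	}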
+#define CLUSTER_SIZE   (SZ_256K)
+
+/*
+ * Defrag one contiguous target range.
+ *
+ * @inode:     target inode
+ * @target:    target range to defrag
+ * @pages:     locked pages covering the defrag range
+ * @nr_pages:  number of locked pages
+ *
+ * Caller should ensure:
+ *
+ * - Pages are prepared
+ *   Pages should be locked, no ordered extent in the page range,
+ *   no writeback.
+ *
+ * - Extent bits are locked
+ */
+static int defrag_one_locked_target(struct btrfs_inode *inode,
+                                   struct defrag_target_range *target,
+                                   struct page **pages, int nr_pages,
+                                   struct extent_state **cached_state)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct extent_changeset *data_reserved = NULL;
+       const u64 start = target->start;
+       const u64 len = target->len;
+       unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
+       unsigned long start_index = start >> PAGE_SHIFT;
+       unsigned long first_index = page_index(pages[0]);
+       int ret = 0;
+       int i;
+
+       ASSERT(last_index - first_index + 1 <= nr_pages);
+
+       ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
+       if (ret < 0)
+               return ret;
+       clear_extent_bit(&inode->io_tree, start, start + len - 1,
+                        EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+                        EXTENT_DEFRAG, 0, 0, cached_state);
+       set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
 
-               pages[i] = page;
-               i_done++;
+       /* Update the page status */
+       for (i = start_index - first_index; i <= last_index - first_index; i++) {
+               ClearPageChecked(pages[i]);
+               btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
        }
-       if (!i_done || ret)
-               goto out;
+       btrfs_delalloc_release_extents(inode, len);
+       extent_changeset_free(data_reserved);
 
-       if (!(inode->i_sb->s_flags & SB_ACTIVE))
-               goto out;
+       return ret;
+}
 
-       /*
-        * so now we have a nice long stream of locked
-        * and up to date pages, lets wait on them
-        */
-       for (i = 0; i < i_done; i++)
-               wait_on_page_writeback(pages[i]);
+static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
+                           u32 extent_thresh, u64 newer_than, bool do_compress)
+{
+       struct extent_state *cached_state = NULL;
+       struct defrag_target_range *entry;
+       struct defrag_target_range *tmp;
+       LIST_HEAD(target_list);
+       struct page **pages;
+       const u32 sectorsize = inode->root->fs_info->sectorsize;
+       u64 last_index = (start + len - 1) >> PAGE_SHIFT;
+       u64 start_index = start >> PAGE_SHIFT;
+       unsigned int nr_pages = last_index - start_index + 1;
+       int ret = 0;
+       int i;
+
+       ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
+       ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
 
-       page_start = page_offset(pages[0]);
-       page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
+       pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+       if (!pages)
+               return -ENOMEM;
 
-       lock_extent_bits(&BTRFS_I(inode)->io_tree,
-                        page_start, page_end - 1, &cached_state);
+       /* Prepare all pages */
+       for (i = 0; i < nr_pages; i++) {
+               pages[i] = defrag_prepare_one_page(inode, start_index + i);
+               if (IS_ERR(pages[i])) {
+                       ret = PTR_ERR(pages[i]);
+                       pages[i] = NULL;
+                       goto free_pages;
+               }
+       }
+       for (i = 0; i < nr_pages; i++)
+               wait_on_page_writeback(pages[i]);
 
+       /* Lock the page range */
+       lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
+                        (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+                        &cached_state);
        /*
-        * When defragmenting we skip ranges that have holes or inline extents,
-        * (check should_defrag_range()), to avoid unnecessary IO and wasting
-        * space. At btrfs_defrag_file(), we check if a range should be defragged
-        * before locking the inode and then, if it should, we trigger a sync
-        * page cache readahead - we lock the inode only after that to avoid
-        * blocking for too long other tasks that possibly want to operate on
-        * other file ranges. But before we were able to get the inode lock,
-        * some other task may have punched a hole in the range, or we may have
-        * now an inline extent, in which case we should not defrag. So check
-        * for that here, where we have the inode and the range locked, and bail
-        * out if that happened.
+        * Now we have a consistent view of the extent map, re-check
+        * which ranges really need to be defragged.
+        *
+        * This time the extent range is already locked, so pass @locked = true
+        * to avoid relocking it and causing a deadlock.
         */
-       search_start = page_start;
-       while (search_start < page_end) {
-               struct extent_map *em;
+       ret = defrag_collect_targets(inode, start, len, extent_thresh,
+                                    newer_than, do_compress, true,
+                                    &target_list);
+       if (ret < 0)
+               goto unlock_extent;
 
-               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start,
-                                     page_end - search_start);
-               if (IS_ERR(em)) {
-                       ret = PTR_ERR(em);
-                       goto out_unlock_range;
-               }
-               if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-                       free_extent_map(em);
-                       /* Ok, 0 means we did not defrag anything */
-                       ret = 0;
-                       goto out_unlock_range;
+       list_for_each_entry(entry, &target_list, list) {
+               ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+                                              &cached_state);
+               if (ret < 0)
+                       break;
+       }
+
+       list_for_each_entry_safe(entry, tmp, &target_list, list) {
+               list_del_init(&entry->list);
+               kfree(entry);
+       }
+unlock_extent:
+       unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
+                            (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+                            &cached_state);
+free_pages:
+       for (i = 0; i < nr_pages; i++) {
+               if (pages[i]) {
+                       unlock_page(pages[i]);
+                       put_page(pages[i]);
                }
-               search_start = extent_map_end(em);
-               free_extent_map(em);
        }
+       kfree(pages);
+       return ret;
+}
 
-       clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
-                         page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-                         EXTENT_DEFRAG, 0, 0, &cached_state);
+static int defrag_one_cluster(struct btrfs_inode *inode,
+                             struct file_ra_state *ra,
+                             u64 start, u32 len, u32 extent_thresh,
+                             u64 newer_than, bool do_compress,
+                             unsigned long *sectors_defragged,
+                             unsigned long max_sectors)
+{
+       const u32 sectorsize = inode->root->fs_info->sectorsize;
+       struct defrag_target_range *entry;
+       struct defrag_target_range *tmp;
+       LIST_HEAD(target_list);
+       int ret;
 
-       if (i_done != page_cnt) {
-               spin_lock(&BTRFS_I(inode)->lock);
-               btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
-               spin_unlock(&BTRFS_I(inode)->lock);
-               btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-                               start, (page_cnt - i_done) << PAGE_SHIFT, true);
-       }
+       BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
+       ret = defrag_collect_targets(inode, start, len, extent_thresh,
+                                    newer_than, do_compress, false,
+                                    &target_list);
+       if (ret < 0)
+               goto out;
 
+       list_for_each_entry(entry, &target_list, list) {
+               u32 range_len = entry->len;
 
-       set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
-                         &cached_state);
+               /* Reached the limit */
+               if (max_sectors && max_sectors == *sectors_defragged)
+                       break;
 
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                            page_start, page_end - 1, &cached_state);
+               if (max_sectors)
+                       range_len = min_t(u32, range_len,
+                               (max_sectors - *sectors_defragged) * sectorsize);
 
-       for (i = 0; i < i_done; i++) {
-               clear_page_dirty_for_io(pages[i]);
-               ClearPageChecked(pages[i]);
-               set_page_dirty(pages[i]);
-               unlock_page(pages[i]);
-               put_page(pages[i]);
+               if (ra)
+                       page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+                               ra, NULL, entry->start >> PAGE_SHIFT,
+                               ((entry->start + range_len - 1) >> PAGE_SHIFT) -
+                               (entry->start >> PAGE_SHIFT) + 1);
+               /*
+                * We may end up not defragging any range here if holes were
+                * punched before we locked the pages.
+                * That's fine, it only affects the @sectors_defragged
+                * accounting.
+                */
+               ret = defrag_one_range(inode, entry->start, range_len,
+                                      extent_thresh, newer_than, do_compress);
+               if (ret < 0)
+                       break;
+               *sectors_defragged += range_len;
        }
-       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
-       extent_changeset_free(data_reserved);
-       return i_done;
-
-out_unlock_range:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                            page_start, page_end - 1, &cached_state);
 out:
-       for (i = 0; i < i_done; i++) {
-               unlock_page(pages[i]);
-               put_page(pages[i]);
+       list_for_each_entry_safe(entry, tmp, &target_list, list) {
+               list_del_init(&entry->list);
+               kfree(entry);
        }
-       btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
-                       start, page_cnt << PAGE_SHIFT, true);
-       btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
-       extent_changeset_free(data_reserved);
        return ret;
-
 }
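A worked example of the range_len clamping above (assuming a 4 KiB sectorsize): with max_sectors = 16 and *sectors_defragged = 10, a 64 KiB target entry has its range_len clamped to (16 - 10) * 4096 = 24576 bytes before defrag_one_range() is called, so a single oversized target cannot exceed the remaining budget.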
 
-int btrfs_defrag_file(struct inode *inode, struct file *file,
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode:         inode to be defragged
+ * @ra:            readahead state (can be NULL)
+ * @range:         defrag options including range and flags
+ * @newer_than:    minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged, if 0 the whole inode
+ *                 will be defragged
+ */
+int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
                      struct btrfs_ioctl_defrag_range_args *range,
                      u64 newer_than, unsigned long max_to_defrag)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct file_ra_state *ra = NULL;
-       unsigned long last_index;
+       unsigned long sectors_defragged = 0;
        u64 isize = i_size_read(inode);
-       u64 last_len = 0;
-       u64 skip = 0;
-       u64 defrag_end = 0;
-       u64 newer_off = range->start;
-       unsigned long i;
-       unsigned long ra_index = 0;
-       int ret;
-       int defrag_count = 0;
+       u64 cur;
+       u64 last_byte;
+       bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
+       bool ra_allocated = false;
        int compress_type = BTRFS_COMPRESS_ZLIB;
+       int ret = 0;
        u32 extent_thresh = range->extent_thresh;
-       unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
-       unsigned long cluster = max_cluster;
-       u64 new_align = ~((u64)SZ_128K - 1);
-       struct page **pages = NULL;
-       bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
 
        if (isize == 0)
                return 0;
@@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
        if (extent_thresh == 0)
                extent_thresh = SZ_256K;
 
+       if (range->start + range->len > range->start) {
+               /* Got a specific range */
+               last_byte = min(isize, range->start + range->len) - 1;
+       } else {
+               /* Defrag until file end */
+               last_byte = isize - 1;
+       }
+
        /*
-        * If we were not given a file, allocate a readahead context. As
+        * If we were not given a ra, allocate a readahead context. As
         * readahead is just an optimization, defrag will work without it so
         * we don't error out.
         */
-       if (!file) {
+       if (!ra) {
+               ra_allocated = true;
                ra = kzalloc(sizeof(*ra), GFP_KERNEL);
                if (ra)
                        file_ra_state_init(ra, inode->i_mapping);
-       } else {
-               ra = &file->f_ra;
-       }
-
-       pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
-       if (!pages) {
-               ret = -ENOMEM;
-               goto out_ra;
-       }
-
-       /* find the last page to defrag */
-       if (range->start + range->len > range->start) {
-               last_index = min_t(u64, isize - 1,
-                        range->start + range->len - 1) >> PAGE_SHIFT;
-       } else {
-               last_index = (isize - 1) >> PAGE_SHIFT;
-       }
-
-       if (newer_than) {
-               ret = find_new_extents(root, inode, newer_than,
-                                      &newer_off, SZ_64K);
-               if (!ret) {
-                       range->start = newer_off;
-                       /*
-                        * we always align our defrag to help keep
-                        * the extents in the file evenly spaced
-                        */
-                       i = (newer_off & new_align) >> PAGE_SHIFT;
-               } else
-                       goto out_ra;
-       } else {
-               i = range->start >> PAGE_SHIFT;
        }
-       if (!max_to_defrag)
-               max_to_defrag = last_index - i + 1;
 
-       /*
-        * make writeback starts from i, so the defrag range can be
-        * written sequentially.
-        */
-       if (i < inode->i_mapping->writeback_index)
-               inode->i_mapping->writeback_index = i;
-
-       while (i <= last_index && defrag_count < max_to_defrag &&
-              (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
-               /*
-                * make sure we stop running if someone unmounts
-                * the FS
-                */
-               if (!(inode->i_sb->s_flags & SB_ACTIVE))
-                       break;
-
-               if (btrfs_defrag_cancelled(fs_info)) {
-                       btrfs_debug(fs_info, "defrag_file cancelled");
-                       ret = -EAGAIN;
-                       goto error;
-               }
+       /* Align the range */
+       cur = round_down(range->start, fs_info->sectorsize);
+       last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
 
-               if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
-                                        extent_thresh, &last_len, &skip,
-                                        &defrag_end, do_compress)){
-                       unsigned long next;
-                       /*
-                        * the should_defrag function tells us how much to skip
-                        * bump our counter by the suggested amount
-                        */
-                       next = DIV_ROUND_UP(skip, PAGE_SIZE);
-                       i = max(i + 1, next);
-                       continue;
-               }
+       while (cur < last_byte) {
+               u64 cluster_end;
 
-               if (!newer_than) {
-                       cluster = (PAGE_ALIGN(defrag_end) >>
-                                  PAGE_SHIFT) - i;
-                       cluster = min(cluster, max_cluster);
-               } else {
-                       cluster = max_cluster;
-               }
+               /* The cluster size 256K should always be page aligned */
+               BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
 
-               if (i + cluster > ra_index) {
-                       ra_index = max(i, ra_index);
-                       if (ra)
-                               page_cache_sync_readahead(inode->i_mapping, ra,
-                                               file, ra_index, cluster);
-                       ra_index += cluster;
-               }
+               /* We want the cluster to end at a page boundary when possible */
+               cluster_end = (((cur >> PAGE_SHIFT) +
+                              (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
+               cluster_end = min(cluster_end, last_byte);
 
                btrfs_inode_lock(inode, 0);
                if (IS_SWAPFILE(inode)) {
                        ret = -ETXTBSY;
-               } else {
-                       if (do_compress)
-                               BTRFS_I(inode)->defrag_compress = compress_type;
-                       ret = cluster_pages_for_defrag(inode, pages, i, cluster);
+                       btrfs_inode_unlock(inode, 0);
+                       break;
                }
-               if (ret < 0) {
+               if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
                        btrfs_inode_unlock(inode, 0);
-                       goto out_ra;
+                       break;
                }
-
-               defrag_count += ret;
-               balance_dirty_pages_ratelimited(inode->i_mapping);
+               if (do_compress)
+                       BTRFS_I(inode)->defrag_compress = compress_type;
+               ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+                               cluster_end + 1 - cur, extent_thresh,
+                               newer_than, do_compress,
+                               &sectors_defragged, max_to_defrag);
                btrfs_inode_unlock(inode, 0);
-
-               if (newer_than) {
-                       if (newer_off == (u64)-1)
-                               break;
-
-                       if (ret > 0)
-                               i += ret;
-
-                       newer_off = max(newer_off + 1,
-                                       (u64)i << PAGE_SHIFT);
-
-                       ret = find_new_extents(root, inode, newer_than,
-                                              &newer_off, SZ_64K);
-                       if (!ret) {
-                               range->start = newer_off;
-                               i = (newer_off & new_align) >> PAGE_SHIFT;
-                       } else {
-                               break;
-                       }
-               } else {
-                       if (ret > 0) {
-                               i += ret;
-                               last_len += ret << PAGE_SHIFT;
-                       } else {
-                               i++;
-                               last_len = 0;
-                       }
-               }
+               if (ret < 0)
+                       break;
+               cur = cluster_end + 1;
        }
 
-       ret = defrag_count;
-error:
-       if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
-               filemap_flush(inode->i_mapping);
-               if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-                            &BTRFS_I(inode)->runtime_flags))
+       if (ra_allocated)
+               kfree(ra);
+       if (sectors_defragged) {
+               /*
+                * We have defragged some sectors; in the compression case
+                * they need to be written back immediately.
+                */
+               if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
                        filemap_flush(inode->i_mapping);
+                       if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+                                    &BTRFS_I(inode)->runtime_flags))
+                               filemap_flush(inode->i_mapping);
+               }
+               if (range->compress_type == BTRFS_COMPRESS_LZO)
+                       btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
+               else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
+                       btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
+               ret = sectors_defragged;
        }
-
-       if (range->compress_type == BTRFS_COMPRESS_LZO) {
-               btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
-       } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
-               btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
-       }
-
-out_ra:
        if (do_compress) {
                btrfs_inode_lock(inode, 0);
                BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
                btrfs_inode_unlock(inode, 0);
        }
-       if (!file)
-               kfree(ra);
-       kfree(pages);
        return ret;
 }
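A worked example of the cluster_end computation above (assuming 4 KiB pages, PAGE_SHIFT = 12): for cur = 0x5000, cur >> PAGE_SHIFT = 5 and SZ_256K >> PAGE_SHIFT = 64, so cluster_end = ((5 + 64) << 12) - 1 = 0x44fff. The cluster therefore covers exactly 256 KiB starting at cur and ends on a page boundary, before being clamped to last_byte by the min() that follows.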
 
@@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
 static noinline int btrfs_ioctl_resize(struct file *file,
                                        void __user *arg)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 new_size;
@@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                btrfs_info(fs_info, "resizing devid %llu", devid);
        }
 
-       device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+       args.devid = devid;
+       device = btrfs_find_device(fs_info->fs_devices, &args);
        if (!device) {
                btrfs_info(fs_info, "resizer unable to find device %llu",
                           devid);
@@ -3136,12 +3097,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                goto out;
        }
 
-       /* Subpage defrag will be supported in later commits */
-       if (root->fs_info->sectorsize < PAGE_SIZE) {
-               ret = -ENOTTY;
-               goto out;
-       }
-
        switch (inode->i_mode & S_IFMT) {
        case S_IFDIR:
                if (!capable(CAP_SYS_ADMIN)) {
@@ -3176,7 +3131,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                        /* the rest are all set to zero by kzalloc */
                        range.len = (u64)-1;
                }
-               ret = btrfs_defrag_file(file_inode(file), file,
+               ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
                                        &range, BTRFS_OLDEST_GENERATION, 0);
                if (ret > 0)
                        ret = 0;
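For context, a minimal userspace sketch of how this path is reached through the defrag range ioctl (assuming the usual <linux/btrfs.h> UAPI definitions; error handling trimmed, helper name made up):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/btrfs.h>

	static int defrag_whole_file(const char *path)
	{
		struct btrfs_ioctl_defrag_range_args range;
		int fd = open(path, O_RDWR);
		int ret;

		if (fd < 0)
			return -1;
		memset(&range, 0, sizeof(range));
		range.len = (__u64)-1;			/* defrag to EOF */
		range.extent_thresh = 256 * 1024;	/* skip extents >= 256K */
		range.flags = BTRFS_DEFRAG_RANGE_START_IO;
		ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
		close(fd);
		return ret;
	}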
@@ -3220,6 +3175,7 @@ out:
 
 static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_vol_args_v2 *vol_args;
@@ -3231,35 +3187,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       ret = mnt_want_write_file(file);
-       if (ret)
-               return ret;
-
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
-               goto err_drop;
+               goto out;
        }
 
        if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
                ret = -EOPNOTSUPP;
                goto out;
        }
+
        vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
-       if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) &&
-           strcmp("cancel", vol_args->name) == 0)
+       if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
+               args.devid = vol_args->devid;
+       } else if (!strcmp("cancel", vol_args->name)) {
                cancel = true;
+       } else {
+               ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+               if (ret)
+                       goto out;
+       }
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               goto out;
 
        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
                                           cancel);
        if (ret)
-               goto out;
-       /* Exclusive operation is now claimed */
+               goto err_drop;
 
-       if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
-               ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode);
-       else
-               ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+       /* Exclusive operation is now claimed */
+       ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
 
        btrfs_exclop_finish(fs_info);
 
@@ -3271,17 +3231,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
                        btrfs_info(fs_info, "device deleted: %s",
                                        vol_args->name);
        }
-out:
-       kfree(vol_args);
 err_drop:
        mnt_drop_write_file(file);
        if (bdev)
                blkdev_put(bdev, mode);
+out:
+       btrfs_put_dev_args_from_path(&args);
+       kfree(vol_args);
        return ret;
 }
 
 static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct inode *inode = file_inode(file);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ioctl_vol_args *vol_args;
@@ -3293,32 +3255,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       ret = mnt_want_write_file(file);
-       if (ret)
-               return ret;
-
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args)) {
-               ret = PTR_ERR(vol_args);
-               goto out_drop_write;
-       }
+       if (IS_ERR(vol_args))
+               return PTR_ERR(vol_args);
+
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-       cancel = (strcmp("cancel", vol_args->name) == 0);
+       if (!strcmp("cancel", vol_args->name)) {
+               cancel = true;
+       } else {
+               ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
+               if (ret)
+                       goto out;
+       }
+
+       ret = mnt_want_write_file(file);
+       if (ret)
+               goto out;
 
        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
                                           cancel);
        if (ret == 0) {
-               ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode);
+               ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
                if (!ret)
                        btrfs_info(fs_info, "disk deleted %s", vol_args->name);
                btrfs_exclop_finish(fs_info);
        }
 
-       kfree(vol_args);
-out_drop_write:
        mnt_drop_write_file(file);
        if (bdev)
                blkdev_put(bdev, mode);
+out:
+       btrfs_put_dev_args_from_path(&args);
+       kfree(vol_args);
        return ret;
 }
 
@@ -3379,22 +3347,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
 static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
                                 void __user *arg)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_ioctl_dev_info_args *di_args;
        struct btrfs_device *dev;
        int ret = 0;
-       char *s_uuid = NULL;
 
        di_args = memdup_user(arg, sizeof(*di_args));
        if (IS_ERR(di_args))
                return PTR_ERR(di_args);
 
+       args.devid = di_args->devid;
        if (!btrfs_is_empty_uuid(di_args->uuid))
-               s_uuid = di_args->uuid;
+               args.uuid = di_args->uuid;
 
        rcu_read_lock();
-       dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
-                               NULL);
-
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (!dev) {
                ret = -ENODEV;
                goto out;
@@ -4430,7 +4397,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
                                                void __user *arg)
 {
        struct btrfs_ioctl_quota_rescan_args qsa = {0};
-       int ret = 0;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -4441,9 +4407,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
        }
 
        if (copy_to_user(arg, &qsa, sizeof(qsa)))
-               ret = -EFAULT;
+               return -EFAULT;
 
-       return ret;
+       return 0;
 }
 
 static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
index a2e1f1f..bbc4553 100644
@@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
 
 #ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
-       lockdep_assert_held(&eb->lock);
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
+{
+       lockdep_assert_held_write(&eb->lock);
 }
 #else
-static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
 #endif
 
 void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
index 295bbc1..65cb076 100644
  *     payload.
  *     One regular LZO compressed extent can have one or more segments.
  *     For inlined LZO compressed extent, only one segment is allowed.
- *     One segment represents at most one page of uncompressed data.
+ *     One segment represents at most one sector of uncompressed data.
  *
  * 2.1 Segment header
  *     Fixed size. LZO_LEN (4) bytes long, LE32.
  *     Records the total size of the segment (not including the header).
- *     Segment header never crosses page boundary, thus it's possible to
- *     have at most 3 padding zeros at the end of the page.
+ *     Segment header never crosses sector boundary, thus it's possible to
+ *     have at most 3 padding zeros at the end of the sector.
  *
  * 2.2 Data Payload
- *     Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE)
- *     which is 4419 for a 4KiB page.
+ *     Variable size. The upper size limit is lzo1x_worst_compress(sectorsize),
+ *     which is 4419 for a 4KiB sectorsize.
  *
- * Example:
+ * Example with 4K sectorsize:
  * Page 1:
  *          0     0x2   0x4   0x6   0x8   0xa   0xc   0xe     0x10
  * 0x0000   |  Header   | SegHdr 01 | Data payload 01 ...     |
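For reference, the 4419-byte bound quoted above follows from the kernel's lzo1x_worst_compress() macro (include/linux/lzo.h), which expands to x + x/16 + 64 + 3; for a 4 KiB sector that is 4096 + 256 + 64 + 3 = 4419.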
@@ -112,170 +112,174 @@ static inline size_t read_compress_length(const char *buf)
        return le32_to_cpu(dlen);
 }
 
+/*
+ * Will do:
+ *
+ * - Write a segment header into the destination
+ * - Copy the compressed buffer into the destination
+ * - Make sure we have enough space in the last sector to fit a segment header.
+ *   If not, pad with at most LZO_LEN - 1 (i.e. 3) bytes of zeros.
+ *
+ * Will allocate new pages when needed.
+ */
+static int copy_compressed_data_to_page(char *compressed_data,
+                                       size_t compressed_size,
+                                       struct page **out_pages,
+                                       u32 *cur_out,
+                                       const u32 sectorsize)
+{
+       u32 sector_bytes_left;
+       u32 orig_out;
+       struct page *cur_page;
+       char *kaddr;
+
+       /*
+        * We never allow a segment header to cross a sector boundary; the
+        * previous iteration should have left enough space inside the sector.
+        */
+       ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize);
+
+       cur_page = out_pages[*cur_out / PAGE_SIZE];
+       /* Allocate a new page */
+       if (!cur_page) {
+               cur_page = alloc_page(GFP_NOFS);
+               if (!cur_page)
+                       return -ENOMEM;
+               out_pages[*cur_out / PAGE_SIZE] = cur_page;
+       }
+
+       kaddr = kmap(cur_page);
+       write_compress_length(kaddr + offset_in_page(*cur_out),
+                             compressed_size);
+       *cur_out += LZO_LEN;
+
+       orig_out = *cur_out;
+
+       /* Copy compressed data */
+       while (*cur_out - orig_out < compressed_size) {
+               u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize,
+                                    orig_out + compressed_size - *cur_out);
+
+               kunmap(cur_page);
+               cur_page = out_pages[*cur_out / PAGE_SIZE];
+               /* Allocate a new page */
+               if (!cur_page) {
+                       cur_page = alloc_page(GFP_NOFS);
+                       if (!cur_page)
+                               return -ENOMEM;
+                       out_pages[*cur_out / PAGE_SIZE] = cur_page;
+               }
+               kaddr = kmap(cur_page);
+
+               memcpy(kaddr + offset_in_page(*cur_out),
+                      compressed_data + *cur_out - orig_out, copy_len);
+
+               *cur_out += copy_len;
+       }
+
+       /*
+        * Check if we can fit the next segment header into the remaining space
+        * of the sector.
+        */
+       sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out;
+       if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0)
+               goto out;
+
+       /* The remaining size is not enough, pad it with zeros */
+       memset(kaddr + offset_in_page(*cur_out), 0,
+              sector_bytes_left);
+       *cur_out += sector_bytes_left;
+
+out:
+       kunmap(cur_page);
+       return 0;
+}
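A short worked example of the tail padding above (assuming a 4 KiB sectorsize): if a payload ends at *cur_out = 0x0ffe, then sector_bytes_left = 0x1000 - 0x0ffe = 2, which is smaller than LZO_LEN (4), so two zero bytes are written and *cur_out advances to 0x1000. The next segment header then starts exactly at the sector boundary and can never straddle it, which is what the ASSERT() at the top of this function relies on.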
+
 int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
                u64 start, struct page **pages, unsigned long *out_pages,
                unsigned long *total_in, unsigned long *total_out)
 {
        struct workspace *workspace = list_entry(ws, struct workspace, list);
+       const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize;
+       struct page *page_in = NULL;
+       char *sizes_ptr;
        int ret = 0;
-       char *data_in;
-       char *cpage_out, *sizes_ptr;
-       int nr_pages = 0;
-       struct page *in_page = NULL;
-       struct page *out_page = NULL;
-       unsigned long bytes_left;
-       unsigned long len = *total_out;
-       unsigned long nr_dest_pages = *out_pages;
-       const unsigned long max_out = nr_dest_pages * PAGE_SIZE;
-       size_t in_len;
-       size_t out_len;
-       char *buf;
-       unsigned long tot_in = 0;
-       unsigned long tot_out = 0;
-       unsigned long pg_bytes_left;
-       unsigned long out_offset;
-       unsigned long bytes;
+       /* Points to the file offset of input data */
+       u64 cur_in = start;
+       /* Points to the current output byte */
+       u32 cur_out = 0;
+       u32 len = *total_out;
 
        *out_pages = 0;
        *total_out = 0;
        *total_in = 0;
 
-       in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-       data_in = kmap(in_page);
-
        /*
-        * store the size of all chunks of compressed data in
-        * the first 4 bytes
+        * Skip the header for now; we will come back later and write the
+        * total compressed size
         */
-       out_page = alloc_page(GFP_NOFS);
-       if (out_page == NULL) {
-               ret = -ENOMEM;
-               goto out;
-       }
-       cpage_out = kmap(out_page);
-       out_offset = LZO_LEN;
-       tot_out = LZO_LEN;
-       pages[0] = out_page;
-       nr_pages = 1;
-       pg_bytes_left = PAGE_SIZE - LZO_LEN;
-
-       /* compress at most one page of data each time */
-       in_len = min(len, PAGE_SIZE);
-       while (tot_in < len) {
-               ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
-                                      &out_len, workspace->mem);
-               if (ret != LZO_E_OK) {
-                       pr_debug("BTRFS: lzo in loop returned %d\n",
-                              ret);
+       cur_out += LZO_LEN;
+       while (cur_in < start + len) {
+               char *data_in;
+               const u32 sectorsize_mask = sectorsize - 1;
+               u32 sector_off = (cur_in - start) & sectorsize_mask;
+               u32 in_len;
+               size_t out_len;
+
+               /* Get the input page first */
+               if (!page_in) {
+                       page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT);
+                       ASSERT(page_in);
+               }
+
+               /* Compress at most one sector of data each time */
+               in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
+               ASSERT(in_len);
+               data_in = kmap(page_in);
+               ret = lzo1x_1_compress(data_in +
+                                      offset_in_page(cur_in), in_len,
+                                      workspace->cbuf, &out_len,
+                                      workspace->mem);
+               kunmap(page_in);
+               if (ret < 0) {
+                       pr_debug("BTRFS: lzo in loop returned %d\n", ret);
                        ret = -EIO;
                        goto out;
                }
 
-               /* store the size of this chunk of compressed data */
-               write_compress_length(cpage_out + out_offset, out_len);
-               tot_out += LZO_LEN;
-               out_offset += LZO_LEN;
-               pg_bytes_left -= LZO_LEN;
-
-               tot_in += in_len;
-               tot_out += out_len;
-
-               /* copy bytes from the working buffer into the pages */
-               buf = workspace->cbuf;
-               while (out_len) {
-                       bytes = min_t(unsigned long, pg_bytes_left, out_len);
-
-                       memcpy(cpage_out + out_offset, buf, bytes);
-
-                       out_len -= bytes;
-                       pg_bytes_left -= bytes;
-                       buf += bytes;
-                       out_offset += bytes;
-
-                       /*
-                        * we need another page for writing out.
-                        *
-                        * Note if there's less than 4 bytes left, we just
-                        * skip to a new page.
-                        */
-                       if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
-                           pg_bytes_left == 0) {
-                               if (pg_bytes_left) {
-                                       memset(cpage_out + out_offset, 0,
-                                              pg_bytes_left);
-                                       tot_out += pg_bytes_left;
-                               }
-
-                               /* we're done, don't allocate new page */
-                               if (out_len == 0 && tot_in >= len)
-                                       break;
-
-                               kunmap(out_page);
-                               if (nr_pages == nr_dest_pages) {
-                                       out_page = NULL;
-                                       ret = -E2BIG;
-                                       goto out;
-                               }
-
-                               out_page = alloc_page(GFP_NOFS);
-                               if (out_page == NULL) {
-                                       ret = -ENOMEM;
-                                       goto out;
-                               }
-                               cpage_out = kmap(out_page);
-                               pages[nr_pages++] = out_page;
-
-                               pg_bytes_left = PAGE_SIZE;
-                               out_offset = 0;
-                       }
-               }
+               ret = copy_compressed_data_to_page(workspace->cbuf, out_len,
+                                                  pages, &cur_out, sectorsize);
+               if (ret < 0)
+                       goto out;
 
-               /* we're making it bigger, give up */
-               if (tot_in > 8192 && tot_in < tot_out) {
+               cur_in += in_len;
+
+               /*
+                * Check if we're making the output bigger after two sectors.
+                * If so, give up.
+                */
+               if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) {
                        ret = -E2BIG;
                        goto out;
                }
 
-               /* we're all done */
-               if (tot_in >= len)
-                       break;
-
-               if (tot_out > max_out)
-                       break;
-
-               bytes_left = len - tot_in;
-               kunmap(in_page);
-               put_page(in_page);
-
-               start += PAGE_SIZE;
-               in_page = find_get_page(mapping, start >> PAGE_SHIFT);
-               data_in = kmap(in_page);
-               in_len = min(bytes_left, PAGE_SIZE);
-       }
-
-       if (tot_out >= tot_in) {
-               ret = -E2BIG;
-               goto out;
+               /* Check if we have reached a page boundary */
+               if (IS_ALIGNED(cur_in, PAGE_SIZE)) {
+                       put_page(page_in);
+                       page_in = NULL;
+               }
        }
 
-       /* store the size of all chunks of compressed data */
+       /* Store the size of all chunks of compressed data */
        sizes_ptr = kmap_local_page(pages[0]);
-       write_compress_length(sizes_ptr, tot_out);
+       write_compress_length(sizes_ptr, cur_out);
        kunmap_local(sizes_ptr);
 
        ret = 0;
-       *total_out = tot_out;
-       *total_in = tot_in;
+       *total_out = cur_out;
+       *total_in = cur_in - start;
 out:
-       *out_pages = nr_pages;
-       if (out_page)
-               kunmap(out_page);
-
-       if (in_page) {
-               kunmap(in_page);
-               put_page(in_page);
-       }
-
+       *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE);
        return ret;
 }
 
index d8d268c..0e239a4 100644
@@ -60,8 +60,7 @@ enum btrfs_rbio_ops {
 };
 
 struct btrfs_raid_bio {
-       struct btrfs_fs_info *fs_info;
-       struct btrfs_bio *bbio;
+       struct btrfs_io_context *bioc;
 
        /* while we're doing rmw on a stripe
         * we put it into a hash table so we can
@@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work);
 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
 {
        btrfs_init_work(&rbio->work, work_func, NULL, NULL);
-       btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+       btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
 }
 
 /*
@@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 {
-       u64 num = rbio->bbio->raid_map[0];
+       u64 num = rbio->bioc->raid_map[0];
 
        /*
         * we shift down quite a bit.  We're using byte
@@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
                return;
 
-       table = rbio->fs_info->stripe_hash_table;
+       table = rbio->bioc->fs_info->stripe_hash_table;
        h = table->table + bucket;
 
        /* hold the lock for the bucket because we may be
@@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
                return;
 
-       table = rbio->fs_info->stripe_hash_table;
+       table = rbio->bioc->fs_info->stripe_hash_table;
 
        spin_lock_irqsave(&table->cache_lock, flags);
        __remove_rbio_from_cache(rbio);
@@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
        if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
                return;
 
-       table = rbio->fs_info->stripe_hash_table;
+       table = rbio->bioc->fs_info->stripe_hash_table;
 
        spin_lock_irqsave(&table->cache_lock, flags);
        spin_lock(&rbio->bio_list_lock);
@@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
            test_bit(RBIO_CACHE_BIT, &cur->flags))
                return 0;
 
-       if (last->bbio->raid_map[0] !=
-           cur->bbio->raid_map[0])
+       if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
                return 0;
 
        /* we can't merge with different operations */
@@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
        struct btrfs_raid_bio *cache_drop = NULL;
        int ret = 0;
 
-       h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+       h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
 
        spin_lock_irqsave(&h->lock, flags);
        list_for_each_entry(cur, &h->hash_list, hash_list) {
-               if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+               if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
                        continue;
 
                spin_lock(&cur->bio_list_lock);
@@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
        int keep_cache = 0;
 
        bucket = rbio_bucket(rbio);
-       h = rbio->fs_info->stripe_hash_table->table + bucket;
+       h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
 
        if (list_empty(&rbio->plug_list))
                cache_rbio(rbio);
@@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
                }
        }
 
-       btrfs_put_bbio(rbio->bbio);
+       btrfs_put_bioc(rbio->bioc);
        kfree(rbio);
 }
 
@@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
        struct bio *extra;
 
        if (rbio->generic_bio_cnt)
-               btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+               btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
 
        /*
         * At this moment, rbio->bio_list is empty, however since rbio does not
@@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio)
 
        /* OK, we have read all the stripes we need to. */
        max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
-                    0 : rbio->bbio->max_errors;
+                    0 : rbio->bioc->max_errors;
        if (atomic_read(&rbio->error) > max_errors)
                err = BLK_STS_IOERR;
 
@@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
  * this does not allocate any pages for rbio->pages.
  */
 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
-                                        struct btrfs_bio *bbio,
+                                        struct btrfs_io_context *bioc,
                                         u64 stripe_len)
 {
        struct btrfs_raid_bio *rbio;
        int nr_data = 0;
-       int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+       int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
        int num_pages = rbio_nr_pages(stripe_len, real_stripes);
        int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
        void *p;
@@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
        spin_lock_init(&rbio->bio_list_lock);
        INIT_LIST_HEAD(&rbio->stripe_cache);
        INIT_LIST_HEAD(&rbio->hash_list);
-       rbio->bbio = bbio;
-       rbio->fs_info = fs_info;
+       rbio->bioc = bioc;
        rbio->stripe_len = stripe_len;
        rbio->nr_pages = num_pages;
        rbio->real_stripes = real_stripes;
@@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
        CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
 #undef  CONSUME_ALLOC
 
-       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+       if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
                nr_data = real_stripes - 1;
-       else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+       else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
                nr_data = real_stripes - 2;
        else
                BUG();
@@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
        struct bio *last = bio_list->tail;
        int ret;
        struct bio *bio;
-       struct btrfs_bio_stripe *stripe;
+       struct btrfs_io_stripe *stripe;
        u64 disk_start;
 
-       stripe = &rbio->bbio->stripes[stripe_nr];
+       stripe = &rbio->bioc->stripes[stripe_nr];
        disk_start = stripe->physical + (page_index << PAGE_SHIFT);
 
        /* if the device is missing, just fail this stripe */
@@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
        }
 
        /* put a new bio on the list */
-       bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
-       btrfs_io_bio(bio)->device = stripe->dev;
+       bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
+       btrfs_bio(bio)->device = stripe->dev;
        bio->bi_iter.bi_size = 0;
        bio_set_dev(bio, stripe->dev->bdev);
        bio->bi_iter.bi_sector = disk_start >> 9;
@@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
                int i = 0;
 
                start = bio->bi_iter.bi_sector << 9;
-               stripe_offset = start - rbio->bbio->raid_map[0];
+               stripe_offset = start - rbio->bioc->raid_map[0];
                page_index = stripe_offset >> PAGE_SHIFT;
 
                if (bio_flagged(bio, BIO_CLONED))
-                       bio->bi_iter = btrfs_io_bio(bio)->iter;
+                       bio->bi_iter = btrfs_bio(bio)->iter;
 
                bio_for_each_segment(bvec, bio, iter) {
                        rbio->bio_pages[page_index + i] = bvec.bv_page;
@@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
  */
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
-       struct btrfs_bio *bbio = rbio->bbio;
+       struct btrfs_io_context *bioc = rbio->bioc;
        void **pointers = rbio->finish_pointers;
        int nr_data = rbio->nr_data;
        int stripe;
@@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                }
        }
 
-       if (likely(!bbio->num_tgtdevs))
+       if (likely(!bioc->num_tgtdevs))
                goto write_data;
 
        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
-               if (!bbio->tgtdev_map[stripe])
+               if (!bioc->tgtdev_map[stripe])
                        continue;
 
                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
@@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
                        }
 
                        ret = rbio_add_io_page(rbio, &bio_list, page,
-                                              rbio->bbio->tgtdev_map[stripe],
+                                              rbio->bioc->tgtdev_map[stripe],
                                               pagenr, rbio->stripe_len);
                        if (ret)
                                goto cleanup;
@@ -1339,12 +1336,12 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
 {
        u64 physical = bio->bi_iter.bi_sector;
        int i;
-       struct btrfs_bio_stripe *stripe;
+       struct btrfs_io_stripe *stripe;
 
        physical <<= 9;
 
-       for (i = 0; i < rbio->bbio->num_stripes; i++) {
-               stripe = &rbio->bbio->stripes[i];
+       for (i = 0; i < rbio->bioc->num_stripes; i++) {
+               stripe = &rbio->bioc->stripes[i];
                if (in_range(physical, stripe->physical, rbio->stripe_len) &&
                    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
                        return i;
@@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
        int i;
 
        for (i = 0; i < rbio->nr_data; i++) {
-               u64 stripe_start = rbio->bbio->raid_map[i];
+               u64 stripe_start = rbio->bioc->raid_map[i];
 
                if (in_range(logical, stripe_start, rbio->stripe_len))
                        return i;
@@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio)
        if (!atomic_dec_and_test(&rbio->stripes_pending))
                return;
 
-       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
                goto cleanup;
 
        /*
@@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
        }
 
        /*
-        * the bbio may be freed once we submit the last bio.  Make sure
-        * not to touch it after that
+        * The bioc may be freed once we submit the last bio. Make sure not to
+        * touch it after that.
         */
        atomic_set(&rbio->stripes_pending, bios_to_read);
        while ((bio = bio_list_pop(&bio_list))) {
@@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
                bio->bi_end_io = raid_rmw_end_io;
                bio->bi_opf = REQ_OP_READ;
 
-               btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+               btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
 
                submit_bio(bio);
        }
@@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
 /*
  * our main entry point for writes from the rest of the FS.
  */
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
-                       struct btrfs_bio *bbio, u64 stripe_len)
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+                       u64 stripe_len)
 {
+       struct btrfs_fs_info *fs_info = bioc->fs_info;
        struct btrfs_raid_bio *rbio;
        struct btrfs_plug_cb *plug = NULL;
        struct blk_plug_cb *cb;
        int ret;
 
-       rbio = alloc_rbio(fs_info, bbio, stripe_len);
+       rbio = alloc_rbio(fs_info, bioc, stripe_len);
        if (IS_ERR(rbio)) {
-               btrfs_put_bbio(bbio);
+               btrfs_put_bioc(bioc);
                return PTR_ERR(rbio);
        }
        bio_list_add(&rbio->bio_list, bio);
@@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                }
 
                /* all raid6 handling here */
-               if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+               if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
                        /*
                         * single failure, rebuild from parity raid5
                         * style
@@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                         * here due to a crc mismatch and we can't give them the
                         * data they want
                         */
-                       if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
-                               if (rbio->bbio->raid_map[faila] ==
+                       if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+                               if (rbio->bioc->raid_map[faila] ==
                                    RAID5_P_STRIPE) {
                                        err = BLK_STS_IOERR;
                                        goto cleanup;
@@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
                                goto pstripe;
                        }
 
-                       if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
+                       if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
                                raid6_datap_recov(rbio->real_stripes,
                                                  PAGE_SIZE, faila, pointers);
                        } else {
@@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio)
        if (!atomic_dec_and_test(&rbio->stripes_pending))
                return;
 
-       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
                rbio_orig_end_io(rbio, BLK_STS_IOERR);
        else
                __raid_recover_end_io(rbio);
@@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
                 * were up to date, or we might have no bios to read because
                 * the devices were gone.
                 */
-               if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
+               if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
                        __raid_recover_end_io(rbio);
                        return 0;
                } else {
@@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
        }
 
        /*
-        * the bbio may be freed once we submit the last bio.  Make sure
-        * not to touch it after that
+        * The bioc may be freed once we submit the last bio. Make sure not to
+        * touch it after that.
         */
        atomic_set(&rbio->stripes_pending, bios_to_read);
        while ((bio = bio_list_pop(&bio_list))) {
@@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
                bio->bi_end_io = raid_recover_end_io;
                bio->bi_opf = REQ_OP_READ;
 
-               btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+               btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
 
                submit_bio(bio);
        }
@@ -2116,22 +2114,22 @@ cleanup:
  * so we assume the bio they send down corresponds to a failed part
  * of the drive.
  */
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 stripe_len,
-                         int mirror_num, int generic_io)
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+                         u64 stripe_len, int mirror_num, int generic_io)
 {
+       struct btrfs_fs_info *fs_info = bioc->fs_info;
        struct btrfs_raid_bio *rbio;
        int ret;
 
        if (generic_io) {
-               ASSERT(bbio->mirror_num == mirror_num);
-               btrfs_io_bio(bio)->mirror_num = mirror_num;
+               ASSERT(bioc->mirror_num == mirror_num);
+               btrfs_bio(bio)->mirror_num = mirror_num;
        }
 
-       rbio = alloc_rbio(fs_info, bbio, stripe_len);
+       rbio = alloc_rbio(fs_info, bioc, stripe_len);
        if (IS_ERR(rbio)) {
                if (generic_io)
-                       btrfs_put_bbio(bbio);
+                       btrfs_put_bioc(bioc);
                return PTR_ERR(rbio);
        }
 
@@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
        rbio->faila = find_logical_bio_stripe(rbio, bio);
        if (rbio->faila == -1) {
                btrfs_warn(fs_info,
-       "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
+"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
                           __func__, bio->bi_iter.bi_sector << 9,
-                          (u64)bio->bi_iter.bi_size, bbio->map_type);
+                          (u64)bio->bi_iter.bi_size, bioc->map_type);
                if (generic_io)
-                       btrfs_put_bbio(bbio);
+                       btrfs_put_bioc(bioc);
                kfree(rbio);
                return -EIO;
        }
@@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
                btrfs_bio_counter_inc_noblocked(fs_info);
                rbio->generic_bio_cnt = 1;
        } else {
-               btrfs_get_bbio(bbio);
+               btrfs_get_bioc(bioc);
        }
 
        /*
@@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work)
 /*
  * The following code is used to scrub/replace the parity stripe
  *
- * Caller must have already increased bio_counter for getting @bbio.
+ * Caller must have already increased bio_counter for getting @bioc.
  *
  * Note: We need make sure all the pages that add into the scrub/replace
  * raid bio are correct and not be changed during the scrub/replace. That
  * is those pages just hold metadata or file data with checksum.
  */
 
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 stripe_len,
-                              struct btrfs_device *scrub_dev,
-                              unsigned long *dbitmap, int stripe_nsectors)
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+                               struct btrfs_io_context *bioc,
+                               u64 stripe_len, struct btrfs_device *scrub_dev,
+                               unsigned long *dbitmap, int stripe_nsectors)
 {
+       struct btrfs_fs_info *fs_info = bioc->fs_info;
        struct btrfs_raid_bio *rbio;
        int i;
 
-       rbio = alloc_rbio(fs_info, bbio, stripe_len);
+       rbio = alloc_rbio(fs_info, bioc, stripe_len);
        if (IS_ERR(rbio))
                return NULL;
        bio_list_add(&rbio->bio_list, bio);
@@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
        rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
 
        /*
-        * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
+        * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
         * to the end position, so this search can start from the first parity
         * stripe.
         */
        for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
-               if (bbio->stripes[i].dev == scrub_dev) {
+               if (bioc->stripes[i].dev == scrub_dev) {
                        rbio->scrubp = i;
                        break;
                }
@@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
        bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
 
        /*
-        * We have already increased bio_counter when getting bbio, record it
+        * We have already increased bio_counter when getting bioc, record it
         * so we can free it at rbio_orig_end_io().
         */
        rbio->generic_bio_cnt = 1;
@@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
        int stripe_offset;
        int index;
 
-       ASSERT(logical >= rbio->bbio->raid_map[0]);
-       ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
+       ASSERT(logical >= rbio->bioc->raid_map[0]);
+       ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] +
                                rbio->stripe_len * rbio->nr_data);
-       stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
+       stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
        index = stripe_offset >> PAGE_SHIFT;
        rbio->bio_pages[index] = page;
 }
@@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
                                         int need_check)
 {
-       struct btrfs_bio *bbio = rbio->bbio;
+       struct btrfs_io_context *bioc = rbio->bioc;
        void **pointers = rbio->finish_pointers;
        unsigned long *pbitmap = rbio->finish_pbitmap;
        int nr_data = rbio->nr_data;
@@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
        else
                BUG();
 
-       if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
+       if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
                is_replace = 1;
                bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
        }
@@ -2435,7 +2433,7 @@ writeback:
 
                page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
                ret = rbio_add_io_page(rbio, &bio_list, page,
-                                      bbio->tgtdev_map[rbio->scrubp],
+                                      bioc->tgtdev_map[rbio->scrubp],
                                       pagenr, rbio->stripe_len);
                if (ret)
                        goto cleanup;
@@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
  */
 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
 {
-       if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+       if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
                goto cleanup;
 
        if (rbio->faila >= 0 || rbio->failb >= 0) {
@@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
                 * the data, so the capability of the repair is declined.
                 * (In the case of RAID5, we can not repair anything)
                 */
-               if (dfail > rbio->bbio->max_errors - 1)
+               if (dfail > rbio->bioc->max_errors - 1)
                        goto cleanup;
 
                /*
@@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
        }
 
        /*
-        * the bbio may be freed once we submit the last bio.  Make sure
-        * not to touch it after that
+        * The bioc may be freed once we submit the last bio. Make sure not to
+        * touch it after that.
         */
        atomic_set(&rbio->stripes_pending, bios_to_read);
        while ((bio = bio_list_pop(&bio_list))) {
@@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
                bio->bi_end_io = raid56_parity_scrub_end_io;
                bio->bi_opf = REQ_OP_READ;
 
-               btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
+               btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
 
                submit_bio(bio);
        }
@@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
 /* The following code is used for dev replace of a missing RAID 5/6 device. */
 
 struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 length)
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+                         u64 length)
 {
+       struct btrfs_fs_info *fs_info = bioc->fs_info;
        struct btrfs_raid_bio *rbio;
 
-       rbio = alloc_rbio(fs_info, bbio, length);
+       rbio = alloc_rbio(fs_info, bioc, length);
        if (IS_ERR(rbio))
                return NULL;
 
@@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
        }
 
        /*
-        * When we get bbio, we have already increased bio_counter, record it
+        * When we get bioc, we have already increased bio_counter, record it
         * so we can free it at rbio_orig_end_io()
         */
        rbio->generic_bio_cnt = 1;
index 2503485..72c00fc 100644
@@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map)
 struct btrfs_raid_bio;
 struct btrfs_device;
 
-int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 stripe_len,
-                         int mirror_num, int generic_io);
-int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 stripe_len);
+int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
+                         u64 stripe_len, int mirror_num, int generic_io);
+int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
+                       u64 stripe_len);
 
 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
                            u64 logical);
 
-struct btrfs_raid_bio *
-raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
-                              struct btrfs_bio *bbio, u64 stripe_len,
-                              struct btrfs_device *scrub_dev,
-                              unsigned long *dbitmap, int stripe_nsectors);
+struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
+                               struct btrfs_io_context *bioc, u64 stripe_len,
+                               struct btrfs_device *scrub_dev,
+                               unsigned long *dbitmap, int stripe_nsectors);
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
 
 struct btrfs_raid_bio *
-raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
-                         struct btrfs_bio *bbio, u64 length);
+raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
+                         u64 length);
 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
 
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
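The prototype changes above drop the explicit fs_info argument because the new btrfs_io_context already carries a pointer to it (the .c hunks derive it via bioc->fs_info). A minimal user-space sketch of that calling-convention change; the structures below are simplified stand-ins, not the kernel definitions:

    #include <stdio.h>

    /* Simplified stand-ins for the kernel types; fields are illustrative only. */
    struct fs_info {
            int id;
    };

    struct io_context {
            struct fs_info *fs_info;   /* the context now rides along with the I/O descriptor */
            int num_stripes;
    };

    /*
     * Before the change a caller had to pass both fs_info and the descriptor;
     * after it, the helper derives fs_info from the descriptor itself.
     */
    static int parity_write(struct io_context *bioc, unsigned long stripe_len)
    {
            struct fs_info *fs_info = bioc->fs_info;

            printf("fs %d: writing %d stripes, stripe_len %lu\n",
                   fs_info->id, bioc->num_stripes, stripe_len);
            return 0;
    }

    int main(void)
    {
            struct fs_info fs = { .id = 1 };
            struct io_context bioc = { .fs_info = &fs, .num_stripes = 3 };

            return parity_write(&bioc, 65536UL);
    }
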
index 06713a8..eb96fdc 100644
@@ -227,7 +227,7 @@ start_machine:
 }
 
 static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
-                                         struct btrfs_bio *bbio)
+                                         struct btrfs_io_context *bioc)
 {
        struct btrfs_fs_info *fs_info = dev->fs_info;
        int ret;
@@ -275,11 +275,11 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
        kref_init(&zone->refcnt);
        zone->elems = 0;
        zone->device = dev; /* our device always sits at index 0 */
-       for (i = 0; i < bbio->num_stripes; ++i) {
+       for (i = 0; i < bioc->num_stripes; ++i) {
                /* bounds have already been checked */
-               zone->devs[i] = bbio->stripes[i].dev;
+               zone->devs[i] = bioc->stripes[i].dev;
        }
-       zone->ndevs = bbio->num_stripes;
+       zone->ndevs = bioc->num_stripes;
 
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&dev->reada_zones,
@@ -309,7 +309,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
        int ret;
        struct reada_extent *re = NULL;
        struct reada_extent *re_exist = NULL;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        struct btrfs_device *dev;
        struct btrfs_device *prev_dev;
        u64 length;
@@ -345,28 +345,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
         */
        length = fs_info->nodesize;
        ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-                       &length, &bbio, 0);
-       if (ret || !bbio || length < fs_info->nodesize)
+                             &length, &bioc, 0);
+       if (ret || !bioc || length < fs_info->nodesize)
                goto error;
 
-       if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
+       if (bioc->num_stripes > BTRFS_MAX_MIRRORS) {
                btrfs_err(fs_info,
                           "readahead: more than %d copies not supported",
                           BTRFS_MAX_MIRRORS);
                goto error;
        }
 
-       real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+       real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
        for (nzones = 0; nzones < real_stripes; ++nzones) {
                struct reada_zone *zone;
 
-               dev = bbio->stripes[nzones].dev;
+               dev = bioc->stripes[nzones].dev;
 
                /* cannot read ahead on missing device. */
                if (!dev->bdev)
                        continue;
 
-               zone = reada_find_zone(dev, logical, bbio);
+               zone = reada_find_zone(dev, logical, bioc);
                if (!zone)
                        continue;
 
@@ -464,7 +464,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
        if (!have_zone)
                goto error;
 
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
        return re;
 
 error:
@@ -488,7 +488,7 @@ error:
                kref_put(&zone->refcnt, reada_zone_release);
                spin_unlock(&fs_info->reada_lock);
        }
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
        kfree(re);
        return re_exist;
 }
index d2062d5..e2b9f86 100644
@@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
 
        if (generic_ref->type == BTRFS_REF_METADATA) {
                if (!parent)
-                       ref_root = generic_ref->tree_ref.root;
+                       ref_root = generic_ref->tree_ref.owning_root;
                owner = generic_ref->tree_ref.level;
        } else if (!parent) {
-               ref_root = generic_ref->data_ref.ref_root;
+               ref_root = generic_ref->data_ref.owning_root;
                owner = generic_ref->data_ref.ino;
                offset = generic_ref->data_ref.offset;
        }
index 9b08143..e0f93b3 100644
@@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
        }
 
        btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
-       ClearPageChecked(page);
+       btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
        btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
 out_unlock:
        if (page) {
@@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
 static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
                             struct inode *dst, u64 dst_loff)
 {
-       int ret;
+       int ret = 0;
        u64 i, tail_len, chunk_count;
        struct btrfs_root *root_dst = BTRFS_I(dst)->root;
 
index 914d403..33a0ee7 100644
@@ -25,6 +25,7 @@
 #include "backref.h"
 #include "misc.h"
 #include "subpage.h"
+#include "zoned.h"
 
 /*
  * Relocation overview
@@ -1145,9 +1146,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                key.offset -= btrfs_file_extent_offset(leaf, fi);
                btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
                                       num_bytes, parent);
-               ref.real_root = root->root_key.objectid;
                btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
-                                   key.objectid, key.offset);
+                                   key.objectid, key.offset,
+                                   root->root_key.objectid, false);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -1156,9 +1157,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
 
                btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
                                       num_bytes, parent);
-               ref.real_root = root->root_key.objectid;
                btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
-                                   key.objectid, key.offset);
+                                   key.objectid, key.offset,
+                                   root->root_key.objectid, false);
                ret = btrfs_free_extent(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -1367,8 +1368,8 @@ again:
 
                btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
                                       blocksize, path->nodes[level]->start);
-               ref.skip_qgroup = true;
-               btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
+               btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+                                   0, true);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -1376,8 +1377,8 @@ again:
                }
                btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
                                       blocksize, 0);
-               ref.skip_qgroup = true;
-               btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
+               btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
+                                   true);
                ret = btrfs_inc_extent_ref(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -1386,8 +1387,8 @@ again:
 
                btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
                                       blocksize, path->nodes[level]->start);
-               btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
-               ref.skip_qgroup = true;
+               btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
+                                   0, true);
                ret = btrfs_free_extent(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -1396,8 +1397,8 @@ again:
 
                btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
                                       blocksize, 0);
-               btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
-               ref.skip_qgroup = true;
+               btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
+                                   0, true);
                ret = btrfs_free_extent(trans, &ref);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
@@ -2473,9 +2474,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                        btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
                                               node->eb->start, blocksize,
                                               upper->eb->start);
-                       ref.real_root = root->root_key.objectid;
                        btrfs_init_tree_ref(&ref, node->level,
-                                           btrfs_header_owner(upper->eb));
+                                           btrfs_header_owner(upper->eb),
+                                           root->root_key.objectid, false);
                        ret = btrfs_inc_extent_ref(trans, &ref);
                        if (!ret)
                                ret = btrfs_drop_subtree(trans, root, eb,
@@ -2691,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
                        list_add_tail(&node->list, &rc->backref_cache.changed);
                } else {
                        path->lowest_level = node->level;
+                       if (root == root->fs_info->chunk_root)
+                               btrfs_reserve_chunk_metadata(trans, false);
                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
                        btrfs_release_path(path);
+                       if (root == root->fs_info->chunk_root)
+                               btrfs_trans_release_chunk_metadata(trans);
                        if (ret > 0)
                                ret = 0;
                }
@@ -2852,31 +2857,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
        if (ret)
                return ret;
 
-       /*
-        * On a zoned filesystem, we cannot preallocate the file region.
-        * Instead, we dirty and fiemap_write the region.
-        */
-       if (btrfs_is_zoned(inode->root->fs_info)) {
-               struct btrfs_root *root = inode->root;
-               struct btrfs_trans_handle *trans;
-
-               end = cluster->end - offset + 1;
-               trans = btrfs_start_transaction(root, 1);
-               if (IS_ERR(trans))
-                       return PTR_ERR(trans);
-
-               inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
-               i_size_write(&inode->vfs_inode, end);
-               ret = btrfs_update_inode(trans, root, inode);
-               if (ret) {
-                       btrfs_abort_transaction(trans, ret);
-                       btrfs_end_transaction(trans);
-                       return ret;
-               }
-
-               return btrfs_end_transaction(trans);
-       }
-
        btrfs_inode_lock(&inode->vfs_inode, 0);
        for (nr = 0; nr < cluster->nr; nr++) {
                start = cluster->boundary[nr] - offset;
@@ -2903,9 +2883,8 @@ static noinline_for_stack int prealloc_file_extent_cluster(
        return ret;
 }
 
-static noinline_for_stack
-int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
-                        u64 block_start)
+static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
+                               u64 start, u64 end, u64 block_start)
 {
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
@@ -3084,7 +3063,6 @@ release_page:
 static int relocate_file_extent_cluster(struct inode *inode,
                                        struct file_extent_cluster *cluster)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 offset = BTRFS_I(inode)->index_cnt;
        unsigned long index;
        unsigned long last_index;
@@ -3105,7 +3083,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 
        file_ra_state_init(ra, inode->i_mapping);
 
-       ret = setup_extent_mapping(inode, cluster->start - offset,
+       ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
                                   cluster->end - offset, cluster->start);
        if (ret)
                goto out;
@@ -3114,8 +3092,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
        for (index = (cluster->start - offset) >> PAGE_SHIFT;
             index <= last_index && !ret; index++)
                ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
-       if (btrfs_is_zoned(fs_info) && !ret)
-               ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
        if (ret == 0)
                WARN_ON(cluster_nr != cluster->nr);
 out:
@@ -3770,12 +3746,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        struct btrfs_path *path;
        struct btrfs_inode_item *item;
        struct extent_buffer *leaf;
-       u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
        int ret;
 
-       if (btrfs_is_zoned(trans->fs_info))
-               flags &= ~BTRFS_INODE_PREALLOC;
-
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -3790,7 +3762,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        btrfs_set_inode_generation(leaf, item, 1);
        btrfs_set_inode_size(leaf, item, 0);
        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-       btrfs_set_inode_flags(leaf, item, flags);
+       btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+                                         BTRFS_INODE_PREALLOC);
        btrfs_mark_buffer_dirty(leaf);
 out:
        btrfs_free_path(path);
@@ -4063,6 +4036,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
                                 rc->block_group->start,
                                 rc->block_group->length);
 
+       ret = btrfs_zone_finish(rc->block_group);
+       WARN_ON(ret && ret != -EAGAIN);
+
        while (1) {
                int finishes_stage;
 
@@ -4386,8 +4362,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
        if (!rc)
                return 0;
 
-       BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
-              root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+       BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
 
        level = btrfs_header_level(buf);
        if (btrfs_header_generation(buf) <=
index 088641b..cf82ea6 100644
@@ -57,7 +57,7 @@ struct scrub_ctx;
 
 struct scrub_recover {
        refcount_t              refs;
-       struct btrfs_bio        *bbio;
+       struct btrfs_io_context *bioc;
        u64                     map_length;
 };
 
@@ -254,7 +254,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx);
 static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
 {
        return spage->recover &&
-              (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
+              (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
 }
 
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -798,7 +798,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
 {
        if (refcount_dec_and_test(&recover->refs)) {
                btrfs_bio_counter_dec(fs_info);
-               btrfs_put_bbio(recover->bbio);
+               btrfs_put_bioc(recover->bioc);
                kfree(recover);
        }
 }
@@ -1027,8 +1027,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                        sblock_other = sblocks_for_recheck + mirror_index;
                } else {
                        struct scrub_recover *r = sblock_bad->pagev[0]->recover;
-                       int max_allowed = r->bbio->num_stripes -
-                                               r->bbio->num_tgtdevs;
+                       int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
 
                        if (mirror_index >= max_allowed)
                                break;
@@ -1218,14 +1217,14 @@ out:
        return 0;
 }
 
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
+static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
 {
-       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+       if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
                return 2;
-       else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+       else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
                return 3;
        else
-               return (int)bbio->num_stripes;
+               return (int)bioc->num_stripes;
 }
 
 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
@@ -1269,7 +1268,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
        u64 flags = original_sblock->pagev[0]->flags;
        u64 have_csum = original_sblock->pagev[0]->have_csum;
        struct scrub_recover *recover;
-       struct btrfs_bio *bbio;
+       struct btrfs_io_context *bioc;
        u64 sublen;
        u64 mapped_length;
        u64 stripe_offset;
@@ -1288,7 +1287,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
        while (length > 0) {
                sublen = min_t(u64, length, fs_info->sectorsize);
                mapped_length = sublen;
-               bbio = NULL;
+               bioc = NULL;
 
                /*
                 * With a length of sectorsize, each returned stripe represents
@@ -1296,27 +1295,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                 */
                btrfs_bio_counter_inc_blocked(fs_info);
                ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-                               logical, &mapped_length, &bbio);
-               if (ret || !bbio || mapped_length < sublen) {
-                       btrfs_put_bbio(bbio);
+                                      logical, &mapped_length, &bioc);
+               if (ret || !bioc || mapped_length < sublen) {
+                       btrfs_put_bioc(bioc);
                        btrfs_bio_counter_dec(fs_info);
                        return -EIO;
                }
 
                recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
                if (!recover) {
-                       btrfs_put_bbio(bbio);
+                       btrfs_put_bioc(bioc);
                        btrfs_bio_counter_dec(fs_info);
                        return -ENOMEM;
                }
 
                refcount_set(&recover->refs, 1);
-               recover->bbio = bbio;
+               recover->bioc = bioc;
                recover->map_length = mapped_length;
 
                BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
 
-               nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+               nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
 
                for (mirror_index = 0; mirror_index < nmirrors;
                     mirror_index++) {
@@ -1348,17 +1347,17 @@ leave_nomem:
                                       sctx->fs_info->csum_size);
 
                        scrub_stripe_index_and_offset(logical,
-                                                     bbio->map_type,
-                                                     bbio->raid_map,
+                                                     bioc->map_type,
+                                                     bioc->raid_map,
                                                      mapped_length,
-                                                     bbio->num_stripes -
-                                                     bbio->num_tgtdevs,
+                                                     bioc->num_stripes -
+                                                     bioc->num_tgtdevs,
                                                      mirror_index,
                                                      &stripe_index,
                                                      &stripe_offset);
-                       spage->physical = bbio->stripes[stripe_index].physical +
+                       spage->physical = bioc->stripes[stripe_index].physical +
                                         stripe_offset;
-                       spage->dev = bbio->stripes[stripe_index].dev;
+                       spage->dev = bioc->stripes[stripe_index].dev;
 
                        BUG_ON(page_index >= original_sblock->page_count);
                        spage->physical_for_dev_replace =
@@ -1401,7 +1400,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
        bio->bi_end_io = scrub_bio_wait_endio;
 
        mirror_num = spage->sblock->pagev[0]->mirror_num;
-       ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio,
+       ret = raid56_parity_recover(bio, spage->recover->bioc,
                                    spage->recover->map_length,
                                    mirror_num, 0);
        if (ret)
@@ -1423,7 +1422,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
        if (!first_page->dev->bdev)
                goto out;
 
-       bio = btrfs_io_bio_alloc(BIO_MAX_VECS);
+       bio = btrfs_bio_alloc(BIO_MAX_VECS);
        bio_set_dev(bio, first_page->dev->bdev);
 
        for (page_num = 0; page_num < sblock->page_count; page_num++) {
@@ -1480,7 +1479,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                }
 
                WARN_ON(!spage->page);
-               bio = btrfs_io_bio_alloc(1);
+               bio = btrfs_bio_alloc(1);
                bio_set_dev(bio, spage->dev->bdev);
 
                bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
@@ -1562,7 +1561,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                        return -EIO;
                }
 
-               bio = btrfs_io_bio_alloc(1);
+               bio = btrfs_bio_alloc(1);
                bio_set_dev(bio, spage_bad->dev->bdev);
                bio->bi_iter.bi_sector = spage_bad->physical >> 9;
                bio->bi_opf = REQ_OP_WRITE;
@@ -1676,7 +1675,7 @@ again:
                sbio->dev = sctx->wr_tgtdev;
                bio = sbio->bio;
                if (!bio) {
-                       bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
+                       bio = btrfs_bio_alloc(sctx->pages_per_wr_bio);
                        sbio->bio = bio;
                }
 
@@ -2102,7 +2101,7 @@ again:
                sbio->dev = spage->dev;
                bio = sbio->bio;
                if (!bio) {
-                       bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
+                       bio = btrfs_bio_alloc(sctx->pages_per_rd_bio);
                        sbio->bio = bio;
                }
 
@@ -2203,7 +2202,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        u64 length = sblock->page_count * PAGE_SIZE;
        u64 logical = sblock->pagev[0]->logical;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        struct bio *bio;
        struct btrfs_raid_bio *rbio;
        int ret;
@@ -2211,27 +2210,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 
        btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-                       &length, &bbio);
-       if (ret || !bbio || !bbio->raid_map)
-               goto bbio_out;
+                              &length, &bioc);
+       if (ret || !bioc || !bioc->raid_map)
+               goto bioc_out;
 
        if (WARN_ON(!sctx->is_dev_replace ||
-                   !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+                   !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
                /*
                 * We shouldn't be scrubbing a missing device. Even for dev
                 * replace, we should only get here for RAID 5/6. We either
                 * managed to mount something with no mirrors remaining or
                 * there's a bug in scrub_remap_extent()/btrfs_map_block().
                 */
-               goto bbio_out;
+               goto bioc_out;
        }
 
-       bio = btrfs_io_bio_alloc(0);
+       bio = btrfs_bio_alloc(BIO_MAX_VECS);
        bio->bi_iter.bi_sector = logical >> 9;
        bio->bi_private = sblock;
        bio->bi_end_io = scrub_missing_raid56_end_io;
 
-       rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
+       rbio = raid56_alloc_missing_rbio(bio, bioc, length);
        if (!rbio)
                goto rbio_out;
 
@@ -2249,9 +2248,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
 
 rbio_out:
        bio_put(bio);
-bbio_out:
+bioc_out:
        btrfs_bio_counter_dec(fs_info);
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
        spin_lock(&sctx->stat_lock);
        sctx->stat.malloc_errors++;
        spin_unlock(&sctx->stat_lock);
@@ -2826,7 +2825,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct bio *bio;
        struct btrfs_raid_bio *rbio;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        u64 length;
        int ret;
 
@@ -2838,17 +2837,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 
        btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
-                              &length, &bbio);
-       if (ret || !bbio || !bbio->raid_map)
-               goto bbio_out;
+                              &length, &bioc);
+       if (ret || !bioc || !bioc->raid_map)
+               goto bioc_out;
 
-       bio = btrfs_io_bio_alloc(0);
+       bio = btrfs_bio_alloc(BIO_MAX_VECS);
        bio->bi_iter.bi_sector = sparity->logic_start >> 9;
        bio->bi_private = sparity;
        bio->bi_end_io = scrub_parity_bio_endio;
 
-       rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
-                                             length, sparity->scrub_dev,
+       rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
+                                             sparity->scrub_dev,
                                              sparity->dbitmap,
                                              sparity->nsectors);
        if (!rbio)
@@ -2860,9 +2859,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
 
 rbio_out:
        bio_put(bio);
-bbio_out:
+bioc_out:
        btrfs_bio_counter_dec(fs_info);
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
        bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
                  sparity->nsectors);
        spin_lock(&sctx->stat_lock);
@@ -2901,7 +2900,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_root *csum_root = fs_info->csum_root;
        struct btrfs_extent_item *extent;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        u64 flags;
        int ret;
        int slot;
@@ -3044,22 +3043,22 @@ again:
                                                       extent_len);
 
                        mapped_length = extent_len;
-                       bbio = NULL;
+                       bioc = NULL;
                        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
-                                       extent_logical, &mapped_length, &bbio,
+                                       extent_logical, &mapped_length, &bioc,
                                        0);
                        if (!ret) {
-                               if (!bbio || mapped_length < extent_len)
+                               if (!bioc || mapped_length < extent_len)
                                        ret = -EIO;
                        }
                        if (ret) {
-                               btrfs_put_bbio(bbio);
+                               btrfs_put_bioc(bioc);
                                goto out;
                        }
-                       extent_physical = bbio->stripes[0].physical;
-                       extent_mirror_num = bbio->mirror_num;
-                       extent_dev = bbio->stripes[0].dev;
-                       btrfs_put_bbio(bbio);
+                       extent_physical = bioc->stripes[0].physical;
+                       extent_mirror_num = bioc->mirror_num;
+                       extent_dev = bioc->stripes[0].dev;
+                       btrfs_put_bioc(bioc);
 
                        ret = btrfs_lookup_csums_range(csum_root,
                                                extent_logical,
@@ -3956,7 +3955,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
        int     ret;
        struct btrfs_fs_info *fs_info = sctx->fs_info;
 
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                return -EROFS;
 
        /* Seed devices of a new filesystem has their own generation. */
@@ -4068,6 +4067,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace)
 {
+       struct btrfs_dev_lookup_args args = { .devid = devid };
        struct scrub_ctx *sctx;
        int ret;
        struct btrfs_device *dev;
@@ -4115,7 +4115,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                goto out_free_ctx;
 
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
                     !is_dev_replace)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -4288,11 +4288,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress)
 {
+       struct btrfs_dev_lookup_args args = { .devid = devid };
        struct btrfs_device *dev;
        struct scrub_ctx *sctx = NULL;
 
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (dev)
                sctx = dev->scrub_ctx;
        if (sctx)
@@ -4309,20 +4310,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
                               int *extent_mirror_num)
 {
        u64 mapped_length;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        int ret;
 
        mapped_length = extent_len;
        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
-                             &mapped_length, &bbio, 0);
-       if (ret || !bbio || mapped_length < extent_len ||
-           !bbio->stripes[0].dev->bdev) {
-               btrfs_put_bbio(bbio);
+                             &mapped_length, &bioc, 0);
+       if (ret || !bioc || mapped_length < extent_len ||
+           !bioc->stripes[0].dev->bdev) {
+               btrfs_put_bioc(bioc);
                return;
        }
 
-       *extent_physical = bbio->stripes[0].physical;
-       *extent_mirror_num = bbio->mirror_num;
-       *extent_dev = bbio->stripes[0].dev;
-       btrfs_put_bbio(bbio);
+       *extent_physical = bioc->stripes[0].physical;
+       *extent_mirror_num = bioc->mirror_num;
+       *extent_dev = bioc->stripes[0].dev;
+       btrfs_put_bioc(bioc);
 }
index 72f9b86..040324d 100644
@@ -84,6 +84,8 @@ struct send_ctx {
        u64 total_send_size;
        u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
        u64 flags;      /* 'flags' member of btrfs_ioctl_send_args is u64 */
+       /* Protocol version compatibility requested */
+       u32 proto;
 
        struct btrfs_root *send_root;
        struct btrfs_root *parent_root;
@@ -312,6 +314,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
                   sctx->parent_root->root_key.objectid : 0));
 }
 
+__maybe_unused
+static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
+{
+       switch (sctx->proto) {
+       case 1:  return cmd < __BTRFS_SEND_C_MAX_V1;
+       case 2:  return cmd < __BTRFS_SEND_C_MAX_V2;
+       default: return false;
+       }
+}
+
 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
 
 static struct waiting_dir_move *
@@ -2720,19 +2732,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx)
        if (S_ISDIR(sctx->cur_inode_mode)) {
                ret = did_create_dir(sctx, sctx->cur_ino);
                if (ret < 0)
-                       goto out;
-               if (ret) {
-                       ret = 0;
-                       goto out;
-               }
+                       return ret;
+               else if (ret > 0)
+                       return 0;
        }
 
-       ret = send_create_inode(sctx, sctx->cur_ino);
-       if (ret < 0)
-               goto out;
-
-out:
-       return ret;
+       return send_create_inode(sctx, sctx->cur_ino);
 }
 
 struct recorded_ref {
@@ -7276,6 +7281,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 
        sctx->flags = arg->flags;
 
+       if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
+               if (arg->version > BTRFS_SEND_STREAM_VERSION) {
+                       ret = -EPROTO;
+                       goto out;
+               }
+               /* Zero means "use the highest version" */
+               sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
+       } else {
+               sctx->proto = 1;
+       }
+
        sctx->send_filp = fget(arg->send_fd);
        if (!sctx->send_filp) {
                ret = -EBADF;
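The ioctl change above accepts a requested stream version: zero selects the newest version the kernel supports, anything above BTRFS_SEND_STREAM_VERSION is rejected with -EPROTO, and proto_cmd_ok() then gates individual commands by the protocol in use. A rough user-space sketch of that selection logic, with made-up numeric constants standing in for the kernel enums:

    #include <stdio.h>

    /* Illustrative values only; the kernel derives these from its send enums. */
    #define STREAM_VERSION_MAX      2
    #define CMD_MAX_V1              25
    #define CMD_MAX_V2              26

    /* Returns the protocol to use, or -1 for an unsupported request (-EPROTO). */
    static int pick_proto(unsigned int requested)
    {
            if (requested > STREAM_VERSION_MAX)
                    return -1;
            return requested ? (int)requested : STREAM_VERSION_MAX;  /* 0 means "highest" */
    }

    static int proto_cmd_ok(int proto, int cmd)
    {
            switch (proto) {
            case 1:  return cmd < CMD_MAX_V1;
            case 2:  return cmd < CMD_MAX_V2;
            default: return 0;
            }
    }

    int main(void)
    {
            printf("requested 0 -> proto %d\n", pick_proto(0));
            printf("requested 3 -> proto %d\n", pick_proto(3));
            printf("cmd 25 allowed on v1: %d, on v2: %d\n",
                   proto_cmd_ok(1, 25), proto_cmd_ok(2, 25));
            return 0;
    }
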
index de91488..23bcefc 100644
@@ -48,6 +48,7 @@ struct btrfs_tlv_header {
 enum btrfs_send_cmd {
        BTRFS_SEND_C_UNSPEC,
 
+       /* Version 1 */
        BTRFS_SEND_C_SUBVOL,
        BTRFS_SEND_C_SNAPSHOT,
 
@@ -76,6 +77,12 @@ enum btrfs_send_cmd {
 
        BTRFS_SEND_C_END,
        BTRFS_SEND_C_UPDATE_EXTENT,
+       __BTRFS_SEND_C_MAX_V1,
+
+       /* Version 2 */
+       __BTRFS_SEND_C_MAX_V2,
+
+       /* End */
        __BTRFS_SEND_C_MAX,
 };
 #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1)
index aa5be0b..48d77f3 100644
@@ -885,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
 {
        struct reserve_ticket *ticket;
        u64 tickets_id = space_info->tickets_id;
+       const bool aborted = BTRFS_FS_ERROR(fs_info);
 
        trace_btrfs_fail_all_tickets(fs_info, space_info);
 
@@ -898,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
 
-               if (ticket->steal &&
+               if (!aborted && ticket->steal &&
                    steal_from_global_rsv(fs_info, space_info, ticket))
                        return true;
 
-               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+               if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_info(fs_info, "failing ticket with %llu bytes",
                                   ticket->bytes);
 
                remove_ticket(space_info, ticket);
-               ticket->error = -ENOSPC;
+               if (aborted)
+                       ticket->error = -EIO;
+               else
+                       ticket->error = -ENOSPC;
                wake_up(&ticket->wait);
 
                /*
@@ -916,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
                 * here to see if we can make progress with the next ticket in
                 * the list.
                 */
-               btrfs_try_granting_tickets(fs_info, space_info);
+               if (!aborted)
+                       btrfs_try_granting_tickets(fs_info, space_info);
        }
        return (tickets_id != space_info->tickets_id);
 }
@@ -1172,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
                        spin_unlock(&space_info->lock);
                        return;
                }
+
+               /* Something happened, fail everything and bail. */
+               if (BTRFS_FS_ERROR(fs_info))
+                       goto aborted_fs;
                last_tickets_id = space_info->tickets_id;
                spin_unlock(&space_info->lock);
        }
@@ -1202,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
                        } else {
                                flush_state = 0;
                        }
+
+                       /* Something happened, fail everything and bail. */
+                       if (BTRFS_FS_ERROR(fs_info))
+                               goto aborted_fs;
+
                }
                spin_unlock(&space_info->lock);
        }
+       return;
+
+aborted_fs:
+       maybe_fail_all_tickets(fs_info, space_info);
+       space_info->flush = 0;
+       spin_unlock(&space_info->lock);
 }
 
 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
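
The two hunks above change the failure policy once the filesystem has hit a fatal error: no stealing from the global reserve, no ENOSPC debug messages, waiters get -EIO instead of -ENOSPC, and no further ticket granting or flushing is attempted. A simplified user-space sketch of that policy (the array and the comments stand in for the kernel's reserve_ticket list and wait queue):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct ticket {
        unsigned long long bytes;
        int error;                      /* 0 while still waiting */
};

static void fail_all_tickets(struct ticket *tickets, int nr, bool aborted)
{
        for (int i = 0; i < nr; i++) {
                /*
                 * When aborted, skip stealing from the global reserve and
                 * any further flushing; just fail the waiter with -EIO.
                 */
                tickets[i].error = aborted ? -EIO : -ENOSPC;
                /* wake_up(&ticket->wait) would happen here in the kernel */
        }
}

int main(void)
{
        struct ticket t[2] = { { .bytes = 4096 }, { .bytes = 65536 } };

        fail_all_tickets(t, 2, true);
        printf("ticket 0 error: %d\n", t[0].error);     /* -EIO */
        return 0;
}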
index cb10e56..29bd8c7 100644 (file)
  *   This means a slightly higher tree locking latency.
  */
 
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
+{
+       unsigned int cur = 0;
+       unsigned int nr_bits;
+
+       ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize));
+
+       nr_bits = PAGE_SIZE / sectorsize;
+       subpage_info->bitmap_nr_bits = nr_bits;
+
+       subpage_info->uptodate_offset = cur;
+       cur += nr_bits;
+
+       subpage_info->error_offset = cur;
+       cur += nr_bits;
+
+       subpage_info->dirty_offset = cur;
+       cur += nr_bits;
+
+       subpage_info->writeback_offset = cur;
+       cur += nr_bits;
+
+       subpage_info->ordered_offset = cur;
+       cur += nr_bits;
+
+       subpage_info->checked_offset = cur;
+       cur += nr_bits;
+
+       subpage_info->total_nr_bits = cur;
+}
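
To make the new layout concrete, here is a user-space sketch assuming 64K pages and 4K sectors (16 bits per sub-bitmap). It mirrors btrfs_init_subpage_info() and the subpage_calc_start_bit() arithmetic used later in this file; the field names follow the kernel structure, but the program itself is only illustrative.

#include <stdio.h>

#define PAGE_SIZE       65536u
#define SECTORSIZE      4096u
#define SECTORSIZE_BITS 12u     /* log2(SECTORSIZE) */

struct subpage_info {
        unsigned int bitmap_nr_bits;
        unsigned int uptodate_offset;
        unsigned int error_offset;
        unsigned int dirty_offset;
        unsigned int writeback_offset;
        unsigned int ordered_offset;
        unsigned int checked_offset;
        unsigned int total_nr_bits;
};

static void init_subpage_info(struct subpage_info *info)
{
        unsigned int cur = 0;
        unsigned int nr_bits = PAGE_SIZE / SECTORSIZE;

        info->bitmap_nr_bits = nr_bits;
        info->uptodate_offset = cur;  cur += nr_bits;
        info->error_offset = cur;     cur += nr_bits;
        info->dirty_offset = cur;     cur += nr_bits;
        info->writeback_offset = cur; cur += nr_bits;
        info->ordered_offset = cur;   cur += nr_bits;
        info->checked_offset = cur;   cur += nr_bits;
        info->total_nr_bits = cur;
}

/* Same arithmetic as subpage_calc_start_bit(), for the dirty bitmap. */
static unsigned int dirty_start_bit(const struct subpage_info *info,
                                    unsigned long long page_start,
                                    unsigned long long file_offset)
{
        return ((file_offset - page_start) >> SECTORSIZE_BITS) +
                info->dirty_offset;
}

int main(void)
{
        struct subpage_info info;

        init_subpage_info(&info);
        printf("total bits: %u (%u per bitmap)\n",
               info.total_nr_bits, info.bitmap_nr_bits);
        /* The sector at page offset 8K maps to bit dirty_offset + 2. */
        printf("dirty bit for +8K: %u\n", dirty_start_bit(&info, 0, 8192));
        return 0;
}

With six sub-bitmaps of 16 bits each, the whole per-page state fits in 96 bits, i.e. two unsigned longs on 64-bit hosts, which is why struct btrfs_subpage can end with a flexible unsigned long bitmaps[] array.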
+
 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
                         struct page *page, enum btrfs_subpage_type type)
 {
-       struct btrfs_subpage *subpage = NULL;
-       int ret;
+       struct btrfs_subpage *subpage;
 
        /*
         * We have cases like a dummy extent buffer page, which is not mapped
@@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
         */
        if (page->mapping)
                ASSERT(PageLocked(page));
+
        /* Either not subpage, or the page already has private attached */
        if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
                return 0;
 
-       ret = btrfs_alloc_subpage(fs_info, &subpage, type);
-       if (ret < 0)
-               return ret;
+       subpage = btrfs_alloc_subpage(fs_info, type);
+       if (IS_ERR(subpage))
+               return PTR_ERR(subpage);
+
        attach_page_private(page, subpage);
        return 0;
 }
@@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
        btrfs_free_subpage(subpage);
 }
 
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
-                       struct btrfs_subpage **ret,
-                       enum btrfs_subpage_type type)
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+                                         enum btrfs_subpage_type type)
 {
-       if (fs_info->sectorsize == PAGE_SIZE)
-               return 0;
+       struct btrfs_subpage *ret;
+       unsigned int real_size;
+
+       ASSERT(fs_info->sectorsize < PAGE_SIZE);
+
+       real_size = struct_size(ret, bitmaps,
+                       BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits));
+       ret = kzalloc(real_size, GFP_NOFS);
+       if (!ret)
+               return ERR_PTR(-ENOMEM);
 
-       *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
-       if (!*ret)
-               return -ENOMEM;
-       spin_lock_init(&(*ret)->lock);
+       spin_lock_init(&ret->lock);
        if (type == BTRFS_SUBPAGE_METADATA) {
-               atomic_set(&(*ret)->eb_refs, 0);
+               atomic_set(&ret->eb_refs, 0);
        } else {
-               atomic_set(&(*ret)->readers, 0);
-               atomic_set(&(*ret)->writers, 0);
+               atomic_set(&ret->readers, 0);
+               atomic_set(&ret->writers, 0);
        }
-       return 0;
+       return ret;
 }
 
 void btrfs_free_subpage(struct btrfs_subpage *subpage)
@@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
        u32 orig_len = *len;
 
        *start = max_t(u64, page_offset(page), orig_start);
-       *len = min_t(u64, page_offset(page) + PAGE_SIZE,
-                    orig_start + orig_len) - *start;
+       /*
+        * For certain call sites like btrfs_drop_pages(), we may have pages
+        * beyond the target range. In that case, just set @len to 0; the
+        * subpage helpers can handle @len == 0 without any problem.
+        */
+       if (page_offset(page) >= orig_start + orig_len)
+               *len = 0;
+       else
+               *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+                            orig_start + orig_len) - *start;
 }
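
The new zero-length case can be shown in isolation: if a page starts at or past the end of the requested range (as can happen via btrfs_drop_pages()), the clamp now yields len == 0 instead of an underflowed length. A stand-alone sketch, with page_offset()/min_t() replaced by plain C:

#include <stdio.h>

#define PAGE_SIZE 65536ull

static void clamp_range(unsigned long long page_start,
                        unsigned long long *start, unsigned int *len)
{
        unsigned long long orig_start = *start;
        unsigned int orig_len = *len;

        *start = page_start > orig_start ? page_start : orig_start;
        if (page_start >= orig_start + orig_len) {
                *len = 0;       /* page is entirely past the range */
        } else {
                unsigned long long end = page_start + PAGE_SIZE;

                if (end > orig_start + orig_len)
                        end = orig_start + orig_len;
                *len = end - *start;
        }
}

int main(void)
{
        unsigned long long start = 0;
        unsigned int len = 4096;

        /* Page at 64K, range [0, 4K): nothing of the range is on this page. */
        clamp_range(PAGE_SIZE, &start, &len);
        printf("start=%llu len=%u\n", start, len);      /* len == 0 */
        return 0;
}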
 
 void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
@@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
 
        btrfs_subpage_assert(fs_info, page, start, len);
 
+       /*
+        * We have call sites passing @locked_page into
+        * extent_clear_unlock_delalloc() for the compression path.
+        *
+        * Such a @locked_page is locked by plain lock_page(), thus its
+        * subpage::writers is 0.  Handle it in a special way.
+        */
+       if (atomic_read(&subpage->writers) == 0)
+               return true;
+
        ASSERT(atomic_read(&subpage->writers) >= nbits);
        return atomic_sub_and_test(nbits, &subpage->writers);
 }
@@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
                unlock_page(page);
 }
 
-/*
- * Convert the [start, start + len) range into a u16 bitmap
- *
- * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
- */
-static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
-               struct page *page, u64 start, u32 len)
+static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start,
+                                     unsigned int nbits)
 {
-       const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
-       const int nbits = len >> fs_info->sectorsize_bits;
+       unsigned int found_zero;
 
-       btrfs_subpage_assert(fs_info, page, start, len);
+       found_zero = find_next_zero_bit(addr, start + nbits, start);
+       if (found_zero == start + nbits)
+               return true;
+       return false;
+}
 
-       /*
-        * Here nbits can be 16, thus can go beyond u16 range. We make the
-        * first left shift to be calculate in unsigned long (at least u32),
-        * then truncate the result to u16.
-        */
-       return (u16)(((1UL << nbits) - 1) << bit_start);
+static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start,
+                                      unsigned int nbits)
+{
+       unsigned int found_set;
+
+       found_set = find_next_bit(addr, start + nbits, start);
+       if (found_set == start + nbits)
+               return true;
+       return false;
 }
 
+#define subpage_calc_start_bit(fs_info, page, name, start, len)                \
+({                                                                     \
+       unsigned int start_bit;                                         \
+                                                                       \
+       btrfs_subpage_assert(fs_info, page, start, len);                \
+       start_bit = offset_in_page(start) >> fs_info->sectorsize_bits;  \
+       start_bit += fs_info->subpage_info->name##_offset;              \
+       start_bit;                                                      \
+})
+
+#define subpage_test_bitmap_all_set(fs_info, subpage, name)            \
+       bitmap_test_range_all_set(subpage->bitmaps,                     \
+                       fs_info->subpage_info->name##_offset,           \
+                       fs_info->subpage_info->bitmap_nr_bits)
+
+#define subpage_test_bitmap_all_zero(fs_info, subpage, name)           \
+       bitmap_test_range_all_zero(subpage->bitmaps,                    \
+                       fs_info->subpage_info->name##_offset,           \
+                       fs_info->subpage_info->bitmap_nr_bits)
+
 void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       uptodate, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->uptodate_bitmap |= tmp;
-       if (subpage->uptodate_bitmap == U16_MAX)
+       bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+       if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
                SetPageUptodate(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
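
The page-level flag is only raised once the whole sub-bitmap is set, which bitmap_test_range_all_set() answers by searching for the first zero bit inside that window. A simplified user-space equivalent (set_range()/range_all_set() are loop-based stand-ins for bitmap_set() and find_next_zero_bit()):

#include <stdbool.h>
#include <stdio.h>

#define NBITS 96u       /* 6 sub-bitmaps x 16 bits, as in the 64K/4K layout */

static unsigned long bits[(NBITS + 63) / 64];

static void set_range(unsigned int start, unsigned int len)
{
        for (unsigned int i = start; i < start + len; i++)
                bits[i / 64] |= 1ul << (i % 64);
}

static bool range_all_set(unsigned int start, unsigned int len)
{
        for (unsigned int i = start; i < start + len; i++)
                if (!(bits[i / 64] & (1ul << (i % 64))))
                        return false;
        return true;
}

int main(void)
{
        const unsigned int uptodate_offset = 0, nr_bits = 16;

        set_range(uptodate_offset + 4, 4);      /* 16K at page offset 16K */
        printf("all uptodate: %d\n", range_all_set(uptodate_offset, nr_bits));
        set_range(uptodate_offset, nr_bits);    /* the rest of the page */
        printf("all uptodate: %d\n", range_all_set(uptodate_offset, nr_bits));
        return 0;
}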
@@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       uptodate, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->uptodate_bitmap &= ~tmp;
+       bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
        ClearPageUptodate(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
@@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       error, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->error_bitmap |= tmp;
+       bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
        SetPageError(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
@@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       error, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->error_bitmap &= ~tmp;
-       if (subpage->error_bitmap == 0)
+       bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+       if (subpage_test_bitmap_all_zero(fs_info, subpage, error))
                ClearPageError(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
@@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       dirty, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->dirty_bitmap |= tmp;
+       bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
        spin_unlock_irqrestore(&subpage->lock, flags);
        set_page_dirty(page);
 }
@@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       dirty, start, len);
        unsigned long flags;
        bool last = false;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->dirty_bitmap &= ~tmp;
-       if (subpage->dirty_bitmap == 0)
+       bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+       if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
                last = true;
        spin_unlock_irqrestore(&subpage->lock, flags);
        return last;
@@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       writeback, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->writeback_bitmap |= tmp;
+       bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
        set_page_writeback(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
@@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       writeback, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->writeback_bitmap &= ~tmp;
-       if (subpage->writeback_bitmap == 0) {
+       bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+       if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
                ASSERT(PageWriteback(page));
                end_page_writeback(page);
        }
@@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       ordered, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->ordered_bitmap |= tmp;
+       bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
        SetPageOrdered(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
@@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len)
 {
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
-       const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       ordered, start, len);
        unsigned long flags;
 
        spin_lock_irqsave(&subpage->lock, flags);
-       subpage->ordered_bitmap &= ~tmp;
-       if (subpage->ordered_bitmap == 0)
+       bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+       if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
                ClearPageOrdered(page);
        spin_unlock_irqrestore(&subpage->lock, flags);
 }
+
+void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
+                              struct page *page, u64 start, u32 len)
+{
+       struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       checked, start, len);
+       unsigned long flags;
+
+       spin_lock_irqsave(&subpage->lock, flags);
+       bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+       if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+               SetPageChecked(page);
+       spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
+                                struct page *page, u64 start, u32 len)
+{
+       struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+                                                       checked, start, len);
+       unsigned long flags;
+
+       spin_lock_irqsave(&subpage->lock, flags);
+       bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+       ClearPageChecked(page);
+       spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
 /*
  * Unlike set/clear which is dependent on each page status, for test all bits
  * are tested in the same way.
@@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,       \
                struct page *page, u64 start, u32 len)                  \
 {                                                                      \
        struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
-       const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
+       unsigned int start_bit = subpage_calc_start_bit(fs_info, page,  \
+                                               name, start, len);      \
        unsigned long flags;                                            \
        bool ret;                                                       \
                                                                        \
        spin_lock_irqsave(&subpage->lock, flags);                       \
-       ret = ((subpage->name##_bitmap & tmp) == tmp);                  \
+       ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit,    \
+                               len >> fs_info->sectorsize_bits);       \
        spin_unlock_irqrestore(&subpage->lock, flags);                  \
        return ret;                                                     \
 }
@@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
 
 /*
  * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
                         PageWriteback);
 IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
                         PageOrdered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
 
 /*
  * Make sure not only the page dirty bit is cleared, but also subpage dirty bit
@@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
                return;
 
        ASSERT(PagePrivate(page) && page->private);
-       ASSERT(subpage->dirty_bitmap == 0);
+       ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
+}
+
+/*
+ * Handle different locked pages with different page sizes:
+ *
+ * - Page locked by plain lock_page()
+ *   It should not have any subpage::writers count.
+ *   Can be unlocked by unlock_page().
+ *   This is the most common locked page for __extent_writepage() called
+ *   inside extent_write_cache_pages() or extent_write_full_page().
+ *   Rarer cases include the @locked_page from extent_write_locked_range().
+ *
+ * - Page locked by lock_delalloc_pages()
+ *   There is only one caller: all pages except @locked_page in
+ *   extent_write_locked_range().
+ *   In this case, we have to call the subpage helper to end the writer lock.
+ */
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+                             u64 start, u32 len)
+{
+       struct btrfs_subpage *subpage;
+
+       ASSERT(PageLocked(page));
+       /* For regular page size case, we just unlock the page */
+       if (fs_info->sectorsize == PAGE_SIZE)
+               return unlock_page(page);
+
+       ASSERT(PagePrivate(page) && page->private);
+       subpage = (struct btrfs_subpage *)page->private;
+
+       /*
+        * For subpage case, there are two types of locked page.  With or
+        * without writers number.
+        *
+        * Since we own the page lock, no one else could touch subpage::writers
+        * and we are safe to do several atomic operations without spinlock.
+        */
+       if (atomic_read(&subpage->writers) == 0)
+               /* No writers, locked by plain lock_page() */
+               return unlock_page(page);
+
+       /* Have writers, use proper subpage helper to end it */
+       btrfs_page_end_writer_lock(fs_info, page, start, len);
 }
index 0120948..7accb5c 100644 (file)
@@ -6,10 +6,38 @@
 #include <linux/spinlock.h>
 
 /*
- * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap
- * is sufficient. Regular bitmap_* is not used due to size reasons.
+ * Extra info for subpage bitmap.
+ *
+ * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into
+ * one larger bitmap.
+ *
+ * This structure records how they are organized in the bitmap:
+ *
+ * /- uptodate_offset  /- error_offset /- dirty_offset
+ * |                   |               |
+ * v                   v               v
+ * |u|u|u|u|........|u|u|e|e|.......|e|e| ...  |o|o|
+ * |<- bitmap_nr_bits ->|
+ * |<--------------- total_nr_bits ---------------->|
  */
-#define BTRFS_SUBPAGE_BITMAP_SIZE      16
+struct btrfs_subpage_info {
+       /* Number of bits for each bitmap */
+       unsigned int bitmap_nr_bits;
+
+       /* Total number of bits for the whole bitmap */
+       unsigned int total_nr_bits;
+
+       /*
+        * *_offset indicates where each sub-bitmap starts; its length is
+        * always @bitmap_nr_bits, which is PAGE_SIZE / sectorsize.
+        */
+       unsigned int uptodate_offset;
+       unsigned int error_offset;
+       unsigned int dirty_offset;
+       unsigned int writeback_offset;
+       unsigned int ordered_offset;
+       unsigned int checked_offset;
+};
 
 /*
  * Structure to trace status of each sector inside a page, attached to
 struct btrfs_subpage {
        /* Common members for both data and metadata pages */
        spinlock_t lock;
-       u16 uptodate_bitmap;
-       u16 error_bitmap;
-       u16 dirty_bitmap;
-       u16 writeback_bitmap;
        /*
         * Both data and metadata needs to track how many readers are for the
         * page.
@@ -38,14 +62,11 @@ struct btrfs_subpage {
                 * manages whether the subpage can be detached.
                 */
                atomic_t eb_refs;
-               /* Structures only used by data */
-               struct {
-                       atomic_t writers;
 
-                       /* Tracke pending ordered extent in this sector */
-                       u16 ordered_bitmap;
-               };
+               /* Structures only used by data */
+               atomic_t writers;
        };
+       unsigned long bitmaps[];
 };
 
 enum btrfs_subpage_type {
@@ -53,15 +74,15 @@ enum btrfs_subpage_type {
        BTRFS_SUBPAGE_DATA,
 };
 
+void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
 int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
                         struct page *page, enum btrfs_subpage_type type);
 void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
                          struct page *page);
 
 /* Allocate additional data where page represents more than one sector */
-int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
-                       struct btrfs_subpage **ret,
-                       enum btrfs_subpage_type type);
+struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
+                                         enum btrfs_subpage_type type);
 void btrfs_free_subpage(struct btrfs_subpage *subpage);
 
 void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
@@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error);
 DECLARE_BTRFS_SUBPAGE_OPS(dirty);
 DECLARE_BTRFS_SUBPAGE_OPS(writeback);
 DECLARE_BTRFS_SUBPAGE_OPS(ordered);
+DECLARE_BTRFS_SUBPAGE_OPS(checked);
 
 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
                struct page *page, u64 start, u32 len);
 
 void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
                                 struct page *page);
+void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
+                             u64 start, u32 len);
 
 #endif
index 537d90b..a1c54a2 100644 (file)
@@ -1705,7 +1705,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
                goto error_close_devices;
        }
 
-       bdev = fs_devices->latest_bdev;
+       bdev = fs_devices->latest_dev->bdev;
        s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
                 fs_info);
        if (IS_ERR(s)) {
@@ -2006,7 +2006,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                if (ret)
                        goto restore;
        } else {
-               if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+               if (BTRFS_FS_ERROR(fs_info)) {
                        btrfs_err(fs_info,
                                "Remounting read-write after error is not allowed");
                        ret = -EINVAL;
@@ -2463,30 +2463,16 @@ static int btrfs_unfreeze(struct super_block *sb)
 static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
-       struct btrfs_device *dev, *first_dev = NULL;
 
        /*
-        * Lightweight locking of the devices. We should not need
-        * device_list_mutex here as we only read the device data and the list
-        * is protected by RCU.  Even if a device is deleted during the list
-        * traversals, we'll get valid data, the freeing callback will wait at
-        * least until the rcu_read_unlock.
+        * There should always be a valid pointer in latest_dev; it may be
+        * stale for a short moment while the device is being deleted, but it
+        * remains valid until the end of the RCU grace period.
         */
        rcu_read_lock();
-       list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
-               if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
-                       continue;
-               if (!dev->name)
-                       continue;
-               if (!first_dev || dev->devid < first_dev->devid)
-                       first_dev = dev;
-       }
-
-       if (first_dev)
-               seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
-       else
-               WARN_ON(1);
+       seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
        rcu_read_unlock();
+
        return 0;
 }
 
index 25a6f58..f9eff3b 100644 (file)
@@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
        } else
                val = can_modify_feature(fa);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+       return sysfs_emit(buf, "%d\n", val);
 }
 
 static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
@@ -330,7 +330,7 @@ static const struct attribute_group btrfs_feature_attr_group = {
 static ssize_t rmdir_subvol_show(struct kobject *kobj,
                                 struct kobj_attribute *ka, char *buf)
 {
-       return scnprintf(buf, PAGE_SIZE, "0\n");
+       return sysfs_emit(buf, "0\n");
 }
 BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
 
@@ -345,12 +345,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
                 * This "trick" only works as long as 'enum btrfs_csum_type' has
                 * no holes in it
                 */
-               ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
-                               (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+               ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "),
+                                    btrfs_super_csum_name(i));
 
        }
 
-       ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+       ret += sysfs_emit_at(buf, ret, "\n");
        return ret;
 }
 BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
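
The sysfs conversions in this file all follow the same pattern: sysfs_emit() and sysfs_emit_at() already know the sysfs buffer is a full page, so the explicit PAGE_SIZE - ret bookkeeping required with scnprintf() disappears. A schematic show() callback, not taken from this patch, for the multi-value case:

static ssize_t example_list_show(struct kobject *kobj,
                                 struct kobj_attribute *ka, char *buf)
{
        /* Illustrative values; a real attribute would report driver state. */
        static const char * const names[] = { "alpha", "beta", "gamma" };
        ssize_t ret = 0;
        int i;

        for (i = 0; i < ARRAY_SIZE(names); i++)
                ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), names[i]);
        ret += sysfs_emit_at(buf, ret, "\n");
        return ret;
}

Single-value attributes collapse to a plain return sysfs_emit(buf, "%u\n", val);, as in the remaining hunks below.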
@@ -358,7 +358,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
 static ssize_t send_stream_version_show(struct kobject *kobj,
                                        struct kobj_attribute *ka, char *buf)
 {
-       return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+       return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION);
 }
 BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
 
@@ -378,9 +378,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
        int i;
 
        for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
-               ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
-                                (i ? " " : ""), rescue_opts[i]);
-       ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+               ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]);
+       ret += sysfs_emit_at(buf, ret, "\n");
        return ret;
 }
 BTRFS_ATTR(static_feature, supported_rescue_options,
@@ -394,10 +393,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
 
        /* 4K sector size is also supported with 64K page size */
        if (PAGE_SIZE == SZ_64K)
-               ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K);
+               ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
 
        /* Only sectorsize == PAGE_SIZE is now supported */
-       ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+       ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
 
        return ret;
 }
@@ -437,7 +436,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%lld\n",
+       return sysfs_emit(buf, "%lld\n",
                        atomic64_read(&fs_info->discard_ctl.discardable_bytes));
 }
 BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show);
@@ -448,7 +447,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n",
+       return sysfs_emit(buf, "%d\n",
                        atomic_read(&fs_info->discard_ctl.discardable_extents));
 }
 BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show);
@@ -459,8 +458,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%llu\n",
-                       fs_info->discard_ctl.discard_bitmap_bytes);
+       return sysfs_emit(buf, "%llu\n",
+                         fs_info->discard_ctl.discard_bitmap_bytes);
 }
 BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show);
 
@@ -470,7 +469,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%lld\n",
+       return sysfs_emit(buf, "%lld\n",
                atomic64_read(&fs_info->discard_ctl.discard_bytes_saved));
 }
 BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show);
@@ -481,8 +480,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%llu\n",
-                       fs_info->discard_ctl.discard_extent_bytes);
+       return sysfs_emit(buf, "%llu\n",
+                         fs_info->discard_ctl.discard_extent_bytes);
 }
 BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show);
 
@@ -492,8 +491,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%u\n",
-                       READ_ONCE(fs_info->discard_ctl.iops_limit));
+       return sysfs_emit(buf, "%u\n",
+                         READ_ONCE(fs_info->discard_ctl.iops_limit));
 }
 
 static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj,
@@ -523,8 +522,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%u\n",
-                       READ_ONCE(fs_info->discard_ctl.kbps_limit));
+       return sysfs_emit(buf, "%u\n",
+                         READ_ONCE(fs_info->discard_ctl.kbps_limit));
 }
 
 static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj,
@@ -553,8 +552,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%llu\n",
-                       READ_ONCE(fs_info->discard_ctl.max_discard_size));
+       return sysfs_emit(buf, "%llu\n",
+                         READ_ONCE(fs_info->discard_ctl.max_discard_size));
 }
 
 static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj,
@@ -627,7 +626,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
        val = *value_ptr;
        if (lock)
                spin_unlock(lock);
-       return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+       return sysfs_emit(buf, "%llu\n", val);
 }
 
 static ssize_t global_rsv_size_show(struct kobject *kobj,
@@ -673,7 +672,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
                        val += block_group->used;
        }
        up_read(&sinfo->groups_sem);
-       return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+       return sysfs_emit(buf, "%llu\n", val);
 }
 
 /*
@@ -771,7 +770,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
        ssize_t ret;
 
        spin_lock(&fs_info->super_lock);
-       ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
+       ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label);
        spin_unlock(&fs_info->super_lock);
 
        return ret;
@@ -819,7 +818,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+       return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize);
 }
 
 BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -829,8 +828,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%u\n",
-                        fs_info->super_copy->sectorsize);
+       return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
 }
 
 BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -840,7 +838,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+       return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize);
 }
 
 BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
@@ -852,7 +850,7 @@ static ssize_t quota_override_show(struct kobject *kobj,
        int quota_override;
 
        quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags);
-       return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+       return sysfs_emit(buf, "%d\n", quota_override);
 }
 
 static ssize_t quota_override_store(struct kobject *kobj,
@@ -890,8 +888,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%pU\n",
-                       fs_info->fs_devices->metadata_uuid);
+       return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid);
 }
 
 BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
@@ -902,9 +899,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
        u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
 
-       return scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
-                       btrfs_super_csum_name(csum_type),
-                       crypto_shash_driver_name(fs_info->csum_shash));
+       return sysfs_emit(buf, "%s (%s)\n",
+                         btrfs_super_csum_name(csum_type),
+                         crypto_shash_driver_name(fs_info->csum_shash));
 }
 
 BTRFS_ATTR(, checksum, btrfs_checksum_show);
@@ -941,7 +938,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
                        str = "UNKNOWN\n";
                        break;
        }
-       return scnprintf(buf, PAGE_SIZE, "%s", str);
+       return sysfs_emit(buf, "%s", str);
 }
 BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
 
@@ -950,7 +947,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+       return sysfs_emit(buf, "%llu\n", fs_info->generation);
 }
 BTRFS_ATTR(, generation, btrfs_generation_show);
 
@@ -1028,8 +1025,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
        ssize_t ret;
 
-       ret = scnprintf(buf, PAGE_SIZE, "%d\n",
-                       READ_ONCE(fs_info->bg_reclaim_threshold));
+       ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
 
        return ret;
 }
@@ -1471,7 +1467,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
 
        val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+       return sysfs_emit(buf, "%d\n", val);
 }
 BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show);
 
@@ -1484,7 +1480,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj,
 
        val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+       return sysfs_emit(buf, "%d\n", val);
 }
 BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show);
 
@@ -1498,7 +1494,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj,
 
        val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+       return sysfs_emit(buf, "%d\n", val);
 }
 BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show);
 
@@ -1509,8 +1505,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj,
        struct btrfs_device *device = container_of(kobj, struct btrfs_device,
                                                   devid_kobj);
 
-       return scnprintf(buf, PAGE_SIZE, "%llu\n",
-                        READ_ONCE(device->scrub_speed_max));
+       return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max));
 }
 
 static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
@@ -1538,7 +1533,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj,
 
        val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 
-       return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+       return sysfs_emit(buf, "%d\n", val);
 }
 BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show);
 
@@ -1549,14 +1544,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj,
                                                   devid_kobj);
 
        if (!device->dev_stats_valid)
-               return scnprintf(buf, PAGE_SIZE, "invalid\n");
+               return sysfs_emit(buf, "invalid\n");
 
        /*
         * Print all at once so we get a snapshot of all values from the same
         * time. Keep them in sync and in order of definition of
         * btrfs_dev_stat_values.
         */
-       return scnprintf(buf, PAGE_SIZE,
+       return sysfs_emit(buf,
                "write_errs %d\n"
                "read_errs %d\n"
                "flush_errs %d\n"
index df54cdf..2a95f72 100644 (file)
@@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
        key.type = BTRFS_EXTENT_CSUM_KEY;
        key.offset = 0;
 
-       setup_items_for_insert(root, path, &key, &value_len, 1);
+       btrfs_setup_item_for_insert(root, path, &key, value_len);
        item = btrfs_item_nr(0);
        write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
                            value_len);
index 73e96d5..c2e72e7 100644 (file)
@@ -112,7 +112,7 @@ static int test_find_delalloc(u32 sectorsize)
         */
        set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
        start = 0;
-       end = 0;
+       end = start + PAGE_SIZE - 1;
        found = find_lock_delalloc_range(inode, locked_page, &start,
                                         &end);
        if (!found) {
@@ -143,7 +143,7 @@ static int test_find_delalloc(u32 sectorsize)
        }
        set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
        start = test_start;
-       end = 0;
+       end = start + PAGE_SIZE - 1;
        found = find_lock_delalloc_range(inode, locked_page, &start,
                                         &end);
        if (!found) {
@@ -177,14 +177,14 @@ static int test_find_delalloc(u32 sectorsize)
                goto out_bits;
        }
        start = test_start;
-       end = 0;
+       end = start + PAGE_SIZE - 1;
        found = find_lock_delalloc_range(inode, locked_page, &start,
                                         &end);
        if (found) {
                test_err("found range when we shouldn't have");
                goto out_bits;
        }
-       if (end != (u64)-1) {
+       if (end != test_start + PAGE_SIZE - 1) {
                test_err("did not return the proper end offset");
                goto out_bits;
        }
@@ -198,7 +198,7 @@ static int test_find_delalloc(u32 sectorsize)
         */
        set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
        start = test_start;
-       end = 0;
+       end = start + PAGE_SIZE - 1;
        found = find_lock_delalloc_range(inode, locked_page, &start,
                                         &end);
        if (!found) {
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
        /* We unlocked it in the previous test */
        lock_page(locked_page);
        start = test_start;
-       end = 0;
+       end = start + PAGE_SIZE - 1;
        /*
         * Currently if we fail to find dirty pages in the delalloc range we
         * will adjust max_bytes down to PAGE_SIZE and then re-search.  If
index c9874b1..cac89c3 100644 (file)
@@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = start;
 
-       setup_items_for_insert(root, &path, &key, &value_len, 1);
+       btrfs_setup_item_for_insert(root, &path, &key, value_len);
        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, fi, 1);
        btrfs_set_file_extent_type(leaf, fi, type);
@@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
 
-       setup_items_for_insert(root, &path, &key, &value_len, 1);
+       btrfs_setup_item_for_insert(root, &path, &key, value_len);
 }
 
 /*
index 14b9fdc..1c3a118 100644 (file)
@@ -283,7 +283,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
        spin_lock(&fs_info->trans_lock);
 loop:
        /* The file system has been taken offline. No new transactions. */
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+       if (BTRFS_FS_ERROR(fs_info)) {
                spin_unlock(&fs_info->trans_lock);
                return -EROFS;
        }
@@ -331,7 +331,7 @@ loop:
                 */
                kfree(cur_trans);
                goto loop;
-       } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+       } else if (BTRFS_FS_ERROR(fs_info)) {
                spin_unlock(&fs_info->trans_lock);
                kfree(cur_trans);
                return -EROFS;
@@ -579,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
        bool do_chunk_alloc = false;
        int ret;
 
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+       if (BTRFS_FS_ERROR(fs_info))
                return ERR_PTR(-EROFS);
 
        if (current->journal_info) {
@@ -991,8 +991,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (throttle)
                btrfs_run_delayed_iputs(info);
 
-       if (TRANS_ABORTED(trans) ||
-           test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
+       if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
                wake_up_process(info->transaction_kthread);
                if (TRANS_ABORTED(trans))
                        err = trans->aborted;
@@ -2155,7 +2154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
                 * abort to prevent writing a new superblock that reflects a
                 * corrupt state (pointing to trees with unwritten nodes/leafs).
                 */
-               if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+               if (BTRFS_FS_ERROR(fs_info)) {
                        ret = -EROFS;
                        goto cleanup_transaction;
                }
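
The open-coded test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) checks are collapsed into the BTRFS_FS_ERROR() helper. Its definition is not part of this diff; presumably it is a thin wrapper along these lines (an assumption, not the exact ctree.h text):

/* Assumed shape of the helper used above; not shown in this diff. */
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
                                                   &(fs_info)->fs_state)))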
index b415c5e..8ab33ca 100644 (file)
@@ -94,7 +94,7 @@ enum {
 };
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, struct btrfs_inode *inode,
+                          struct btrfs_inode *inode,
                           int inode_only,
                           struct btrfs_log_ctx *ctx);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
@@ -207,7 +207,7 @@ again:
        }
 
        atomic_inc(&root->log_writers);
-       if (ctx && !ctx->logging_new_name) {
+       if (!ctx->logging_new_name) {
                int index = root->log_transid % 2;
                list_add_tail(&ctx->list, &root->log_ctxs[index]);
                ctx->log_transid = root->log_transid;
@@ -368,25 +368,11 @@ static int process_one_buffer(struct btrfs_root *log,
        return ret;
 }
 
-/*
- * Item overwrite used by replay and tree logging.  eb, slot and key all refer
- * to the src data we are copying out.
- *
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
- *
- * If the key is already in the destination tree the existing item is
- * overwritten.  If the existing item isn't big enough, it is extended.
- * If it is too large, it is truncated.
- *
- * If the key isn't in the destination yet, a new item is inserted.
- */
-static noinline int overwrite_item(struct btrfs_trans_handle *trans,
-                                  struct btrfs_root *root,
-                                  struct btrfs_path *path,
-                                  struct extent_buffer *eb, int slot,
-                                  struct btrfs_key *key)
+static int do_overwrite_item(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path,
+                            struct extent_buffer *eb, int slot,
+                            struct btrfs_key *key)
 {
        int ret;
        u32 item_size;
@@ -403,10 +389,22 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
        item_size = btrfs_item_size_nr(eb, slot);
        src_ptr = btrfs_item_ptr_offset(eb, slot);
 
-       /* look for the key in the destination tree */
-       ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
-       if (ret < 0)
-               return ret;
+       /* Our caller must have done a search for the key for us. */
+       ASSERT(path->nodes[0] != NULL);
+
+       /*
+        * And the slot must point to the exact key, or to the slot where the
+        * key would be inserted (the first item with a key greater than 'key').
+        */
+       if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+               struct btrfs_key found_key;
+
+               btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
+               ret = btrfs_comp_cpu_keys(&found_key, key);
+               ASSERT(ret >= 0);
+       } else {
+               ret = 1;
+       }
 
        if (ret == 0) {
                char *src_copy;
@@ -585,6 +583,36 @@ no_copy:
 }
 
 /*
+ * Item overwrite used by replay and tree logging.  eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten.  If the existing item isn't big enough, it is extended.
+ * If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ */
+static int overwrite_item(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path,
+                         struct extent_buffer *eb, int slot,
+                         struct btrfs_key *key)
+{
+       int ret;
+
+       /* Look for the key in the destination tree. */
+       ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+       if (ret < 0)
+               return ret;
+
+       return do_overwrite_item(trans, root, path, eb, slot, key);
+}
+
+/*
  * simple helper to read an inode off the disk from a given root
  * This can only be called for subvolume roots and not for the log
  */
@@ -761,7 +789,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                                ins.objectid, ins.offset, 0);
                                btrfs_init_data_ref(&ref,
                                                root->root_key.objectid,
-                                               key->objectid, offset);
+                                               key->objectid, offset, 0, false);
                                ret = btrfs_inc_extent_ref(trans, &ref);
                                if (ret)
                                        goto out;
@@ -893,11 +921,11 @@ out:
  * item
  */
 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
-                                     struct btrfs_root *root,
                                      struct btrfs_path *path,
                                      struct btrfs_inode *dir,
                                      struct btrfs_dir_item *di)
 {
+       struct btrfs_root *root = dir->root;
        struct inode *inode;
        char *name;
        int name_len;
@@ -926,7 +954,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
        if (ret)
                goto out;
 
-       ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
+       ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
                        name_len);
        if (ret)
                goto out;
@@ -1091,7 +1119,7 @@ again:
                                inc_nlink(&inode->vfs_inode);
                                btrfs_release_path(path);
 
-                               ret = btrfs_unlink_inode(trans, root, dir, inode,
+                               ret = btrfs_unlink_inode(trans, dir, inode,
                                                victim_name, victim_name_len);
                                kfree(victim_name);
                                if (ret)
@@ -1162,7 +1190,7 @@ again:
                                        inc_nlink(&inode->vfs_inode);
                                        btrfs_release_path(path);
 
-                                       ret = btrfs_unlink_inode(trans, root,
+                                       ret = btrfs_unlink_inode(trans,
                                                        BTRFS_I(victim_parent),
                                                        inode,
                                                        victim_name,
@@ -1192,7 +1220,7 @@ next:
        if (IS_ERR(di)) {
                return PTR_ERR(di);
        } else if (di) {
-               ret = drop_one_dir_item(trans, root, path, dir, di);
+               ret = drop_one_dir_item(trans, path, dir, di);
                if (ret)
                        return ret;
        }
@@ -1204,7 +1232,7 @@ next:
        if (IS_ERR(di)) {
                return PTR_ERR(di);
        } else if (di) {
-               ret = drop_one_dir_item(trans, root, path, dir, di);
+               ret = drop_one_dir_item(trans, path, dir, di);
                if (ret)
                        return ret;
        }
@@ -1324,7 +1352,7 @@ again:
                                kfree(name);
                                goto out;
                        }
-                       ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+                       ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
                                                 inode, name, namelen);
                        kfree(name);
                        iput(dir);
@@ -1385,10 +1413,11 @@ out:
        return ret;
 }
 
-static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+static int add_link(struct btrfs_trans_handle *trans,
                    struct inode *dir, struct inode *inode, const char *name,
                    int namelen, u64 ref_index)
 {
+       struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_dir_item *dir_item;
        struct btrfs_key key;
        struct btrfs_path *path;
@@ -1422,7 +1451,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                ret = -ENOENT;
                goto out;
        }
-       ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
+       ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
                                 name, namelen);
        if (ret)
                goto out;
@@ -1568,7 +1597,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                        ret = btrfs_inode_ref_exists(inode, dir, key->type,
                                                     name, namelen);
                        if (ret > 0) {
-                               ret = btrfs_unlink_inode(trans, root,
+                               ret = btrfs_unlink_inode(trans,
                                                         BTRFS_I(dir),
                                                         BTRFS_I(inode),
                                                         name, namelen);
@@ -1584,7 +1613,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                goto out;
 
                        /* insert our name */
-                       ret = add_link(trans, root, dir, inode, name, namelen,
+                       ret = add_link(trans, dir, inode, name, namelen,
                                       ref_index);
                        if (ret)
                                goto out;
@@ -2021,7 +2050,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
        if (!exists)
                goto out;
 
-       ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
+       ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di);
        if (ret)
                goto out;
 
@@ -2251,13 +2280,13 @@ out:
  * to is unlinked
  */
 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
-                                     struct btrfs_root *root,
                                      struct btrfs_root *log,
                                      struct btrfs_path *path,
                                      struct btrfs_path *log_path,
                                      struct inode *dir,
                                      struct btrfs_key *dir_key)
 {
+       struct btrfs_root *root = BTRFS_I(dir)->root;
        int ret;
        struct extent_buffer *eb;
        int slot;
@@ -2318,7 +2347,7 @@ again:
                        }
 
                        inc_nlink(inode);
-                       ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
+                       ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
                                        BTRFS_I(inode), name, name_len);
                        if (!ret)
                                ret = btrfs_run_delayed_items(trans);
@@ -2500,7 +2529,9 @@ again:
                else {
                        ret = find_dir_range(log, path, dirid, key_type,
                                             &range_start, &range_end);
-                       if (ret != 0)
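+                       /*
+                        * find_dir_range() may also fail; propagate the error
+                        * instead of treating it like "no more ranges left".
+                        */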
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
                                break;
                }
 
@@ -2529,7 +2560,7 @@ again:
                        if (found_key.offset > range_end)
                                break;
 
-                       ret = check_item_in_log(trans, root, log, path,
+                       ret = check_item_in_log(trans, log, path,
                                                log_path, dir,
                                                &found_key);
                        if (ret)
@@ -3037,9 +3068,6 @@ static void wait_for_writer(struct btrfs_root *root)
 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
                                        struct btrfs_log_ctx *ctx)
 {
-       if (!ctx)
-               return;
-
        mutex_lock(&root->log_mutex);
        list_del_init(&ctx->list);
        mutex_unlock(&root->log_mutex);
@@ -3328,7 +3356,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * writing the super here would result in transid mismatches.  If there
         * is an error here just bail.
         */
-       if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+       if (BTRFS_FS_ERROR(fs_info)) {
                ret = -EIO;
                btrfs_set_log_full_commit(trans);
                btrfs_abort_transaction(trans, ret);
@@ -3452,6 +3480,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
        if (inode->logged_trans == trans->transid)
                return true;
 
+       if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
+               return false;
+
        /*
         * The inode's logged_trans is always 0 when we load it (because it is
         * not persisted in the inode item or elsewhere). So if it is 0, the
@@ -3490,10 +3521,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans,
  * This optimization allows us to avoid relogging the entire inode
  * or the entire directory.
  */
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                const char *name, int name_len,
-                                struct btrfs_inode *dir, u64 index)
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 const char *name, int name_len,
+                                 struct btrfs_inode *dir, u64 index)
 {
        struct btrfs_root *log;
        struct btrfs_dir_item *di;
@@ -3503,11 +3534,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        u64 dir_ino = btrfs_ino(dir);
 
        if (!inode_logged(trans, dir))
-               return 0;
+               return;
 
        ret = join_running_log_trans(root);
        if (ret)
-               return 0;
+               return;
 
        mutex_lock(&dir->log_mutex);
 
@@ -3555,48 +3586,36 @@ fail:
        btrfs_free_path(path);
 out_unlock:
        mutex_unlock(&dir->log_mutex);
-       if (err == -ENOSPC) {
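+       /*
+        * This function no longer returns an error, so any failure here only
+        * forces the next log sync to fall back to a full transaction commit.
+        */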
+       if (err < 0)
                btrfs_set_log_full_commit(trans);
-               err = 0;
-       } else if (err < 0) {
-               btrfs_abort_transaction(trans, err);
-       }
-
        btrfs_end_log_trans(root);
-
-       return err;
 }
 
 /* see comments for btrfs_del_dir_entries_in_log */
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              const char *name, int name_len,
-                              struct btrfs_inode *inode, u64 dirid)
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               const char *name, int name_len,
+                               struct btrfs_inode *inode, u64 dirid)
 {
        struct btrfs_root *log;
        u64 index;
        int ret;
 
        if (!inode_logged(trans, inode))
-               return 0;
+               return;
 
        ret = join_running_log_trans(root);
        if (ret)
-               return 0;
+               return;
        log = root->log_root;
        mutex_lock(&inode->log_mutex);
 
        ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
                                  dirid, &index);
        mutex_unlock(&inode->log_mutex);
-       if (ret == -ENOSPC) {
+       if (ret < 0 && ret != -ENOENT)
                btrfs_set_log_full_commit(trans);
-               ret = 0;
-       } else if (ret < 0 && ret != -ENOENT)
-               btrfs_abort_transaction(trans, ret);
        btrfs_end_log_trans(root);
-
-       return ret;
 }
 
 /*
@@ -3632,31 +3651,231 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *log,
+                                struct extent_buffer *src,
+                                struct btrfs_path *dst_path,
+                                int start_slot,
+                                int count)
+{
+       char *ins_data = NULL;
+       struct btrfs_item_batch batch;
+       struct extent_buffer *dst;
+       unsigned long src_offset;
+       unsigned long dst_offset;
+       struct btrfs_key key;
+       u32 item_size;
+       int ret;
+       int i;
+
+       ASSERT(count > 0);
+       batch.nr = count;
+
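+       /*
+        * For a single item point the batch at the on-stack key and size to
+        * avoid an allocation. For larger batches allocate one buffer that
+        * holds both the array of item data sizes and the array of keys.
+        */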
+       if (count == 1) {
+               btrfs_item_key_to_cpu(src, &key, start_slot);
+               item_size = btrfs_item_size_nr(src, start_slot);
+               batch.keys = &key;
+               batch.data_sizes = &item_size;
+               batch.total_data_size = item_size;
+       } else {
+               struct btrfs_key *ins_keys;
+               u32 *ins_sizes;
+
+               ins_data = kmalloc(count * sizeof(u32) +
+                                  count * sizeof(struct btrfs_key), GFP_NOFS);
+               if (!ins_data)
+                       return -ENOMEM;
+
+               ins_sizes = (u32 *)ins_data;
+               ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
+               batch.keys = ins_keys;
+               batch.data_sizes = ins_sizes;
+               batch.total_data_size = 0;
+
+               for (i = 0; i < count; i++) {
+                       const int slot = start_slot + i;
+
+                       btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
+                       ins_sizes[i] = btrfs_item_size_nr(src, slot);
+                       batch.total_data_size += ins_sizes[i];
+               }
+       }
+
+       ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
+       if (ret)
+               goto out;
+
+       dst = dst_path->nodes[0];
+       /*
+        * Copy all the items in bulk, in a single copy operation. Item data is
+        * organized such that it's placed at the end of a leaf and from right
+        * to left. For example, the data for the second item ends at an offset
+        * that matches the offset where the data for the first item starts, the
+        * data for the third item ends at an offset that matches the offset
+        * where the data of the second item starts, and so on.
+        * Therefore our source and destination start offsets for copy match the
+        * offsets of the last items (highest slots).
+        */
+       dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
+       src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
+       copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
+       btrfs_release_path(dst_path);
+out:
+       kfree(ins_data);
+
+       return ret;
+}
+
+static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
+                                 struct btrfs_inode *inode,
+                                 struct btrfs_path *path,
+                                 struct btrfs_path *dst_path,
+                                 int key_type,
+                                 struct btrfs_log_ctx *ctx)
+{
+       struct btrfs_root *log = inode->root->log_root;
+       struct extent_buffer *src = path->nodes[0];
+       const int nritems = btrfs_header_nritems(src);
+       const u64 ino = btrfs_ino(inode);
+       const bool inode_logged_before = inode_logged(trans, inode);
+       u64 last_logged_key_offset;
+       bool last_found = false;
+       int batch_start = 0;
+       int batch_size = 0;
+       int i;
+
+       if (key_type == BTRFS_DIR_ITEM_KEY)
+               last_logged_key_offset = inode->last_dir_item_offset;
+       else
+               last_logged_key_offset = inode->last_dir_index_offset;
+
+       for (i = path->slots[0]; i < nritems; i++) {
+               struct btrfs_key key;
+               int ret;
+
+               btrfs_item_key_to_cpu(src, &key, i);
+
+               if (key.objectid != ino || key.type != key_type) {
+                       last_found = true;
+                       break;
+               }
+
+               ctx->last_dir_item_offset = key.offset;
+               /*
+                * We must make sure that when we log a directory entry, the
+                * corresponding inode, after log replay, has a matching link
+                * count. For example:
+                *
+                * touch foo
+                * mkdir mydir
+                * sync
+                * ln foo mydir/bar
+                * xfs_io -c "fsync" mydir
+                * <crash>
+                * <mount fs and log replay>
+                *
+                * Would result in a fsync log that when replayed, our file inode
+                * would have a link count of 1, but we get two directory entries
+                * pointing to the same inode. After removing one of the names,
+                * it would not be possible to remove the other name, which
+                * always resulted in stale file handle errors, and would not be
+                * possible to rmdir the parent directory, since its i_size could
+                * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
+                * resulting in -ENOTEMPTY errors.
+                */
+               if (!ctx->log_new_dentries) {
+                       struct btrfs_dir_item *di;
+                       struct btrfs_key di_key;
+
+                       di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+                       btrfs_dir_item_key_to_cpu(src, di, &di_key);
+                       if ((btrfs_dir_transid(src, di) == trans->transid ||
+                            btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+                           di_key.type != BTRFS_ROOT_ITEM_KEY)
+                               ctx->log_new_dentries = true;
+               }
+
+               if (!inode_logged_before)
+                       goto add_to_batch;
+
+               /*
+                * If we were logged before and have logged dir items, we can skip
+                * checking if any item with a key offset larger than the last one
+                * we logged is in the log tree, saving time and avoiding adding
+                * contention on the log tree.
+                */
+               if (key.offset > last_logged_key_offset)
+                       goto add_to_batch;
+               /*
+                * Check if the key was already logged before. If not we can add
+                * it to a batch for bulk insertion.
+                */
+               ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
+               if (ret < 0) {
+                       return ret;
+               } else if (ret > 0) {
+                       btrfs_release_path(dst_path);
+                       goto add_to_batch;
+               }
+
+               /*
+                * Item exists in the log. Overwrite the item in the log if it
+                * has different content or do nothing if it has exactly the same
+                * content. And then flush the current batch if any - do it after
+                * overwriting the current item, or we would deadlock otherwise,
+                * since we are holding a path for the existing item.
+                */
+               ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
+               if (ret < 0)
+                       return ret;
+
+               if (batch_size > 0) {
+                       ret = flush_dir_items_batch(trans, log, src, dst_path,
+                                                   batch_start, batch_size);
+                       if (ret < 0)
+                               return ret;
+                       batch_size = 0;
+               }
+               continue;
+add_to_batch:
+               if (batch_size == 0)
+                       batch_start = i;
+               batch_size++;
+       }
+
+       if (batch_size > 0) {
+               int ret;
+
+               ret = flush_dir_items_batch(trans, log, src, dst_path,
+                                           batch_start, batch_size);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return last_found ? 1 : 0;
+}
+
 /*
  * log all the items included in the current transaction for a given
  * directory.  This also creates the range items in the log tree required
  * to replay anything deleted before the fsync
  */
 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root, struct btrfs_inode *inode,
+                         struct btrfs_inode *inode,
                          struct btrfs_path *path,
                          struct btrfs_path *dst_path, int key_type,
                          struct btrfs_log_ctx *ctx,
                          u64 min_offset, u64 *last_offset_ret)
 {
        struct btrfs_key min_key;
+       struct btrfs_root *root = inode->root;
        struct btrfs_root *log = root->log_root;
-       struct extent_buffer *src;
        int err = 0;
        int ret;
-       int i;
-       int nritems;
        u64 first_offset = min_offset;
        u64 last_offset = (u64)-1;
        u64 ino = btrfs_ino(inode);
 
-       log = root->log_root;
-
        min_key.objectid = ino;
        min_key.type = key_type;
        min_key.offset = min_offset;
@@ -3730,62 +3949,14 @@ search:
         * from our directory
         */
        while (1) {
-               struct btrfs_key tmp;
-               src = path->nodes[0];
-               nritems = btrfs_header_nritems(src);
-               for (i = path->slots[0]; i < nritems; i++) {
-                       struct btrfs_dir_item *di;
-
-                       btrfs_item_key_to_cpu(src, &min_key, i);
-
-                       if (min_key.objectid != ino || min_key.type != key_type)
-                               goto done;
-
-                       if (need_resched()) {
-                               btrfs_release_path(path);
-                               cond_resched();
-                               goto search;
-                       }
-
-                       ret = overwrite_item(trans, log, dst_path, src, i,
-                                            &min_key);
-                       if (ret) {
+               ret = process_dir_items_leaf(trans, inode, path, dst_path,
+                                            key_type, ctx);
+               if (ret != 0) {
+                       if (ret < 0)
                                err = ret;
-                               goto done;
-                       }
-
-                       /*
-                        * We must make sure that when we log a directory entry,
-                        * the corresponding inode, after log replay, has a
-                        * matching link count. For example:
-                        *
-                        * touch foo
-                        * mkdir mydir
-                        * sync
-                        * ln foo mydir/bar
-                        * xfs_io -c "fsync" mydir
-                        * <crash>
-                        * <mount fs and log replay>
-                        *
-                        * Would result in a fsync log that when replayed, our
-                        * file inode would have a link count of 1, but we get
-                        * two directory entries pointing to the same inode.
-                        * After removing one of the names, it would not be
-                        * possible to remove the other name, which resulted
-                        * always in stale file handle errors, and would not
-                        * be possible to rmdir the parent directory, since
-                        * its i_size could never decrement to the value
-                        * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
-                        */
-                       di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
-                       btrfs_dir_item_key_to_cpu(src, di, &tmp);
-                       if (ctx &&
-                           (btrfs_dir_transid(src, di) == trans->transid ||
-                            btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
-                           tmp.type != BTRFS_ROOT_ITEM_KEY)
-                               ctx->log_new_dentries = true;
+                       goto done;
                }
-               path->slots[0] = nritems;
+               path->slots[0] = btrfs_header_nritems(path->nodes[0]);
 
                /*
                 * look ahead to the next item and see if it is also
@@ -3799,21 +3970,26 @@ search:
                                err = ret;
                        goto done;
                }
-               btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
-               if (tmp.objectid != ino || tmp.type != key_type) {
+               btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
+               if (min_key.objectid != ino || min_key.type != key_type) {
                        last_offset = (u64)-1;
                        goto done;
                }
                if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
                        ret = overwrite_item(trans, log, dst_path,
                                             path->nodes[0], path->slots[0],
-                                            &tmp);
+                                            &min_key);
                        if (ret)
                                err = ret;
                        else
-                               last_offset = tmp.offset;
+                               last_offset = min_key.offset;
                        goto done;
                }
+               if (need_resched()) {
+                       btrfs_release_path(path);
+                       cond_resched();
+                       goto search;
+               }
        }
 done:
        btrfs_release_path(path);
@@ -3846,7 +4022,7 @@ done:
  * key logged by this transaction.
  */
 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root, struct btrfs_inode *inode,
+                         struct btrfs_inode *inode,
                          struct btrfs_path *path,
                          struct btrfs_path *dst_path,
                          struct btrfs_log_ctx *ctx)
@@ -3856,11 +4032,33 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
        int ret;
        int key_type = BTRFS_DIR_ITEM_KEY;
 
+       /*
+        * If this is the first time we are being logged in the current
+        * transaction, or we were logged before but the inode was evicted and
+        * reloaded later, in which case its logged_trans is 0, reset the values
+        * of the last logged key offsets. Note that we don't use the helper
+        * function inode_logged() here - that is because the function returns
+        * true after an inode eviction, assuming the worst case as it cannot
+        * know for sure if the inode was logged before. So we cannot skip key
+        * searches in the case the inode was evicted, because it may not have
+        * been logged in this transaction and may have been logged in a past
+        * transaction, so we need to reset the last dir item and index offsets
+        * to (u64)-1.
+        */
+       if (inode->logged_trans != trans->transid) {
+               inode->last_dir_item_offset = (u64)-1;
+               inode->last_dir_index_offset = (u64)-1;
+       }
 again:
        min_key = 0;
        max_key = 0;
+       if (key_type == BTRFS_DIR_ITEM_KEY)
+               ctx->last_dir_item_offset = inode->last_dir_item_offset;
+       else
+               ctx->last_dir_item_offset = inode->last_dir_index_offset;
+
        while (1) {
-               ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
+               ret = log_dir_items(trans, inode, path, dst_path, key_type,
                                ctx, min_key, &max_key);
                if (ret)
                        return ret;
@@ -3870,8 +4068,11 @@ again:
        }
 
        if (key_type == BTRFS_DIR_ITEM_KEY) {
+               inode->last_dir_item_offset = ctx->last_dir_item_offset;
                key_type = BTRFS_DIR_INDEX_KEY;
                goto again;
+       } else {
+               inode->last_dir_index_offset = ctx->last_dir_item_offset;
        }
        return 0;
 }
@@ -3882,17 +4083,21 @@ again:
  * This cannot be run for file data extents because it does not
  * free the extents they point to.
  */
-static int drop_objectid_items(struct btrfs_trans_handle *trans,
+static int drop_inode_items(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *log,
                                  struct btrfs_path *path,
-                                 u64 objectid, int max_key_type)
+                                 struct btrfs_inode *inode,
+                                 int max_key_type)
 {
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int start_slot;
 
-       key.objectid = objectid;
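+       /*
+        * If the inode was not logged in the current transaction there is
+        * nothing in the log tree to drop for it.
+        */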
+       if (!inode_logged(trans, inode))
+               return 0;
+
+       key.objectid = btrfs_ino(inode);
        key.type = max_key_type;
        key.offset = (u64)-1;
 
@@ -3909,7 +4114,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
 
-               if (found_key.objectid != objectid)
+               if (found_key.objectid != key.objectid)
                        break;
 
                found_key.offset = 0;
@@ -3934,6 +4139,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+static int truncate_inode_items(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *log_root,
+                               struct btrfs_inode *inode,
+                               u64 new_size, u32 min_type)
+{
+       int ret;
+
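+       /*
+        * btrfs_truncate_inode_items() returns -EAGAIN when it needs to be
+        * called again to continue the truncation, so keep retrying until it
+        * returns a final result.
+        */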
+       do {
+               ret = btrfs_truncate_inode_items(trans, log_root, inode,
+                                                new_size, min_type, NULL);
+       } while (ret == -EAGAIN);
+
+       return ret;
+}
+
 static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct extent_buffer *leaf,
                            struct btrfs_inode_item *item,
@@ -4106,6 +4326,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
        int ret;
        struct btrfs_key *ins_keys;
        u32 *ins_sizes;
+       struct btrfs_item_batch batch;
        char *ins_data;
        int i;
        struct list_head ordered_sums;
@@ -4120,13 +4341,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
        ins_sizes = (u32 *)ins_data;
        ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+       batch.keys = ins_keys;
+       batch.data_sizes = ins_sizes;
+       batch.total_data_size = 0;
+       batch.nr = nr;
 
        for (i = 0; i < nr; i++) {
                ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+               batch.total_data_size += ins_sizes[i];
                btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
        }
-       ret = btrfs_insert_empty_items(trans, log, dst_path,
-                                      ins_keys, ins_sizes, nr);
+       ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
        if (ret) {
                kfree(ins_data);
                return ret;
@@ -4338,13 +4563,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
 }
 
 static int log_one_extent(struct btrfs_trans_handle *trans,
-                         struct btrfs_inode *inode, struct btrfs_root *root,
+                         struct btrfs_inode *inode,
                          const struct extent_map *em,
                          struct btrfs_path *path,
                          struct btrfs_log_ctx *ctx)
 {
        struct btrfs_drop_extents_args drop_args = { 0 };
-       struct btrfs_root *log = root->log_root;
+       struct btrfs_root *log = inode->root->log_root;
        struct btrfs_file_extent_item *fi;
        struct extent_buffer *leaf;
        struct btrfs_map_token token;
@@ -4357,14 +4582,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       drop_args.path = path;
-       drop_args.start = em->start;
-       drop_args.end = em->start + em->len;
-       drop_args.replace_extent = true;
-       drop_args.extent_item_size = sizeof(*fi);
-       ret = btrfs_drop_extents(trans, log, inode, &drop_args);
-       if (ret)
-               return ret;
+       /*
+        * If this is the first time we are logging the inode in the current
+        * transaction, we can avoid btrfs_drop_extents(), which is expensive
+        * because it does a deletion search, which always acquires write locks
+        * for extent buffers at levels 2, 1 and 0. This not only wastes time
+        * but also adds significant contention in a log tree, since log trees
+        * are small, with a root at level 2 or 3 at most, due to their short
+        * life span.
+        */
+       if (inode_logged(trans, inode)) {
+               drop_args.path = path;
+               drop_args.start = em->start;
+               drop_args.end = em->start + em->len;
+               drop_args.replace_extent = true;
+               drop_args.extent_item_size = sizeof(*fi);
+               ret = btrfs_drop_extents(trans, log, inode, &drop_args);
+               if (ret)
+                       return ret;
+       }
 
        if (!drop_args.extent_inserted) {
                key.objectid = btrfs_ino(inode);
@@ -4522,13 +4758,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
                         * Avoid logging extent items logged in past fsync calls
                         * and leading to duplicate keys in the log tree.
                         */
-                       do {
-                               ret = btrfs_truncate_inode_items(trans,
-                                                        root->log_root,
-                                                        inode, truncate_offset,
-                                                        BTRFS_EXTENT_DATA_KEY,
-                                                        NULL);
-                       } while (ret == -EAGAIN);
+                       ret = truncate_inode_items(trans, root->log_root, inode,
+                                                  truncate_offset,
+                                                  BTRFS_EXTENT_DATA_KEY);
                        if (ret)
                                goto out;
                        dropped_extents = true;
@@ -4555,7 +4787,6 @@ out:
 }
 
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *root,
                                     struct btrfs_inode *inode,
                                     struct btrfs_path *path,
                                     struct btrfs_log_ctx *ctx)
@@ -4620,7 +4851,7 @@ process:
 
                write_unlock(&tree->lock);
 
-               ret = log_one_extent(trans, inode, root, em, path, ctx);
+               ret = log_one_extent(trans, inode, em, path, ctx);
                write_lock(&tree->lock);
                clear_em_logging(tree, em);
                free_extent_map(em);
@@ -4709,11 +4940,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
  * with a journal, ext3/4, xfs, f2fs, etc).
  */
 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
                                struct btrfs_inode *inode,
                                struct btrfs_path *path,
                                struct btrfs_path *dst_path)
 {
+       struct btrfs_root *root = inode->root;
        int ret;
        struct btrfs_key key;
        const u64 ino = btrfs_ino(inode);
@@ -4787,10 +5018,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
  * truncate operation that changes the inode's size.
  */
 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
                           struct btrfs_inode *inode,
                           struct btrfs_path *path)
 {
+       struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        const u64 ino = btrfs_ino(inode);
@@ -5067,7 +5298,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
                                if (IS_ERR(inode)) {
                                        ret = PTR_ERR(inode);
                                } else {
-                                       ret = btrfs_log_inode(trans, root,
+                                       ret = btrfs_log_inode(trans,
                                                      BTRFS_I(inode),
                                                      LOG_OTHER_INODE_ALL,
                                                      ctx);
@@ -5127,8 +5358,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
                 * well because during a rename we pin the log and update the
                 * log with the new name before we unpin it.
                 */
-               ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
-                                     LOG_OTHER_INODE, ctx);
+               ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
                if (ret) {
                        btrfs_add_delayed_iput(inode);
                        continue;
@@ -5239,7 +5469,7 @@ again:
                                        &other_ino, &other_parent);
                        if (ret < 0) {
                                return ret;
-                       } else if (ret > 0 && ctx &&
+                       } else if (ret > 0 &&
                                   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
                                if (ins_nr > 0) {
                                        ins_nr++;
@@ -5339,7 +5569,7 @@ next_key:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root, struct btrfs_inode *inode,
+                          struct btrfs_inode *inode,
                           int inode_only,
                           struct btrfs_log_ctx *ctx)
 {
@@ -5347,7 +5577,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        struct btrfs_path *dst_path;
        struct btrfs_key min_key;
        struct btrfs_key max_key;
-       struct btrfs_root *log = root->log_root;
+       struct btrfs_root *log = inode->root->log_root;
        int err = 0;
        int ret = 0;
        bool fast_search = false;
@@ -5389,22 +5619,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
         * Only run delayed items if we are a directory. We want to make sure
         * all directory indexes hit the fs/subvolume tree so we can find them
         * and figure out which index ranges have to be logged.
-        *
-        * Otherwise commit the delayed inode only if the full sync flag is set,
-        * as we want to make sure an up to date version is in the subvolume
-        * tree so copy_inode_items_to_log() / copy_items() can find it and copy
-        * it to the log tree. For a non full sync, we always log the inode item
-        * based on the in-memory struct btrfs_inode which is always up to date.
         */
-       if (S_ISDIR(inode->vfs_inode.i_mode))
-               ret = btrfs_commit_inode_delayed_items(trans, inode);
-       else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
-               ret = btrfs_commit_inode_delayed_inode(inode);
-
-       if (ret) {
-               btrfs_free_path(path);
-               btrfs_free_path(dst_path);
-               return ret;
+       if (S_ISDIR(inode->vfs_inode.i_mode)) {
+               err = btrfs_commit_inode_delayed_items(trans, inode);
+               if (err)
+                       goto out;
        }
 
        if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
@@ -5443,9 +5662,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
                if (inode_only == LOG_INODE_EXISTS)
                        max_key_type = BTRFS_XATTR_ITEM_KEY;
-               ret = drop_objectid_items(trans, log, path, ino, max_key_type);
+               ret = drop_inode_items(trans, log, path, inode, max_key_type);
        } else {
-               if (inode_only == LOG_INODE_EXISTS) {
+               if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
                        /*
                         * Make sure the new inode item we write to the log has
                         * the same isize as the current one (if it exists).
@@ -5467,19 +5686,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                             &inode->runtime_flags)) {
                        if (inode_only == LOG_INODE_EXISTS) {
                                max_key.type = BTRFS_XATTR_ITEM_KEY;
-                               ret = drop_objectid_items(trans, log, path, ino,
-                                                         max_key.type);
+                               ret = drop_inode_items(trans, log, path, inode,
+                                                      max_key.type);
                        } else {
                                clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                                          &inode->runtime_flags);
                                clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                          &inode->runtime_flags);
-                               while(1) {
-                                       ret = btrfs_truncate_inode_items(trans,
-                                               log, inode, 0, 0, NULL);
-                                       if (ret != -EAGAIN)
-                                               break;
-                               }
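+                               /*
+                                * Only truncate log tree items if the inode
+                                * was logged before in this transaction.
+                                */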
+                               if (inode_logged(trans, inode))
+                                       ret = truncate_inode_items(trans, log,
+                                                                  inode, 0, 0);
                        }
                } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                              &inode->runtime_flags) ||
@@ -5487,8 +5703,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
                        max_key.type = BTRFS_XATTR_ITEM_KEY;
-                       ret = drop_objectid_items(trans, log, path, ino,
-                                                 max_key.type);
+                       ret = drop_inode_items(trans, log, path, inode,
+                                              max_key.type);
                } else {
                        if (inode_only == LOG_INODE_ALL)
                                fast_search = true;
@@ -5511,14 +5727,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
        btrfs_release_path(path);
        btrfs_release_path(dst_path);
-       err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+       err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
        if (err)
                goto out_unlock;
        xattrs_logged = true;
        if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
                btrfs_release_path(path);
                btrfs_release_path(dst_path);
-               err = btrfs_log_holes(trans, root, inode, path);
+               err = btrfs_log_holes(trans, inode, path);
                if (err)
                        goto out_unlock;
        }
@@ -5538,16 +5754,14 @@ log_extents:
                 * BTRFS_INODE_COPY_EVERYTHING set.
                 */
                if (!xattrs_logged && inode->logged_trans < trans->transid) {
-                       err = btrfs_log_all_xattrs(trans, root, inode, path,
-                                                  dst_path);
+                       err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
                        if (err)
                                goto out_unlock;
                        btrfs_release_path(path);
                }
        }
        if (fast_search) {
-               ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-                                               ctx);
+               ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
                if (ret) {
                        err = ret;
                        goto out_unlock;
@@ -5562,59 +5776,52 @@ log_extents:
        }
 
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
-               ret = log_directory_changes(trans, root, inode, path, dst_path,
-                                       ctx);
+               ret = log_directory_changes(trans, inode, path, dst_path, ctx);
                if (ret) {
                        err = ret;
                        goto out_unlock;
                }
        }
 
+       spin_lock(&inode->lock);
+       inode->logged_trans = trans->transid;
        /*
-        * If we are logging that an ancestor inode exists as part of logging a
-        * new name from a link or rename operation, don't mark the inode as
-        * logged - otherwise if an explicit fsync is made against an ancestor,
-        * the fsync considers the inode in the log and doesn't sync the log,
-        * resulting in the ancestor missing after a power failure unless the
-        * log was synced as part of an fsync against any other unrelated inode.
-        * So keep it simple for this case and just don't flag the ancestors as
-        * logged.
+        * Don't update last_log_commit if we logged that an inode exists.
+        * We do this for three reasons:
+        *
+        * 1) We might have had buffered writes to this inode that were
+        *    flushed and had their ordered extents completed in this
+        *    transaction, but we did not previously log the inode with
+        *    LOG_INODE_ALL. Later the inode was evicted and after that
+        *    it was loaded again and this LOG_INODE_EXISTS log operation
+        *    happened. We must make sure that if an explicit fsync against
+        *    the inode is performed later, it logs the new extents, an
+        *    updated inode item, etc, and syncs the log. The same logic
+        *    applies to direct IO writes instead of buffered writes.
+        *
+        * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
+        *    is logged with an i_size of 0 or whatever value was logged
+        *    before. If later the i_size of the inode is increased by a
+        *    truncate operation, the log is synced through an fsync of
+        *    some other inode and then finally an explicit fsync against
+        *    this inode is made, we must make sure this fsync logs the
+        *    inode with the new i_size, the hole between old i_size and
+        *    the new i_size, and syncs the log.
+        *
+        * 3) If we are logging that an ancestor inode exists as part of
+        *    logging a new name from a link or rename operation, don't update
+        *    its last_log_commit - otherwise if an explicit fsync is made
+        *    against an ancestor, the fsync considers the inode in the log
+        *    and doesn't sync the log, resulting in the ancestor missing after
+        *    a power failure unless the log was synced as part of an fsync
+        *    against any other unrelated inode.
         */
-       if (!ctx ||
-           !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
-             &inode->vfs_inode != ctx->inode)) {
-               spin_lock(&inode->lock);
-               inode->logged_trans = trans->transid;
-               /*
-                * Don't update last_log_commit if we logged that an inode exists.
-                * We do this for two reasons:
-                *
-                * 1) We might have had buffered writes to this inode that were
-                *    flushed and had their ordered extents completed in this
-                *    transaction, but we did not previously log the inode with
-                *    LOG_INODE_ALL. Later the inode was evicted and after that
-                *    it was loaded again and this LOG_INODE_EXISTS log operation
-                *    happened. We must make sure that if an explicit fsync against
-                *    the inode is performed later, it logs the new extents, an
-                *    updated inode item, etc, and syncs the log. The same logic
-                *    applies to direct IO writes instead of buffered writes.
-                *
-                * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
-                *    is logged with an i_size of 0 or whatever value was logged
-                *    before. If later the i_size of the inode is increased by a
-                *    truncate operation, the log is synced through an fsync of
-                *    some other inode and then finally an explicit fsync against
-                *    this inode is made, we must make sure this fsync logs the
-                *    inode with the new i_size, the hole between old i_size and
-                *    the new i_size, and syncs the log.
-                */
-               if (inode_only != LOG_INODE_EXISTS)
-                       inode->last_log_commit = inode->last_sub_trans;
-               spin_unlock(&inode->lock);
-       }
+       if (inode_only != LOG_INODE_EXISTS)
+               inode->last_log_commit = inode->last_sub_trans;
+       spin_unlock(&inode->lock);
 out_unlock:
        mutex_unlock(&inode->log_mutex);
-
+out:
        btrfs_free_path(path);
        btrfs_free_path(dst_path);
        return err;
@@ -5714,6 +5921,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
        struct btrfs_dir_list *dir_elem;
        int ret = 0;
 
+       /*
+        * If we are logging a new name, as part of a link or rename operation,
+        * don't bother logging new dentries, as we just want to log the names
+        * of an inode and that any new parents exist.
+        */
+       if (ctx->logging_new_name)
+               return 0;
+
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -5790,7 +6005,7 @@ process_leaf:
                        ctx->log_new_dentries = false;
                        if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
                                log_mode = LOG_INODE_ALL;
-                       ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
+                       ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
                                              log_mode, ctx);
                        btrfs_add_delayed_iput(di_inode);
                        if (ret)
@@ -5934,11 +6149,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
                                continue;
                        }
 
-                       if (ctx)
-                               ctx->log_new_dentries = false;
-                       ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
+                       ctx->log_new_dentries = false;
+                       ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
                                              LOG_INODE_ALL, ctx);
-                       if (!ret && ctx && ctx->log_new_dentries)
+                       if (!ret && ctx->log_new_dentries)
                                ret = log_new_dir_dentries(trans, root,
                                                   BTRFS_I(dir_inode), ctx);
                        btrfs_add_delayed_iput(dir_inode);
@@ -5984,7 +6198,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
 
                if (BTRFS_I(inode)->generation >= trans->transid &&
                    need_log_inode(trans, BTRFS_I(inode)))
-                       ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
+                       ret = btrfs_log_inode(trans, BTRFS_I(inode),
                                              LOG_INODE_EXISTS, ctx);
                btrfs_add_delayed_iput(inode);
                if (ret)
@@ -6039,7 +6253,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
 
                if (inode->generation >= trans->transid &&
                    need_log_inode(trans, inode)) {
-                       ret = btrfs_log_inode(trans, root, inode,
+                       ret = btrfs_log_inode(trans, inode,
                                              LOG_INODE_EXISTS, ctx);
                        if (ret)
                                break;
@@ -6182,7 +6396,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        if (ret)
                goto end_no_trans;
 
-       ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
+       ret = btrfs_log_inode(trans, inode, inode_only, ctx);
        if (ret)
                goto end_trans;
 
@@ -6199,7 +6413,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                goto end_trans;
        }
 
-       if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
+       if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
                log_dentries = true;
 
        /*
@@ -6325,8 +6539,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 
        ret = walk_log_tree(trans, log_root_tree, &wc);
        if (ret) {
-               btrfs_handle_fs_error(fs_info, ret,
-                       "Failed to pin buffers while recovering log root tree.");
+               btrfs_abort_transaction(trans, ret);
                goto error;
        }
 
@@ -6339,8 +6552,7 @@ again:
                ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 
                if (ret < 0) {
-                       btrfs_handle_fs_error(fs_info, ret,
-                                   "Couldn't find tree log root.");
+                       btrfs_abort_transaction(trans, ret);
                        goto error;
                }
                if (ret > 0) {
@@ -6357,8 +6569,7 @@ again:
                log = btrfs_read_tree_root(log_root_tree, &found_key);
                if (IS_ERR(log)) {
                        ret = PTR_ERR(log);
-                       btrfs_handle_fs_error(fs_info, ret,
-                                   "Couldn't read tree log root.");
+                       btrfs_abort_transaction(trans, ret);
                        goto error;
                }
 
@@ -6386,8 +6597,7 @@ again:
 
                        if (!ret)
                                goto next;
-                       btrfs_handle_fs_error(fs_info, ret,
-                               "Couldn't read target root for tree log recovery.");
+                       btrfs_abort_transaction(trans, ret);
                        goto error;
                }
 
@@ -6395,14 +6605,15 @@ again:
                ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
                if (ret)
                        /* The loop needs to continue due to the root refs */
-                       btrfs_handle_fs_error(fs_info, ret,
-                               "failed to record the log root in transaction");
+                       btrfs_abort_transaction(trans, ret);
                else
                        ret = walk_log_tree(trans, log, &wc);
 
                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
                        ret = fixup_inode_link_counts(trans, wc.replay_dest,
                                                      path);
+                       if (ret)
+                               btrfs_abort_transaction(trans, ret);
                }
 
                if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
@@ -6419,6 +6630,8 @@ again:
                         * could only happen during mount.
                         */
                        ret = btrfs_init_root_free_objectid(root);
+                       if (ret)
+                               btrfs_abort_transaction(trans, ret);
                }
 
                wc.replay_dest->log_root = NULL;
index 731bd9c..f6811c3 100644
@@ -17,6 +17,8 @@ struct btrfs_log_ctx {
        int log_transid;
        bool log_new_dentries;
        bool logging_new_name;
+       /* Tracks the last logged dir item/index key offset. */
+       u64 last_dir_item_offset;
        struct inode *inode;
        struct list_head list;
        /* Only used for fast fsyncs. */
@@ -68,14 +70,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct dentry *dentry,
                          struct btrfs_log_ctx *ctx);
-int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
-                                const char *name, int name_len,
-                                struct btrfs_inode *dir, u64 index);
-int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              const char *name, int name_len,
-                              struct btrfs_inode *inode, u64 dirid);
+void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 const char *name, int name_len,
+                                 struct btrfs_inode *dir, u64 index);
+void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               const char *name, int name_len,
+                               struct btrfs_inode *inode, u64 dirid);
 void btrfs_end_log_trans(struct btrfs_root *root);
 void btrfs_pin_log_trans(struct btrfs_root *root);
 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
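
For context, a minimal and purely hypothetical caller sketch (not part of this series; the function name is made up) illustrating how the two deletion helpers above are used now that they return void: callers simply invoke them, since any internal failure only marks the log for a full transaction commit on the next sync.

        /* Hypothetical caller: log updates for an unlink, with no return values to check. */
        static void example_unlink_log_updates(struct btrfs_trans_handle *trans,
                                               struct btrfs_inode *dir,
                                               struct btrfs_inode *inode,
                                               const char *name, int name_len,
                                               u64 index)
        {
                struct btrfs_root *root = dir->root;

                btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
                btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, btrfs_ino(dir));
        }
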
index 9533f35..61ac57b 100644
@@ -14,6 +14,7 @@
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
+#include <linux/namei.h>
 #include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -250,7 +251,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
-                            struct btrfs_bio **bbio_ret,
+                            struct btrfs_io_context **bioc_ret,
                             int mirror_num, int need_raid_map);
 
 /*
@@ -812,9 +813,13 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 
                device = NULL;
        } else {
+               struct btrfs_dev_lookup_args args = {
+                       .devid = devid,
+                       .uuid = disk_super->dev_item.uuid,
+               };
+
                mutex_lock(&fs_devices->device_list_mutex);
-               device = btrfs_find_device(fs_devices, devid,
-                               disk_super->dev_item.uuid, NULL);
+               device = btrfs_find_device(fs_devices, &args);
 
                /*
                 * If this disk has been pulled into an fs devices created by
@@ -1091,7 +1096,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
        list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
                __btrfs_free_extra_devids(seed_dev, &latest_dev);
 
-       fs_devices->latest_bdev = latest_dev->bdev;
+       fs_devices->latest_dev = latest_dev;
 
        mutex_unlock(&uuid_mutex);
 }
@@ -1122,8 +1127,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
        if (device->devid == BTRFS_DEV_REPLACE_DEVID)
                clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 
-       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+               clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                fs_devices->missing_devices--;
+       }
 
        btrfs_close_bdev(device);
        if (device->bdev) {
@@ -1222,7 +1229,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                return -EINVAL;
 
        fs_devices->opened = 1;
-       fs_devices->latest_bdev = latest_dev->bdev;
+       fs_devices->latest_dev = latest_dev;
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
        fs_devices->read_policy = BTRFS_READ_POLICY_PID;
@@ -1843,8 +1850,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
 
+       btrfs_reserve_chunk_metadata(trans, true);
        ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
                                      &key, sizeof(*dev_item));
+       btrfs_trans_release_chunk_metadata(trans);
        if (ret)
                goto out;
 
@@ -1882,18 +1891,22 @@ out:
 /*
  * Function to update ctime/mtime for a given device path.
  * Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here; this is just to be kind to userspace.
  */
-static void update_dev_time(struct block_device *bdev)
+static void update_dev_time(const char *device_path)
 {
-       struct inode *inode = bdev->bd_inode;
+       struct path path;
        struct timespec64 now;
+       int ret;
 
-       /* Shouldn't happen but just in case. */
-       if (!inode)
+       ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+       if (ret)
                return;
 
-       now = current_time(inode);
-       generic_update_time(inode, &now, S_MTIME | S_CTIME);
+       now = current_time(d_inode(path.dentry));
+       inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+       path_put(&path);
 }
 
 static int btrfs_rm_dev_item(struct btrfs_device *device)
@@ -1917,7 +1930,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
 
+       btrfs_reserve_chunk_metadata(trans, false);
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       btrfs_trans_release_chunk_metadata(trans);
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
@@ -1986,7 +2001,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
 }
 
 /*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
  * and replace it with the provided or the next active device. In the context
  * where this function is called, there should always be another device (or
  * this_dev) which is active.
@@ -2005,8 +2020,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
                        (fs_info->sb->s_bdev == device->bdev))
                fs_info->sb->s_bdev = next_device->bdev;
 
-       if (fs_info->fs_devices->latest_bdev == device->bdev)
-               fs_info->fs_devices->latest_bdev = next_device->bdev;
+       if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+               fs_info->fs_devices->latest_dev = next_device;
 }
 
 /*
@@ -2069,11 +2084,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
 
        /* Update ctime/mtime for device path for libblkid */
-       update_dev_time(bdev);
+       update_dev_time(device_path);
 }
 
-int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
-                   u64 devid, struct block_device **bdev, fmode_t *mode)
+int btrfs_rm_device(struct btrfs_fs_info *fs_info,
+                   struct btrfs_dev_lookup_args *args,
+                   struct block_device **bdev, fmode_t *mode)
 {
        struct btrfs_device *device;
        struct btrfs_fs_devices *cur_devices;
@@ -2081,22 +2097,23 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        u64 num_devices;
        int ret = 0;
 
-       mutex_lock(&uuid_mutex);
-
+       /*
+        * The device list in fs_devices is accessed without locks (neither
+        * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+        * filesystem and another device rm cannot run.
+        */
        num_devices = btrfs_num_devices(fs_info);
 
        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
        if (ret)
                goto out;
 
-       device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
-
-       if (IS_ERR(device)) {
-               if (PTR_ERR(device) == -ENOENT &&
-                   device_path && strcmp(device_path, "missing") == 0)
+       device = btrfs_find_device(fs_info->fs_devices, args);
+       if (!device) {
+               if (args->missing)
                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
                else
-                       ret = PTR_ERR(device);
+                       ret = -ENOENT;
                goto out;
        }
 
@@ -2126,11 +2143,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
                mutex_unlock(&fs_info->chunk_mutex);
        }
 
-       mutex_unlock(&uuid_mutex);
        ret = btrfs_shrink_device(device, 0);
        if (!ret)
                btrfs_reada_remove_dev(device);
-       mutex_lock(&uuid_mutex);
        if (ret)
                goto error_undo;
 
@@ -2159,7 +2174,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        /*
         * In normal cases the cur_devices == fs_devices. But in case
         * of deleting a seed device, the cur_devices should point to
-        * its own fs_devices listed under the fs_devices->seed.
+        * its own fs_devices listed under the fs_devices->seed_list.
         */
        cur_devices = device->fs_devices;
        mutex_lock(&fs_devices->device_list_mutex);
@@ -2210,14 +2225,21 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        synchronize_rcu();
        btrfs_free_device(device);
 
-       if (cur_devices->open_devices == 0) {
+       /*
+        * This can happen if cur_devices is the private seed devices list.  We
+        * cannot call close_fs_devices() here because it expects the uuid_mutex
+        * to be held, but we don't need that for the private seed_devices: we
+        * can simply decrement cur_devices->opened, remove it from our list,
+        * and free the fs_devices.
+        */
+       if (cur_devices->num_devices == 0) {
                list_del_init(&cur_devices->seed_list);
-               close_fs_devices(cur_devices);
+               ASSERT(cur_devices->opened == 1);
+               cur_devices->opened--;
                free_fs_devices(cur_devices);
        }
 
 out:
-       mutex_unlock(&uuid_mutex);
        return ret;
 
 error_undo:
@@ -2305,13 +2327,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
 
        mutex_unlock(&fs_devices->device_list_mutex);
 
-       /*
-        * The update_dev_time() with in btrfs_scratch_superblocks()
-        * may lead to a call to btrfs_show_devname() which will try
-        * to hold device_list_mutex. And here this device
-        * is already out of device list, so we don't have to hold
-        * the device_list_mutex lock.
-        */
        btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
                                  tgtdev->name->str);
 
@@ -2320,69 +2335,98 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
        btrfs_free_device(tgtdev);
 }
 
-static struct btrfs_device *btrfs_find_device_by_path(
-               struct btrfs_fs_info *fs_info, const char *device_path)
+/**
+ * btrfs_get_dev_args_from_path - populate args from the device at @path
+ *
+ * @fs_info:   the filesystem
+ * @args:      the args to populate
+ * @path:      the path to the device
+ *
+ * This will read the super block of the device at @path and populate @args with
+ * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
+ * look up a device to operate on, but need to do it before we take any locks.
+ * This properly handles the special case of "missing" that a user may pass in,
+ * and does some basic sanity checks.  The caller must make sure that @path is
+ * properly NUL terminated before calling in, and must call
+ * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
+ * uuid buffers.
+ *
+ * Return: 0 for success, -errno for failure
+ */
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+                                struct btrfs_dev_lookup_args *args,
+                                const char *path)
 {
-       int ret = 0;
        struct btrfs_super_block *disk_super;
-       u64 devid;
-       u8 *dev_uuid;
        struct block_device *bdev;
-       struct btrfs_device *device;
+       int ret;
 
-       ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
-                                   fs_info->bdev_holder, 0, &bdev, &disk_super);
-       if (ret)
-               return ERR_PTR(ret);
+       if (!path || !path[0])
+               return -EINVAL;
+       if (!strcmp(path, "missing")) {
+               args->missing = true;
+               return 0;
+       }
 
-       devid = btrfs_stack_device_id(&disk_super->dev_item);
-       dev_uuid = disk_super->dev_item.uuid;
+       args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
+       args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
+       if (!args->uuid || !args->fsid) {
+               btrfs_put_dev_args_from_path(args);
+               return -ENOMEM;
+       }
+
+       ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0,
+                                   &bdev, &disk_super);
+       if (ret)
+               return ret;
+       args->devid = btrfs_stack_device_id(&disk_super->dev_item);
+       memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
-               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                          disk_super->metadata_uuid);
+               memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
        else
-               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                          disk_super->fsid);
-
+               memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
        btrfs_release_disk_super(disk_super);
-       if (!device)
-               device = ERR_PTR(-ENOENT);
        blkdev_put(bdev, FMODE_READ);
-       return device;
+       return 0;
 }
 
 /*
- * Lookup a device given by device id, or the path if the id is 0.
+ * Only use this jointly with btrfs_get_dev_args_from_path(), because that is
+ * where the ->uuid and ->fsid pointers are allocated; everybody else uses
+ * local variables that don't need to be freed.
  */
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
+{
+       kfree(args->uuid);
+       kfree(args->fsid);
+       args->uuid = NULL;
+       args->fsid = NULL;
+}
+
 struct btrfs_device *btrfs_find_device_by_devspec(
                struct btrfs_fs_info *fs_info, u64 devid,
                const char *device_path)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_device *device;
+       int ret;
 
        if (devid) {
-               device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
-                                          NULL);
+               args.devid = devid;
+               device = btrfs_find_device(fs_info->fs_devices, &args);
                if (!device)
                        return ERR_PTR(-ENOENT);
                return device;
        }
 
-       if (!device_path || !device_path[0])
-               return ERR_PTR(-EINVAL);
-
-       if (strcmp(device_path, "missing") == 0) {
-               /* Find first missing device */
-               list_for_each_entry(device, &fs_info->fs_devices->devices,
-                                   dev_list) {
-                       if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
-                                    &device->dev_state) && !device->bdev)
-                               return device;
-               }
+       ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
+       if (ret)
+               return ERR_PTR(ret);
+       device = btrfs_find_device(fs_info->fs_devices, &args);
+       btrfs_put_dev_args_from_path(&args);
+       if (!device)
                return ERR_PTR(-ENOENT);
-       }
-
-       return btrfs_find_device_by_path(fs_info, device_path);
+       return device;
 }
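
btrfs_rm_device() now takes a pre-populated struct btrfs_dev_lookup_args instead of a
path/devid pair, so the path lookup (which may open a block device) happens before any
locks are taken, matching the contract documented on btrfs_get_dev_args_from_path()
above. A rough sketch of a caller follows; the function name is hypothetical and the
ioctl plumbing is omitted:

static int example_rm_device_by_path(struct btrfs_fs_info *fs_info,
                                     const char *path)
{
        BTRFS_DEV_LOOKUP_ARGS(args);
        struct block_device *bdev = NULL;
        fmode_t mode = 0;
        int ret;

        /* Reads the superblock at @path, or flags args.missing for "missing". */
        ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
        if (ret)
                return ret;

        ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);

        /* Always free the uuid/fsid buffers allocated by the helper. */
        btrfs_put_dev_args_from_path(&args);

        /* A real caller would release @bdev/@mode later, once it is safe. */
        return ret;
}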
 
 /*
@@ -2459,6 +2503,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
  */
 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root = fs_info->chunk_root;
        struct btrfs_path *path;
@@ -2468,7 +2513,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
        struct btrfs_key key;
        u8 fs_uuid[BTRFS_FSID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];
-       u64 devid;
        int ret;
 
        path = btrfs_alloc_path();
@@ -2480,7 +2524,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
        key.type = BTRFS_DEV_ITEM_KEY;
 
        while (1) {
+               btrfs_reserve_chunk_metadata(trans, false);
                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+               btrfs_trans_release_chunk_metadata(trans);
                if (ret < 0)
                        goto error;
 
@@ -2505,13 +2551,14 @@ next_slot:
 
                dev_item = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_dev_item);
-               devid = btrfs_device_id(leaf, dev_item);
+               args.devid = btrfs_device_id(leaf, dev_item);
                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
                                   BTRFS_UUID_SIZE);
                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                                   BTRFS_FSID_SIZE);
-               device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                          fs_uuid);
+               args.uuid = dev_uuid;
+               args.fsid = fs_uuid;
+               device = btrfs_find_device(fs_info->fs_devices, &args);
                BUG_ON(!device); /* Logic error */
 
                if (device->fs_devices->seeding) {
@@ -2627,6 +2674,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
                        btrfs_abort_transaction(trans, ret);
                        goto error_trans;
                }
+               btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+                                               device);
        }
 
        device->fs_devices = fs_devices;
@@ -2733,7 +2782,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
        btrfs_forget_devices(device_path);
 
        /* Update ctime/mtime for blkid or udev */
-       update_dev_time(bdev);
+       update_dev_time(device_path);
 
        return ret;
 
@@ -2826,6 +2875,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        u64 old_total;
        u64 diff;
+       int ret;
 
        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
                return -EACCES;
@@ -2854,7 +2904,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
                              &trans->transaction->dev_update_list);
        mutex_unlock(&fs_info->chunk_mutex);
 
-       return btrfs_update_device(trans, device);
+       btrfs_reserve_chunk_metadata(trans, false);
+       ret = btrfs_update_device(trans, device);
+       btrfs_trans_release_chunk_metadata(trans);
+
+       return ret;
 }
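
btrfs_grow_device() above now brackets the chunk-tree update in btrfs_update_device()
with btrfs_reserve_chunk_metadata()/btrfs_trans_release_chunk_metadata(), the same
pairing this series applies to btrfs_add_dev_item(), btrfs_rm_dev_item() and
btrfs_finish_sprout(). Judging by the call sites, the bool argument is true when an
item is inserted and false for a plain search/update. A minimal sketch of the pattern,
with a hypothetical function name and simplified error handling:

static int example_touch_chunk_tree(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *chunk_root,
                                    struct btrfs_path *path,
                                    struct btrfs_key *key)
{
        int ret;

        /* Reserve metadata for the chunk tree modification, released right after. */
        btrfs_reserve_chunk_metadata(trans, false);
        ret = btrfs_search_slot(trans, chunk_root, key, path, 0, 1);
        btrfs_trans_release_chunk_metadata(trans);
        if (ret > 0)
                ret = -ENOENT;
        return ret;
}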
 
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -3096,7 +3150,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
                const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
                struct btrfs_block_group *sys_bg;
 
-               sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+               sys_bg = btrfs_create_chunk(trans, sys_flags);
                if (IS_ERR(sys_bg)) {
                        ret = PTR_ERR(sys_bg);
                        btrfs_abort_transaction(trans, ret);
@@ -4889,8 +4943,10 @@ again:
                        round_down(old_total - diff, fs_info->sectorsize));
        mutex_unlock(&fs_info->chunk_mutex);
 
+       btrfs_reserve_chunk_metadata(trans, false);
        /* Now btrfs_update_device() will change the on-disk size. */
        ret = btrfs_update_device(trans, device);
+       btrfs_trans_release_chunk_metadata(trans);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
@@ -4973,7 +5029,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
 }
 
 /*
- * Structure used internally for __btrfs_alloc_chunk() function.
+ * Structure used internally for btrfs_create_chunk() function.
  * Wraps needed parameters.
  */
 struct alloc_chunk_ctl {
@@ -5377,7 +5433,7 @@ error_del_extent:
        return block_group;
 }
 
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
                                            u64 type)
 {
        struct btrfs_fs_info *info = trans->fs_info;
@@ -5578,12 +5634,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
         */
 
        alloc_profile = btrfs_metadata_alloc_profile(fs_info);
-       meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       meta_bg = btrfs_create_chunk(trans, alloc_profile);
        if (IS_ERR(meta_bg))
                return PTR_ERR(meta_bg);
 
        alloc_profile = btrfs_system_alloc_profile(fs_info);
-       sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       sys_bg = btrfs_create_chunk(trans, alloc_profile);
        if (IS_ERR(sys_bg))
                return PTR_ERR(sys_bg);
 
@@ -5597,17 +5653,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
        return btrfs_raid_array[index].tolerated_failures;
 }
 
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
        struct extent_map *em;
        struct map_lookup *map;
-       int readonly = 0;
        int miss_ndevs = 0;
        int i;
+       bool ret = true;
 
        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
        if (IS_ERR(em))
-               return 1;
+               return false;
 
        map = em->map_lookup;
        for (i = 0; i < map->num_stripes; i++) {
@@ -5618,21 +5674,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
                }
                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
                                        &map->stripes[i].dev->dev_state)) {
-                       readonly = 1;
+                       ret = false;
                        goto end;
                }
        }
 
        /*
-        * If the number of missing devices is larger than max errors,
-        * we can not write the data into that chunk successfully, so
-        * set it readonly.
+        * If the number of missing devices is larger than max errors, we
+        * cannot write the data into that chunk successfully.
         */
        if (miss_ndevs > btrfs_chunk_max_errors(map))
-               readonly = 1;
+               ret = false;
 end:
        free_extent_map(em);
-       return readonly;
+       return ret;
 }
 
 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
@@ -5795,7 +5850,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 }
 
 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
+static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
 {
        int i;
        int again = 1;
@@ -5804,52 +5859,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
                again = 0;
                for (i = 0; i < num_stripes - 1; i++) {
                        /* Swap if parity is on a smaller index */
-                       if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
-                               swap(bbio->stripes[i], bbio->stripes[i + 1]);
-                               swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
+                       if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
+                               swap(bioc->stripes[i], bioc->stripes[i + 1]);
+                               swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
                                again = 1;
                        }
                }
        }
 }
 
-static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+                                                      int total_stripes,
+                                                      int real_stripes)
 {
-       struct btrfs_bio *bbio = kzalloc(
-                /* the size of the btrfs_bio */
-               sizeof(struct btrfs_bio) +
-               /* plus the variable array for the stripes */
-               sizeof(struct btrfs_bio_stripe) * (total_stripes) +
-               /* plus the variable array for the tgt dev */
+       struct btrfs_io_context *bioc = kzalloc(
+                /* The size of btrfs_io_context */
+               sizeof(struct btrfs_io_context) +
+               /* Plus the variable array for the stripes */
+               sizeof(struct btrfs_io_stripe) * (total_stripes) +
+               /* Plus the variable array for the tgt dev */
                sizeof(int) * (real_stripes) +
                /*
-                * plus the raid_map, which includes both the tgt dev
-                * and the stripes
+                * Plus the raid_map, which includes both the tgt dev
+                * and the stripes.
                 */
                sizeof(u64) * (total_stripes),
                GFP_NOFS|__GFP_NOFAIL);
 
-       atomic_set(&bbio->error, 0);
-       refcount_set(&bbio->refs, 1);
+       atomic_set(&bioc->error, 0);
+       refcount_set(&bioc->refs, 1);
 
-       bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
-       bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
+       bioc->fs_info = fs_info;
+       bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
+       bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
 
-       return bbio;
+       return bioc;
 }
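
alloc_btrfs_io_context() keeps the old single-allocation trick: one kzalloc() holds the
fixed part of the struct, the variable stripe array, the tgtdev map and the raid_map,
and the two pointers are then aimed into the tail of that buffer. Below is a standalone
user-space mock-up of the same layout; the types and names are illustrative only, not
the kernel structures:

#include <stdint.h>
#include <stdlib.h>

struct io_stripe { uint64_t physical; uint64_t length; };

struct io_ctx {
        int *tgtdev_map;                /* carved out of the tail below */
        uint64_t *raid_map;             /* likewise */
        int num_stripes;
        struct io_stripe stripes[];     /* flexible array, sized at alloc time */
};

static struct io_ctx *alloc_io_ctx(int total_stripes, int real_stripes)
{
        struct io_ctx *ctx = calloc(1, sizeof(*ctx) +
                        sizeof(struct io_stripe) * total_stripes +
                        sizeof(int) * real_stripes +
                        sizeof(uint64_t) * total_stripes);

        if (!ctx)
                return NULL;
        ctx->num_stripes = total_stripes;
        /* The extra arrays live directly after the flexible stripe array. */
        ctx->tgtdev_map = (int *)(ctx->stripes + total_stripes);
        ctx->raid_map = (uint64_t *)(ctx->tgtdev_map + real_stripes);
        return ctx;
}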
 
-void btrfs_get_bbio(struct btrfs_bio *bbio)
+void btrfs_get_bioc(struct btrfs_io_context *bioc)
 {
-       WARN_ON(!refcount_read(&bbio->refs));
-       refcount_inc(&bbio->refs);
+       WARN_ON(!refcount_read(&bioc->refs));
+       refcount_inc(&bioc->refs);
 }
 
-void btrfs_put_bbio(struct btrfs_bio *bbio)
+void btrfs_put_bioc(struct btrfs_io_context *bioc)
 {
-       if (!bbio)
+       if (!bioc)
                return;
-       if (refcount_dec_and_test(&bbio->refs))
-               kfree(bbio);
+       if (refcount_dec_and_test(&bioc->refs))
+               kfree(bioc);
 }
 
 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
@@ -5859,11 +5917,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
  */
 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
                                         u64 logical, u64 *length_ret,
-                                        struct btrfs_bio **bbio_ret)
+                                        struct btrfs_io_context **bioc_ret)
 {
        struct extent_map *em;
        struct map_lookup *map;
-       struct btrfs_bio *bbio;
+       struct btrfs_io_context *bioc;
        u64 length = *length_ret;
        u64 offset;
        u64 stripe_nr;
@@ -5882,8 +5940,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
        int ret = 0;
        int i;
 
-       /* discard always return a bbio */
-       ASSERT(bbio_ret);
+       /* Discard always returns a bioc. */
+       ASSERT(bioc_ret);
 
        em = btrfs_get_chunk_map(fs_info, logical, length);
        if (IS_ERR(em))
@@ -5946,26 +6004,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
                                        &stripe_index);
        }
 
-       bbio = alloc_btrfs_bio(num_stripes, 0);
-       if (!bbio) {
+       bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0);
+       if (!bioc) {
                ret = -ENOMEM;
                goto out;
        }
 
        for (i = 0; i < num_stripes; i++) {
-               bbio->stripes[i].physical =
+               bioc->stripes[i].physical =
                        map->stripes[stripe_index].physical +
                        stripe_offset + stripe_nr * map->stripe_len;
-               bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+               bioc->stripes[i].dev = map->stripes[stripe_index].dev;
 
                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
                                 BTRFS_BLOCK_GROUP_RAID10)) {
-                       bbio->stripes[i].length = stripes_per_dev *
+                       bioc->stripes[i].length = stripes_per_dev *
                                map->stripe_len;
 
                        if (i / sub_stripes < remaining_stripes)
-                               bbio->stripes[i].length +=
-                                       map->stripe_len;
+                               bioc->stripes[i].length += map->stripe_len;
 
                        /*
                         * Special for the first stripe and
@@ -5976,19 +6033,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
                         *    off     end_off
                         */
                        if (i < sub_stripes)
-                               bbio->stripes[i].length -=
-                                       stripe_offset;
+                               bioc->stripes[i].length -= stripe_offset;
 
                        if (stripe_index >= last_stripe &&
                            stripe_index <= (last_stripe +
                                             sub_stripes - 1))
-                               bbio->stripes[i].length -=
-                                       stripe_end_offset;
+                               bioc->stripes[i].length -= stripe_end_offset;
 
                        if (i == sub_stripes - 1)
                                stripe_offset = 0;
                } else {
-                       bbio->stripes[i].length = length;
+                       bioc->stripes[i].length = length;
                }
 
                stripe_index++;
@@ -5998,9 +6053,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
                }
        }
 
-       *bbio_ret = bbio;
-       bbio->map_type = map->type;
-       bbio->num_stripes = num_stripes;
+       *bioc_ret = bioc;
+       bioc->map_type = map->type;
+       bioc->num_stripes = num_stripes;
 out:
        free_extent_map(em);
        return ret;
@@ -6024,7 +6079,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
                                         u64 srcdev_devid, int *mirror_num,
                                         u64 *physical)
 {
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        int num_stripes;
        int index_srcdev = 0;
        int found = 0;
@@ -6033,20 +6088,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
        int ret = 0;
 
        ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-                               logical, &length, &bbio, 0, 0);
+                               logical, &length, &bioc, 0, 0);
        if (ret) {
-               ASSERT(bbio == NULL);
+               ASSERT(bioc == NULL);
                return ret;
        }
 
-       num_stripes = bbio->num_stripes;
+       num_stripes = bioc->num_stripes;
        if (*mirror_num > num_stripes) {
                /*
                 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
                 * that means that the requested area is not left of the left
                 * cursor
                 */
-               btrfs_put_bbio(bbio);
+               btrfs_put_bioc(bioc);
                return -EIO;
        }
 
@@ -6056,7 +6111,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
         * pointer to the one of the target drive.
         */
        for (i = 0; i < num_stripes; i++) {
-               if (bbio->stripes[i].dev->devid != srcdev_devid)
+               if (bioc->stripes[i].dev->devid != srcdev_devid)
                        continue;
 
                /*
@@ -6064,15 +6119,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
                 * mirror with the lowest physical address
                 */
                if (found &&
-                   physical_of_found <= bbio->stripes[i].physical)
+                   physical_of_found <= bioc->stripes[i].physical)
                        continue;
 
                index_srcdev = i;
                found = 1;
-               physical_of_found = bbio->stripes[i].physical;
+               physical_of_found = bioc->stripes[i].physical;
        }
 
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
 
        ASSERT(found);
        if (!found)
@@ -6103,12 +6158,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
 }
 
 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
-                                     struct btrfs_bio **bbio_ret,
+                                     struct btrfs_io_context **bioc_ret,
                                      struct btrfs_dev_replace *dev_replace,
                                      u64 logical,
                                      int *num_stripes_ret, int *max_errors_ret)
 {
-       struct btrfs_bio *bbio = *bbio_ret;
+       struct btrfs_io_context *bioc = *bioc_ret;
        u64 srcdev_devid = dev_replace->srcdev->devid;
        int tgtdev_indexes = 0;
        int num_stripes = *num_stripes_ret;
@@ -6138,17 +6193,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
                 */
                index_where_to_add = num_stripes;
                for (i = 0; i < num_stripes; i++) {
-                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                       if (bioc->stripes[i].dev->devid == srcdev_devid) {
                                /* write to new disk, too */
-                               struct btrfs_bio_stripe *new =
-                                       bbio->stripes + index_where_to_add;
-                               struct btrfs_bio_stripe *old =
-                                       bbio->stripes + i;
+                               struct btrfs_io_stripe *new =
+                                       bioc->stripes + index_where_to_add;
+                               struct btrfs_io_stripe *old =
+                                       bioc->stripes + i;
 
                                new->physical = old->physical;
                                new->length = old->length;
                                new->dev = dev_replace->tgtdev;
-                               bbio->tgtdev_map[i] = index_where_to_add;
+                               bioc->tgtdev_map[i] = index_where_to_add;
                                index_where_to_add++;
                                max_errors++;
                                tgtdev_indexes++;
@@ -6168,30 +6223,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
                 * full copy of the source drive.
                 */
                for (i = 0; i < num_stripes; i++) {
-                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                       if (bioc->stripes[i].dev->devid == srcdev_devid) {
                                /*
                                 * In case of DUP, in order to keep it simple,
                                 * only add the mirror with the lowest physical
                                 * address
                                 */
                                if (found &&
-                                   physical_of_found <=
-                                    bbio->stripes[i].physical)
+                                   physical_of_found <= bioc->stripes[i].physical)
                                        continue;
                                index_srcdev = i;
                                found = 1;
-                               physical_of_found = bbio->stripes[i].physical;
+                               physical_of_found = bioc->stripes[i].physical;
                        }
                }
                if (found) {
-                       struct btrfs_bio_stripe *tgtdev_stripe =
-                               bbio->stripes + num_stripes;
+                       struct btrfs_io_stripe *tgtdev_stripe =
+                               bioc->stripes + num_stripes;
 
                        tgtdev_stripe->physical = physical_of_found;
                        tgtdev_stripe->length =
-                               bbio->stripes[index_srcdev].length;
+                               bioc->stripes[index_srcdev].length;
                        tgtdev_stripe->dev = dev_replace->tgtdev;
-                       bbio->tgtdev_map[index_srcdev] = num_stripes;
+                       bioc->tgtdev_map[index_srcdev] = num_stripes;
 
                        tgtdev_indexes++;
                        num_stripes++;
@@ -6200,8 +6254,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
 
        *num_stripes_ret = num_stripes;
        *max_errors_ret = max_errors;
-       bbio->num_tgtdevs = tgtdev_indexes;
-       *bbio_ret = bbio;
+       bioc->num_tgtdevs = tgtdev_indexes;
+       *bioc_ret = bioc;
 }
 
 static bool need_full_stripe(enum btrfs_map_op op)
@@ -6304,7 +6358,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
-                            struct btrfs_bio **bbio_ret,
+                            struct btrfs_io_context **bioc_ret,
                             int mirror_num, int need_raid_map)
 {
        struct extent_map *em;
@@ -6319,7 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        int num_stripes;
        int max_errors = 0;
        int tgtdev_indexes = 0;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        int dev_replace_is_ongoing = 0;
        int num_alloc_stripes;
@@ -6328,7 +6382,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        u64 raid56_full_stripe_start = (u64)-1;
        struct btrfs_io_geometry geom;
 
-       ASSERT(bbio_ret);
+       ASSERT(bioc_ret);
        ASSERT(op != BTRFS_MAP_DISCARD);
 
        em = btrfs_get_chunk_map(fs_info, logical, *length);
@@ -6472,20 +6526,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                tgtdev_indexes = num_stripes;
        }
 
-       bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
-       if (!bbio) {
+       bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
+       if (!bioc) {
                ret = -ENOMEM;
                goto out;
        }
 
        for (i = 0; i < num_stripes; i++) {
-               bbio->stripes[i].physical = map->stripes[stripe_index].physical +
+               bioc->stripes[i].physical = map->stripes[stripe_index].physical +
                        stripe_offset + stripe_nr * map->stripe_len;
-               bbio->stripes[i].dev = map->stripes[stripe_index].dev;
+               bioc->stripes[i].dev = map->stripes[stripe_index].dev;
                stripe_index++;
        }
 
-       /* build raid_map */
+       /* Build raid_map */
        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
            (need_full_stripe(op) || mirror_num > 1)) {
                u64 tmp;
@@ -6497,15 +6551,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                /* Fill in the logical address of each stripe */
                tmp = stripe_nr * data_stripes;
                for (i = 0; i < data_stripes; i++)
-                       bbio->raid_map[(i+rot) % num_stripes] =
+                       bioc->raid_map[(i + rot) % num_stripes] =
                                em->start + (tmp + i) * map->stripe_len;
 
-               bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+               bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE;
                if (map->type & BTRFS_BLOCK_GROUP_RAID6)
-                       bbio->raid_map[(i+rot+1) % num_stripes] =
+                       bioc->raid_map[(i + rot + 1) % num_stripes] =
                                RAID6_Q_STRIPE;
 
-               sort_parity_stripes(bbio, num_stripes);
+               sort_parity_stripes(bioc, num_stripes);
        }
 
        if (need_full_stripe(op))
@@ -6513,15 +6567,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 
        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
            need_full_stripe(op)) {
-               handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
+               handle_ops_on_dev_replace(op, &bioc, dev_replace, logical,
                                          &num_stripes, &max_errors);
        }
 
-       *bbio_ret = bbio;
-       bbio->map_type = map->type;
-       bbio->num_stripes = num_stripes;
-       bbio->max_errors = max_errors;
-       bbio->mirror_num = mirror_num;
+       *bioc_ret = bioc;
+       bioc->map_type = map->type;
+       bioc->num_stripes = num_stripes;
+       bioc->max_errors = max_errors;
+       bioc->mirror_num = mirror_num;
 
        /*
         * this is the case that REQ_READ && dev_replace_is_ongoing &&
@@ -6530,9 +6584,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
         */
        if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
                WARN_ON(num_stripes > 1);
-               bbio->stripes[0].dev = dev_replace->tgtdev;
-               bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
-               bbio->mirror_num = map->num_stripes + 1;
+               bioc->stripes[0].dev = dev_replace->tgtdev;
+               bioc->stripes[0].physical = physical_to_patch_in_first_stripe;
+               bioc->mirror_num = map->num_stripes + 1;
        }
 out:
        if (dev_replace_is_ongoing) {
@@ -6546,43 +6600,43 @@ out:
 
 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                      u64 logical, u64 *length,
-                     struct btrfs_bio **bbio_ret, int mirror_num)
+                     struct btrfs_io_context **bioc_ret, int mirror_num)
 {
        if (op == BTRFS_MAP_DISCARD)
                return __btrfs_map_block_for_discard(fs_info, logical,
-                                                    length, bbio_ret);
+                                                    length, bioc_ret);
 
-       return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
+       return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
                                 mirror_num, 0);
 }
 
 /* For Scrub/replace */
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                     u64 logical, u64 *length,
-                    struct btrfs_bio **bbio_ret)
+                    struct btrfs_io_context **bioc_ret)
 {
-       return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
+       return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
 }
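
The calling convention is unchanged by the rename: callers hand in a NULL-initialized
btrfs_io_context pointer and drop it with btrfs_put_bioc() when done, as
get_extra_mirror_from_replace() above does. A minimal sketch with a hypothetical caller
and trimmed error handling:

static int example_walk_mirrors(struct btrfs_fs_info *fs_info,
                                u64 logical, u64 length)
{
        struct btrfs_io_context *bioc = NULL;
        int ret;
        int i;

        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &length,
                              &bioc, 0);
        if (ret)
                return ret;

        for (i = 0; i < bioc->num_stripes; i++) {
                /* Each stripe names a device and a physical byte offset. */
                pr_debug("stripe %d: devid %llu physical %llu\n", i,
                         bioc->stripes[i].dev->devid,
                         bioc->stripes[i].physical);
        }

        btrfs_put_bioc(bioc);
        return 0;
}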
 
-static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
+static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio)
 {
-       bio->bi_private = bbio->private;
-       bio->bi_end_io = bbio->end_io;
+       bio->bi_private = bioc->private;
+       bio->bi_end_io = bioc->end_io;
        bio_endio(bio);
 
-       btrfs_put_bbio(bbio);
+       btrfs_put_bioc(bioc);
 }
 
 static void btrfs_end_bio(struct bio *bio)
 {
-       struct btrfs_bio *bbio = bio->bi_private;
+       struct btrfs_io_context *bioc = bio->bi_private;
        int is_orig_bio = 0;
 
        if (bio->bi_status) {
-               atomic_inc(&bbio->error);
+               atomic_inc(&bioc->error);
                if (bio->bi_status == BLK_STS_IOERR ||
                    bio->bi_status == BLK_STS_TARGET) {
-                       struct btrfs_device *dev = btrfs_io_bio(bio)->device;
+                       struct btrfs_device *dev = btrfs_bio(bio)->device;
 
                        ASSERT(dev->bdev);
                        if (btrfs_op(bio) == BTRFS_MAP_WRITE)
@@ -6597,22 +6651,22 @@ static void btrfs_end_bio(struct bio *bio)
                }
        }
 
-       if (bio == bbio->orig_bio)
+       if (bio == bioc->orig_bio)
                is_orig_bio = 1;
 
-       btrfs_bio_counter_dec(bbio->fs_info);
+       btrfs_bio_counter_dec(bioc->fs_info);
 
-       if (atomic_dec_and_test(&bbio->stripes_pending)) {
+       if (atomic_dec_and_test(&bioc->stripes_pending)) {
                if (!is_orig_bio) {
                        bio_put(bio);
-                       bio = bbio->orig_bio;
+                       bio = bioc->orig_bio;
                }
 
-               btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+               btrfs_bio(bio)->mirror_num = bioc->mirror_num;
                /* only send an error to the higher layers if it is
                 * beyond the tolerance of the btrfs bio
                 */
-               if (atomic_read(&bbio->error) > bbio->max_errors) {
+               if (atomic_read(&bioc->error) > bioc->max_errors) {
                        bio->bi_status = BLK_STS_IOERR;
                } else {
                        /*
@@ -6622,19 +6676,19 @@ static void btrfs_end_bio(struct bio *bio)
                        bio->bi_status = BLK_STS_OK;
                }
 
-               btrfs_end_bbio(bbio, bio);
+               btrfs_end_bioc(bioc, bio);
        } else if (!is_orig_bio) {
                bio_put(bio);
        }
 }
 
-static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
+static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
                              u64 physical, struct btrfs_device *dev)
 {
-       struct btrfs_fs_info *fs_info = bbio->fs_info;
+       struct btrfs_fs_info *fs_info = bioc->fs_info;
 
-       bio->bi_private = bbio;
-       btrfs_io_bio(bio)->device = dev;
+       bio->bi_private = bioc;
+       btrfs_bio(bio)->device = dev;
        bio->bi_end_io = btrfs_end_bio;
        bio->bi_iter.bi_sector = physical >> 9;
        /*
@@ -6663,20 +6717,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
        btrfsic_submit_bio(bio);
 }
 
-static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
 {
-       atomic_inc(&bbio->error);
-       if (atomic_dec_and_test(&bbio->stripes_pending)) {
+       atomic_inc(&bioc->error);
+       if (atomic_dec_and_test(&bioc->stripes_pending)) {
                /* Should be the original bio. */
-               WARN_ON(bio != bbio->orig_bio);
+               WARN_ON(bio != bioc->orig_bio);
 
-               btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+               btrfs_bio(bio)->mirror_num = bioc->mirror_num;
                bio->bi_iter.bi_sector = logical >> 9;
-               if (atomic_read(&bbio->error) > bbio->max_errors)
+               if (atomic_read(&bioc->error) > bioc->max_errors)
                        bio->bi_status = BLK_STS_IOERR;
                else
                        bio->bi_status = BLK_STS_OK;
-               btrfs_end_bbio(bbio, bio);
+               btrfs_end_bioc(bioc, bio);
        }
 }
 
@@ -6691,36 +6745,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
        int ret;
        int dev_nr;
        int total_devs;
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
 
        length = bio->bi_iter.bi_size;
        map_length = length;
 
        btrfs_bio_counter_inc_blocked(fs_info);
        ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
-                               &map_length, &bbio, mirror_num, 1);
+                               &map_length, &bioc, mirror_num, 1);
        if (ret) {
                btrfs_bio_counter_dec(fs_info);
                return errno_to_blk_status(ret);
        }
 
-       total_devs = bbio->num_stripes;
-       bbio->orig_bio = first_bio;
-       bbio->private = first_bio->bi_private;
-       bbio->end_io = first_bio->bi_end_io;
-       bbio->fs_info = fs_info;
-       atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+       total_devs = bioc->num_stripes;
+       bioc->orig_bio = first_bio;
+       bioc->private = first_bio->bi_private;
+       bioc->end_io = first_bio->bi_end_io;
+       atomic_set(&bioc->stripes_pending, bioc->num_stripes);
 
-       if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+       if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
            ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
                /* In this case, map_length has been set to the length of
                   a single stripe; not the whole write */
                if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
-                       ret = raid56_parity_write(fs_info, bio, bbio,
-                                                 map_length);
+                       ret = raid56_parity_write(bio, bioc, map_length);
                } else {
-                       ret = raid56_parity_recover(fs_info, bio, bbio,
-                                                   map_length, mirror_num, 1);
+                       ret = raid56_parity_recover(bio, bioc, map_length,
+                                                   mirror_num, 1);
                }
 
                btrfs_bio_counter_dec(fs_info);
@@ -6735,12 +6787,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
        }
 
        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
-               dev = bbio->stripes[dev_nr].dev;
+               dev = bioc->stripes[dev_nr].dev;
                if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
                                                   &dev->dev_state) ||
                    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-                       bbio_error(bbio, first_bio, logical);
+                       bioc_error(bioc, first_bio, logical);
                        continue;
                }
 
@@ -6749,12 +6801,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
                else
                        bio = first_bio;
 
-               submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
+               submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
        }
        btrfs_bio_counter_dec(fs_info);
        return BLK_STS_OK;
 }
 
+static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
+                                     const struct btrfs_fs_devices *fs_devices)
+{
+       if (args->fsid == NULL)
+               return true;
+       if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
+               return true;
+       return false;
+}
+
+static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
+                                 const struct btrfs_device *device)
+{
+       ASSERT((args->devid != (u64)-1) || args->missing);
+
+       if ((args->devid != (u64)-1) && device->devid != args->devid)
+               return false;
+       if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
+               return false;
+       if (!args->missing)
+               return true;
+       if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
+           !device->bdev)
+               return true;
+       return false;
+}
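+
+The helper above treats a devid of (u64)-1 as "match any devid" (which is why the
+ASSERT insists that ->missing is set in that case), compares the uuid only when one was
+supplied, and for missing-device lookups accepts only devices that are recorded in the
+FS metadata but have no bdev. The rewritten btrfs_find_device() below simply applies
+these helpers over the main and seed device lists. Two lookup sketches, assuming
+BTRFS_DEV_LOOKUP_ARGS() leaves devid unspecified; the devid value is purely
+illustrative:
+
+static void example_lookups(struct btrfs_fs_info *fs_info)
+{
+        BTRFS_DEV_LOOKUP_ARGS(by_devid);
+        BTRFS_DEV_LOOKUP_ARGS(find_missing);
+        struct btrfs_device *device;
+
+        /* Exact lookup by devid; uuid/fsid stay NULL, so only devid is compared. */
+        by_devid.devid = 3;
+        device = btrfs_find_device(fs_info->fs_devices, &by_devid);
+
+        /* First device present in the FS metadata but without an opened bdev. */
+        find_missing.missing = true;
+        device = btrfs_find_device(fs_info->fs_devices, &find_missing);
+        (void)device;
+}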
+
 /*
  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
  * return NULL.
@@ -6762,31 +6841,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
  * If devid and uuid are both specified, the match must be exact, otherwise
  * only devid is used.
  */
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
-                                      u64 devid, u8 *uuid, u8 *fsid)
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+                                      const struct btrfs_dev_lookup_args *args)
 {
        struct btrfs_device *device;
        struct btrfs_fs_devices *seed_devs;
 
-       if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+       if (dev_args_match_fs_devices(args, fs_devices)) {
                list_for_each_entry(device, &fs_devices->devices, dev_list) {
-                       if (device->devid == devid &&
-                           (!uuid || memcmp(device->uuid, uuid,
-                                            BTRFS_UUID_SIZE) == 0))
+                       if (dev_args_match_device(args, device))
                                return device;
                }
        }
 
        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
-               if (!fsid ||
-                   !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
-                       list_for_each_entry(device, &seed_devs->devices,
-                                           dev_list) {
-                               if (device->devid == devid &&
-                                   (!uuid || memcmp(device->uuid, uuid,
-                                                    BTRFS_UUID_SIZE) == 0))
-                                       return device;
-                       }
+               if (!dev_args_match_fs_devices(args, seed_devs))
+                       continue;
+               list_for_each_entry(device, &seed_devs->devices, dev_list) {
+                       if (dev_args_match_device(args, device))
+                               return device;
                }
        }
 
@@ -6952,6 +7025,7 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
                          struct btrfs_chunk *chunk)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
        struct map_lookup *map;
@@ -7029,11 +7103,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
                map->stripes[i].physical =
                        btrfs_stripe_offset_nr(leaf, chunk, i);
                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+               args.devid = devid;
                read_extent_buffer(leaf, uuid, (unsigned long)
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
-               map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
-                                                       devid, uuid, NULL);
+               args.uuid = uuid;
+               map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
                if (!map->stripes[i].dev &&
                    !btrfs_test_opt(fs_info, DEGRADED)) {
                        free_extent_map(em);
@@ -7151,6 +7226,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
 static int read_one_dev(struct extent_buffer *leaf,
                        struct btrfs_dev_item *dev_item)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
@@ -7159,11 +7235,13 @@ static int read_one_dev(struct extent_buffer *leaf,
        u8 fs_uuid[BTRFS_FSID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];
 
-       devid = btrfs_device_id(leaf, dev_item);
+       devid = args.devid = btrfs_device_id(leaf, dev_item);
        read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
                           BTRFS_UUID_SIZE);
        read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
                           BTRFS_FSID_SIZE);
+       args.uuid = dev_uuid;
+       args.fsid = fs_uuid;
 
        if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
                fs_devices = open_seed_devices(fs_info, fs_uuid);
@@ -7171,8 +7249,7 @@ static int read_one_dev(struct extent_buffer *leaf,
                        return PTR_ERR(fs_devices);
        }
 
-       device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
-                                  fs_uuid);
+       device = btrfs_find_device(fs_info->fs_devices, &args);
        if (!device) {
                if (!btrfs_test_opt(fs_info, DEGRADED)) {
                        btrfs_report_missing_device(fs_info, devid,
@@ -7841,12 +7918,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
                        struct btrfs_ioctl_get_dev_stats *stats)
 {
+       BTRFS_DEV_LOOKUP_ARGS(args);
        struct btrfs_device *dev;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        int i;
 
        mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
+       args.devid = stats->devid;
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
        mutex_unlock(&fs_devices->device_list_mutex);
 
        if (!dev) {
@@ -7922,6 +8001,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
                                 u64 chunk_offset, u64 devid,
                                 u64 physical_offset, u64 physical_len)
 {
+       struct btrfs_dev_lookup_args args = { .devid = devid };
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
@@ -7977,7 +8057,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
        }
 
        /* Make sure no dev extent is beyond device boundary */
-       dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info->fs_devices, &args);
        if (!dev) {
                btrfs_err(fs_info, "failed to find devid %llu", devid);
                ret = -EUCLEAN;
index 2183361..3b81306 100644
@@ -236,17 +236,40 @@ struct btrfs_fs_devices {
        bool fsid_change;
        struct list_head fs_list;
 
+       /*
+        * Number of devices under this fsid, including missing and
+        * replace-target devices but excluding seed devices.
+        */
        u64 num_devices;
+
+       /*
+        * The number of devices that were successfully opened, including
+        * the replace-target device and excluding seed devices.
+        */
        u64 open_devices;
+
+       /* The number of devices that are on the chunk allocation list. */
        u64 rw_devices;
+
+       /* Count of missing devices under this fsid, excluding seed devices. */
        u64 missing_devices;
        u64 total_rw_bytes;
+
+       /*
+        * Count of devices from btrfs_super_block::num_devices for this fsid,
+        * which includes the seed device and excludes the transient
+        * replace-target device.
+        */
        u64 total_devices;
 
        /* Highest generation number of seen devices */
        u64 latest_generation;
 
-       struct block_device *latest_bdev;
+       /*
+        * The device used at mount time, or the device with the highest
+        * generation after a device removal or replace.
+        */
+       struct btrfs_device *latest_dev;
 
        /* all of the devices in the FS, protected by a mutex
         * so we can safely walk it to write out the supers without
@@ -300,48 +323,62 @@ struct btrfs_fs_devices {
                                / sizeof(struct btrfs_stripe) + 1)
 
 /*
- * we need the mirror number and stripe index to be passed around
- * the call chain while we are processing end_io (especially errors).
- * Really, what we need is a btrfs_bio structure that has this info
- * and is properly sized with its stripe array, but we're not there
- * quite yet.  We have our own btrfs bioset, and all of the bios
- * we allocate are actually btrfs_io_bios.  We'll cram as much of
- * struct btrfs_bio as we can into this over time.
+ * Additional info to pass along with a bio.
+ *
+ * Mostly for btrfs-specific features like csum and mirror_num.
  */
-struct btrfs_io_bio {
+struct btrfs_bio {
        unsigned int mirror_num;
+
+       /* @device is for stripe IO submission. */
        struct btrfs_device *device;
-       u64 logical;
        u8 *csum;
        u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
        struct bvec_iter iter;
+
        /*
         * This member must come last, bio_alloc_bioset will allocate enough
-        * bytes for entire btrfs_io_bio but relies on bio being last.
+        * bytes for entire btrfs_bio but relies on bio being last.
         */
        struct bio bio;
 };
 
-static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
 {
-       return container_of(bio, struct btrfs_io_bio, bio);
+       return container_of(bio, struct btrfs_bio, bio);
 }
 
-static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio)
+static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
 {
-       if (io_bio->csum != io_bio->csum_inline) {
-               kfree(io_bio->csum);
-               io_bio->csum = NULL;
+       if (bbio->csum != bbio->csum_inline) {
+               kfree(bbio->csum);
+               bbio->csum = NULL;
        }
 }
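For orientation: the per-bio container formerly named btrfs_io_bio is now struct btrfs_bio, and the stripe/mirror context formerly named btrfs_bio becomes struct btrfs_io_context below. A minimal, illustrative sketch of how an end_io handler reaches the per-bio btrfs data after this rename (the handler name is made up and headers are omitted):

static void example_end_io(struct bio *bio)
{
        struct btrfs_bio *bbio = btrfs_bio(bio);        /* was btrfs_io_bio(bio) */

        /* mirror_num and the checksum buffer now live in struct btrfs_bio */
        pr_debug("completed mirror %u\n", bbio->mirror_num);

        btrfs_bio_free_csum(bbio);                      /* was btrfs_io_bio_free_csum() */
        bio_put(bio);
}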
 
-struct btrfs_bio_stripe {
+struct btrfs_io_stripe {
        struct btrfs_device *dev;
        u64 physical;
        u64 length; /* only used for discard mappings */
 };
 
-struct btrfs_bio {
+/*
+ * Context for IO submission to a device stripe.
+ *
+ * - Track the unfinished mirrors for mirror-based profiles.
+ *   Mirror-based profiles are SINGLE/DUP/RAID1/RAID10.
+ *
+ * - Contain the logical -> physical mapping info.
+ *   Used by submit_stripe_bio() to map a logical bio to a
+ *   physical device address.
+ *
+ * - Contain device replace info.
+ *   Used by handle_ops_on_dev_replace() to copy logical bios
+ *   onto the new device.
+ *
+ * - Contain RAID56 full stripe logical bytenrs
+ */
+struct btrfs_io_context {
        refcount_t refs;
        atomic_t stripes_pending;
        struct btrfs_fs_info *fs_info;
@@ -361,7 +398,7 @@ struct btrfs_bio {
         * so raid_map[0] is the start of our full stripe
         */
        u64 *raid_map;
-       struct btrfs_bio_stripe stripes[];
+       struct btrfs_io_stripe stripes[];
 };
 
 struct btrfs_device_info {
@@ -396,11 +433,11 @@ struct map_lookup {
        int num_stripes;
        int sub_stripes;
        int verified_stripes; /* For mount time dev extent verification */
-       struct btrfs_bio_stripe stripes[];
+       struct btrfs_io_stripe stripes[];
 };
 
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
-                           (sizeof(struct btrfs_bio_stripe) * (n)))
+                           (sizeof(struct btrfs_io_stripe) * (n)))
 
 struct btrfs_balance_args;
 struct btrfs_balance_progress;
@@ -414,6 +451,22 @@ struct btrfs_balance_control {
        struct btrfs_balance_progress stat;
 };
 
+/*
+ * Arguments for a device lookup; only the parameters that are set are matched.
+ */
+struct btrfs_dev_lookup_args {
+       u64 devid;
+       u8 *uuid;
+       u8 *fsid;
+       bool missing;
+};
+
+/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */
+#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 }
+
+#define BTRFS_DEV_LOOKUP_ARGS(name) \
+       struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT
+
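A minimal sketch of the lookup-by-args pattern introduced here, mirroring the callers converted above; the function name and devid value are made up, and leaving uuid/fsid NULL means those fields are not matched:

static struct btrfs_device *example_lookup(struct btrfs_fs_devices *fs_devices)
{
        BTRFS_DEV_LOOKUP_ARGS(args);    /* devid preset to (u64)-1 */

        args.devid = 1;                 /* hypothetical devid to look up */
        /* args.uuid, args.fsid and args.missing stay unset: match any */

        return btrfs_find_device(fs_devices, &args);
}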
 enum btrfs_map_op {
        BTRFS_MAP_READ,
        BTRFS_MAP_WRITE,
@@ -437,20 +490,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
        }
 }
 
-void btrfs_get_bbio(struct btrfs_bio *bbio);
-void btrfs_put_bbio(struct btrfs_bio *bbio);
+void btrfs_get_bioc(struct btrfs_io_context *bioc);
+void btrfs_put_bioc(struct btrfs_io_context *bioc);
 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                    u64 logical, u64 *length,
-                   struct btrfs_bio **bbio_ret, int mirror_num);
+                   struct btrfs_io_context **bioc_ret, int mirror_num);
 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                     u64 logical, u64 *length,
-                    struct btrfs_bio **bbio_ret);
+                    struct btrfs_io_context **bioc_ret);
 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
                          enum btrfs_map_op op, u64 logical,
                          struct btrfs_io_geometry *io_geom);
 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
                                            u64 type);
 void btrfs_mapping_tree_free(struct extent_map_tree *tree);
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
@@ -467,19 +520,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
 struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
                                                  u64 devid,
                                                  const char *devpath);
+int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
+                                struct btrfs_dev_lookup_args *args,
+                                const char *path);
 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
                                        const u64 *devid,
                                        const u8 *uuid);
+void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
 void btrfs_free_device(struct btrfs_device *device);
 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
-                   const char *device_path, u64 devid,
+                   struct btrfs_dev_lookup_args *args,
                    struct block_device **bdev, fmode_t *mode);
 void __exit btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
                      struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
-                                      u64 devid, u8 *uuid, u8 *fsid);
+struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
+                                      const struct btrfs_dev_lookup_args *args);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
 int btrfs_balance(struct btrfs_fs_info *fs_info,
@@ -493,7 +550,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_uuid_scan_kthread(void *data);
-int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);
 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
index 8a45142..2837b4c 100644 (file)
@@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
                 * matches our target xattr, so lets check.
                 */
                ret = 0;
-               btrfs_assert_tree_locked(path->nodes[0]);
+               btrfs_assert_tree_write_locked(path->nodes[0]);
                di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
                if (!di && !(flags & XATTR_REPLACE)) {
                        ret = -ENOSPC;
index 47af1ab..67d932d 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/sched/mm.h>
+#include <linux/atomic.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "zoned.h"
 #define BTRFS_NR_SB_LOG_ZONES 2
 
 /*
+ * Minimum number of active zones we need:
+ *
+ * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
+ * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
+ * - 1 zone for tree-log dedicated block group
+ * - 1 zone for relocation
+ */
+#define BTRFS_MIN_ACTIVE_ZONES         (BTRFS_SUPER_MIRROR_MAX + 5)
+
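With BTRFS_SUPER_MIRROR_MAX being 3, this works out to a minimum of 8 active zones per zoned device.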
+/*
  * Maximum supported zone size. Currently, SMR disks have a zone size of
  * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
  * expect the zone size to become larger than 8GiB in the near future.
  */
 #define BTRFS_MAX_ZONE_SIZE            SZ_8G
 
+#define SUPER_INFO_SECTORS     ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
+
+static inline bool sb_zone_is_full(const struct blk_zone *zone)
+{
+       return (zone->cond == BLK_ZONE_COND_FULL) ||
+               (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
+}
+
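Since BTRFS_SUPER_INFO_SIZE is 4 KiB, SUPER_INFO_SECTORS is 8, so sb_zone_is_full() also treats a superblock log zone as full once fewer than 8 sectors of its capacity remain writable.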
 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
 {
        struct blk_zone *zones = data;
@@ -60,14 +79,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
        bool empty[BTRFS_NR_SB_LOG_ZONES];
        bool full[BTRFS_NR_SB_LOG_ZONES];
        sector_t sector;
+       int i;
 
-       ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
-              zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
-
-       empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
-       empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
-       full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
-       full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
+       for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+               ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
+               empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
+               full[i] = sb_zone_is_full(&zones[i]);
+       }
 
        /*
         * Possible states of log buffer zones
@@ -296,6 +314,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_zoned_device_info *zone_info = NULL;
        struct block_device *bdev = device->bdev;
+       struct request_queue *queue = bdev_get_queue(bdev);
+       unsigned int max_active_zones;
+       unsigned int nactive;
        sector_t nr_sectors;
        sector_t sector = 0;
        struct blk_zone *zones = NULL;
@@ -351,6 +372,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
        if (!IS_ALIGNED(nr_sectors, zone_sectors))
                zone_info->nr_zones++;
 
+       max_active_zones = queue_max_active_zones(queue);
+       if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
+               btrfs_err_in_rcu(fs_info,
+"zoned: %s: max active zones %u is too small, need at least %u active zones",
+                                rcu_str_deref(device->name), max_active_zones,
+                                BTRFS_MIN_ACTIVE_ZONES);
+               ret = -EINVAL;
+               goto out;
+       }
+       zone_info->max_active_zones = max_active_zones;
+
        zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->seq_zones) {
                ret = -ENOMEM;
@@ -363,6 +395,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
                goto out;
        }
 
+       zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
+       if (!zone_info->active_zones) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
        if (!zones) {
                ret = -ENOMEM;
@@ -370,6 +408,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
        }
 
        /* Get zones type */
+       nactive = 0;
        while (sector < nr_sectors) {
                nr_zones = BTRFS_REPORT_NR_ZONES;
                ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
@@ -380,8 +419,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
                for (i = 0; i < nr_zones; i++) {
                        if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
                                __set_bit(nreported, zone_info->seq_zones);
-                       if (zones[i].cond == BLK_ZONE_COND_EMPTY)
+                       switch (zones[i].cond) {
+                       case BLK_ZONE_COND_EMPTY:
                                __set_bit(nreported, zone_info->empty_zones);
+                               break;
+                       case BLK_ZONE_COND_IMP_OPEN:
+                       case BLK_ZONE_COND_EXP_OPEN:
+                       case BLK_ZONE_COND_CLOSED:
+                               __set_bit(nreported, zone_info->active_zones);
+                               nactive++;
+                               break;
+                       }
                        nreported++;
                }
                sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
@@ -396,6 +444,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
                goto out;
        }
 
+       if (max_active_zones) {
+               if (nactive > max_active_zones) {
+                       btrfs_err_in_rcu(device->fs_info,
+                       "zoned: %u active zones on %s exceeds max_active_zones %u",
+                                        nactive, rcu_str_deref(device->name),
+                                        max_active_zones);
+                       ret = -EIO;
+                       goto out;
+               }
+               atomic_set(&zone_info->active_zones_left,
+                          max_active_zones - nactive);
+       }
+
        /* Validate superblock log */
        nr_zones = BTRFS_NR_SB_LOG_ZONES;
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -478,6 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
 out:
        kfree(zones);
 out_free_zone_info:
+       bitmap_free(zone_info->active_zones);
        bitmap_free(zone_info->empty_zones);
        bitmap_free(zone_info->seq_zones);
        kfree(zone_info);
@@ -493,6 +555,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
        if (!zone_info)
                return;
 
+       bitmap_free(zone_info->active_zones);
        bitmap_free(zone_info->seq_zones);
        bitmap_free(zone_info->empty_zones);
        kfree(zone_info);
@@ -585,7 +648,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 
        /*
         * stripe_size is always aligned to BTRFS_STRIPE_LEN in
-        * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
+        * btrfs_create_chunk(). Since we want stripe_len == zone_size,
         * check the alignment here.
         */
        if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
@@ -664,7 +727,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
                        reset = &zones[1];
 
                if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
-                       ASSERT(reset->cond == BLK_ZONE_COND_FULL);
+                       ASSERT(sb_zone_is_full(reset));
 
                        ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
                                               reset->start, reset->len,
@@ -676,9 +739,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
                        reset->wp = reset->start;
                }
        } else if (ret != -ENOENT) {
-               /* For READ, we want the precious one */
+               /*
+                * For READ, we want the previous one. Move the write pointer to
+                * the end of a zone if it is currently at the head of a zone.
+                */
+               u64 zone_end = 0;
+
                if (wp == zones[0].start << SECTOR_SHIFT)
-                       wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
+                       zone_end = zones[1].start + zones[1].capacity;
+               else if (wp == zones[1].start << SECTOR_SHIFT)
+                       zone_end = zones[0].start + zones[0].capacity;
+               if (zone_end)
+                       wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
+                                       BTRFS_SUPER_INFO_SIZE);
+
                wp -= BTRFS_SUPER_INFO_SIZE;
        }
 
@@ -771,36 +845,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
        return true;
 }
 
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 {
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        struct blk_zone *zone;
+       int i;
 
        if (!is_sb_log_zone(zinfo, mirror))
-               return;
+               return 0;
 
        zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
-       if (zone->cond != BLK_ZONE_COND_FULL) {
+       for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
+               /* Advance to the next zone */
+               if (zone->cond == BLK_ZONE_COND_FULL) {
+                       zone++;
+                       continue;
+               }
+
                if (zone->cond == BLK_ZONE_COND_EMPTY)
                        zone->cond = BLK_ZONE_COND_IMP_OPEN;
 
-               zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
+               zone->wp += SUPER_INFO_SECTORS;
+
+               if (sb_zone_is_full(zone)) {
+                       /*
+                        * No room left to write a new superblock. Since the
+                        * superblock is written with REQ_SYNC, it is safe to
+                        * finish the zone now.
+                        *
+                        * If the write pointer is exactly at the capacity, an
+                        * explicit ZONE_FINISH is not necessary.
+                        */
+                       if (zone->wp != zone->start + zone->capacity) {
+                               int ret;
+
+                               ret = blkdev_zone_mgmt(device->bdev,
+                                               REQ_OP_ZONE_FINISH, zone->start,
+                                               zone->len, GFP_NOFS);
+                               if (ret)
+                                       return ret;
+                       }
 
-               if (zone->wp == zone->start + zone->len)
+                       zone->wp = zone->start + zone->len;
                        zone->cond = BLK_ZONE_COND_FULL;
-
-               return;
+               }
+               return 0;
        }
 
-       zone++;
-       ASSERT(zone->cond != BLK_ZONE_COND_FULL);
-       if (zone->cond == BLK_ZONE_COND_EMPTY)
-               zone->cond = BLK_ZONE_COND_IMP_OPEN;
-
-       zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
-
-       if (zone->wp == zone->start + zone->len)
-               zone->cond = BLK_ZONE_COND_FULL;
+       /* All the zones are FULL. Should not reach here. */
+       ASSERT(0);
+       return -EIO;
 }
 
 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
@@ -895,6 +989,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
        return pos;
 }
 
+static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
+{
+       struct btrfs_zoned_device_info *zone_info = device->zone_info;
+       unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+       /* We can use any number of zones */
+       if (zone_info->max_active_zones == 0)
+               return true;
+
+       if (!test_bit(zno, zone_info->active_zones)) {
+               /* Is there an active zone still available? */
+               if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
+                       return false;
+               if (test_and_set_bit(zno, zone_info->active_zones)) {
+                       /* Someone already set the bit */
+                       atomic_inc(&zone_info->active_zones_left);
+               }
+       }
+
+       return true;
+}
+
+static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
+{
+       struct btrfs_zoned_device_info *zone_info = device->zone_info;
+       unsigned int zno = (pos >> zone_info->zone_size_shift);
+
+       /* We can use any number of zones */
+       if (zone_info->max_active_zones == 0)
+               return;
+
+       if (test_and_clear_bit(zno, zone_info->active_zones))
+               atomic_inc(&zone_info->active_zones_left);
+}
+
 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
                            u64 length, u64 *bytes)
 {
@@ -910,6 +1039,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
        *bytes = length;
        while (length) {
                btrfs_dev_set_zone_empty(device, physical);
+               btrfs_dev_clear_active_zone(device, physical);
                physical += device->zone_info->zone_size;
                length -= device->zone_info->zone_size;
        }
@@ -1039,6 +1169,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
        int i;
        unsigned int nofs_flag;
        u64 *alloc_offsets = NULL;
+       u64 *caps = NULL;
+       unsigned long *active = NULL;
        u64 last_alloc = 0;
        u32 num_sequential = 0, num_conventional = 0;
 
@@ -1063,10 +1195,28 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 
        map = em->map_lookup;
 
+       cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+       if (!cache->physical_map) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
        if (!alloc_offsets) {
-               free_extent_map(em);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
+       if (!caps) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
+       if (!active) {
+               ret = -ENOMEM;
+               goto out;
        }
 
        for (i = 0; i < map->num_stripes; i++) {
@@ -1131,6 +1281,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                        goto out;
                }
 
+               caps[i] = (zone.capacity << SECTOR_SHIFT);
+
                switch (zone.cond) {
                case BLK_ZONE_COND_OFFLINE:
                case BLK_ZONE_COND_READONLY:
@@ -1144,14 +1296,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                        alloc_offsets[i] = 0;
                        break;
                case BLK_ZONE_COND_FULL:
-                       alloc_offsets[i] = fs_info->zone_size;
+                       alloc_offsets[i] = caps[i];
                        break;
                default:
                        /* Partially used zone */
                        alloc_offsets[i] =
                                        ((zone.wp - zone.start) << SECTOR_SHIFT);
+                       __set_bit(i, active);
                        break;
                }
+
+               /*
+                * Consider a zone as active if the device allows an unlimited
+                * number of active zones.
+                */
+               if (!device->zone_info->max_active_zones)
+                       __set_bit(i, active);
        }
 
        if (num_sequential > 0)
@@ -1169,6 +1329,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                 * calculate_alloc_pointer() which takes extent buffer
                 * locks to avoid deadlock.
                 */
+
+               /* Zone capacity always equals zone size in emulated zoned mode */
+               cache->zone_capacity = cache->length;
                if (new) {
                        cache->alloc_offset = 0;
                        goto out;
@@ -1195,6 +1358,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                        goto out;
                }
                cache->alloc_offset = alloc_offsets[0];
+               cache->zone_capacity = caps[0];
+               cache->zone_is_active = test_bit(0, active);
                break;
        case BTRFS_BLOCK_GROUP_DUP:
        case BTRFS_BLOCK_GROUP_RAID1:
@@ -1210,6 +1375,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                goto out;
        }
 
+       if (cache->zone_is_active) {
+               btrfs_get_block_group(cache);
+               spin_lock(&fs_info->zone_active_bgs_lock);
+               list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
+               spin_unlock(&fs_info->zone_active_bgs_lock);
+       }
+
 out:
        if (cache->alloc_offset > fs_info->zone_size) {
                btrfs_err(fs_info,
@@ -1218,6 +1390,14 @@ out:
                ret = -EIO;
        }
 
+       if (cache->alloc_offset > cache->zone_capacity) {
+               btrfs_err(fs_info,
+"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
+                         cache->alloc_offset, cache->zone_capacity,
+                         cache->start);
+               ret = -EIO;
+       }
+
        /* An extent is allocated after the write pointer */
        if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
                btrfs_err(fs_info,
@@ -1229,6 +1409,12 @@ out:
        if (!ret)
                cache->meta_write_pointer = cache->alloc_offset + cache->start;
 
+       if (ret) {
+               kfree(cache->physical_map);
+               cache->physical_map = NULL;
+       }
+       bitmap_free(active);
+       kfree(caps);
        kfree(alloc_offsets);
        free_extent_map(em);
 
@@ -1243,17 +1429,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
                return;
 
        WARN_ON(cache->bytes_super != 0);
-       unusable = cache->alloc_offset - cache->used;
-       free = cache->length - cache->alloc_offset;
+       unusable = (cache->alloc_offset - cache->used) +
+                  (cache->length - cache->zone_capacity);
+       free = cache->zone_capacity - cache->alloc_offset;
 
        /* We only need ->free_space in ALLOC_SEQ block groups */
        cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
        cache->free_space_ctl->free_space = free;
        cache->zone_unusable = unusable;
-
-       /* Should not have any excluded extents. Just in case, though */
-       btrfs_free_excluded_extents(cache);
 }
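A worked example of the new accounting with made-up numbers: for a 256 MiB zone with 192 MiB of usable capacity, an alloc_offset of 100 MiB and 80 MiB used, unusable = (100 - 80) + (256 - 192) = 84 MiB and free = 192 - 100 = 92 MiB; the 64 MiB between zone capacity and zone size is no longer reported as free space.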
 
 void btrfs_redirty_list_add(struct btrfs_transaction *trans,
@@ -1304,6 +1488,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
        if (!is_data_inode(&inode->vfs_inode))
                return false;
 
+       /*
+        * Using REQ_OP_ZONE_APPEND for relocation can break the assumptions
+        * the relocation code makes about the extent layout.
+        * Furthermore, we have set aside a dedicated block group from which
+        * only the relocation "process" can allocate, and we make sure only
+        * one process at a time can add pages to an extent that gets
+        * relocated, so it's safe to use regular REQ_OP_WRITE for this
+        * special case.
+        */
+       if (btrfs_is_data_reloc_root(inode->root))
+               return false;
+
        cache = btrfs_lookup_block_group(fs_info, start);
        ASSERT(cache);
        if (!cache)
@@ -1440,27 +1635,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len
 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
                          struct blk_zone *zone)
 {
-       struct btrfs_bio *bbio = NULL;
+       struct btrfs_io_context *bioc = NULL;
        u64 mapped_length = PAGE_SIZE;
        unsigned int nofs_flag;
        int nmirrors;
        int i, ret;
 
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
-                              &mapped_length, &bbio);
-       if (ret || !bbio || mapped_length < PAGE_SIZE) {
-               btrfs_put_bbio(bbio);
+                              &mapped_length, &bioc);
+       if (ret || !bioc || mapped_length < PAGE_SIZE) {
+               btrfs_put_bioc(bioc);
                return -EIO;
        }
 
-       if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+       if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                return -EINVAL;
 
        nofs_flag = memalloc_nofs_save();
-       nmirrors = (int)bbio->num_stripes;
+       nmirrors = (int)bioc->num_stripes;
        for (i = 0; i < nmirrors; i++) {
-               u64 physical = bbio->stripes[i].physical;
-               struct btrfs_device *dev = bbio->stripes[i].dev;
+               u64 physical = bioc->stripes[i].physical;
+               struct btrfs_device *dev = bioc->stripes[i].dev;
 
                /* Missing device */
                if (!dev->bdev)
@@ -1530,3 +1725,251 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
 
        return device;
 }
+
+/**
+ * btrfs_zone_activate - activate block group and underlying device zones
+ *
+ * @block_group: the block group to activate
+ *
+ * Return: true on success, false otherwise
+ */
+bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+       struct btrfs_fs_info *fs_info = block_group->fs_info;
+       struct map_lookup *map;
+       struct btrfs_device *device;
+       u64 physical;
+       bool ret;
+
+       if (!btrfs_is_zoned(block_group->fs_info))
+               return true;
+
+       map = block_group->physical_map;
+       /* Currently support SINGLE profile only */
+       ASSERT(map->num_stripes == 1);
+       device = map->stripes[0].dev;
+       physical = map->stripes[0].physical;
+
+       if (device->zone_info->max_active_zones == 0)
+               return true;
+
+       spin_lock(&block_group->lock);
+
+       if (block_group->zone_is_active) {
+               ret = true;
+               goto out_unlock;
+       }
+
+       /* No space left */
+       if (block_group->alloc_offset == block_group->zone_capacity) {
+               ret = false;
+               goto out_unlock;
+       }
+
+       if (!btrfs_dev_set_active_zone(device, physical)) {
+               /* Cannot activate the zone */
+               ret = false;
+               goto out_unlock;
+       }
+
+       /* Successfully activated all the zones */
+       block_group->zone_is_active = 1;
+
+       spin_unlock(&block_group->lock);
+
+       /* For the active block group list */
+       btrfs_get_block_group(block_group);
+
+       spin_lock(&fs_info->zone_active_bgs_lock);
+       ASSERT(list_empty(&block_group->active_bg_list));
+       list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
+       spin_unlock(&fs_info->zone_active_bgs_lock);
+
+       return true;
+
+out_unlock:
+       spin_unlock(&block_group->lock);
+       return ret;
+}
+
+int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+       struct btrfs_fs_info *fs_info = block_group->fs_info;
+       struct map_lookup *map;
+       struct btrfs_device *device;
+       u64 physical;
+       int ret = 0;
+
+       if (!btrfs_is_zoned(fs_info))
+               return 0;
+
+       map = block_group->physical_map;
+       /* Currently support SINGLE profile only */
+       ASSERT(map->num_stripes == 1);
+
+       device = map->stripes[0].dev;
+       physical = map->stripes[0].physical;
+
+       if (device->zone_info->max_active_zones == 0)
+               return 0;
+
+       spin_lock(&block_group->lock);
+       if (!block_group->zone_is_active) {
+               spin_unlock(&block_group->lock);
+               return 0;
+       }
+
+       /* Check if we have unwritten allocated space */
+       if ((block_group->flags &
+            (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
+           block_group->alloc_offset > block_group->meta_write_pointer) {
+               spin_unlock(&block_group->lock);
+               return -EAGAIN;
+       }
+       spin_unlock(&block_group->lock);
+
+       ret = btrfs_inc_block_group_ro(block_group, false);
+       if (ret)
+               return ret;
+
+       /* Ensure all writes in this block group finish */
+       btrfs_wait_block_group_reservations(block_group);
+       /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
+       btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
+                                block_group->length);
+
+       spin_lock(&block_group->lock);
+
+       /*
+        * Bail out if someone already deactivated the block group, or if
+        * reserved space is still left in the block group.
+        */
+       if (!block_group->zone_is_active) {
+               spin_unlock(&block_group->lock);
+               btrfs_dec_block_group_ro(block_group);
+               return 0;
+       }
+
+       if (block_group->reserved) {
+               spin_unlock(&block_group->lock);
+               btrfs_dec_block_group_ro(block_group);
+               return -EAGAIN;
+       }
+
+       block_group->zone_is_active = 0;
+       block_group->alloc_offset = block_group->zone_capacity;
+       block_group->free_space_ctl->free_space = 0;
+       btrfs_clear_treelog_bg(block_group);
+       spin_unlock(&block_group->lock);
+
+       ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+                              physical >> SECTOR_SHIFT,
+                              device->zone_info->zone_size >> SECTOR_SHIFT,
+                              GFP_NOFS);
+       btrfs_dec_block_group_ro(block_group);
+
+       if (!ret) {
+               btrfs_dev_clear_active_zone(device, physical);
+
+               spin_lock(&fs_info->zone_active_bgs_lock);
+               ASSERT(!list_empty(&block_group->active_bg_list));
+               list_del_init(&block_group->active_bg_list);
+               spin_unlock(&fs_info->zone_active_bgs_lock);
+
+               /* For active_bg_list */
+               btrfs_put_block_group(block_group);
+       }
+
+       return ret;
+}
+
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
+{
+       struct btrfs_device *device;
+       bool ret = false;
+
+       if (!btrfs_is_zoned(fs_devices->fs_info))
+               return true;
+
+       /* Non-single profiles are not supported yet */
+       if (raid_index != BTRFS_RAID_SINGLE)
+               return false;
+
+       /* Check if there is a device with active zones left */
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+               if (!device->bdev)
+                       continue;
+
+               if (!zinfo->max_active_zones ||
+                   atomic_read(&zinfo->active_zones_left)) {
+                       ret = true;
+                       break;
+               }
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+       return ret;
+}
+
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
+{
+       struct btrfs_block_group *block_group;
+       struct map_lookup *map;
+       struct btrfs_device *device;
+       u64 physical;
+
+       if (!btrfs_is_zoned(fs_info))
+               return;
+
+       block_group = btrfs_lookup_block_group(fs_info, logical);
+       ASSERT(block_group);
+
+       if (logical + length < block_group->start + block_group->zone_capacity)
+               goto out;
+
+       spin_lock(&block_group->lock);
+
+       if (!block_group->zone_is_active) {
+               spin_unlock(&block_group->lock);
+               goto out;
+       }
+
+       block_group->zone_is_active = 0;
+       /* We should have consumed all the free space */
+       ASSERT(block_group->alloc_offset == block_group->zone_capacity);
+       ASSERT(block_group->free_space_ctl->free_space == 0);
+       btrfs_clear_treelog_bg(block_group);
+       spin_unlock(&block_group->lock);
+
+       map = block_group->physical_map;
+       device = map->stripes[0].dev;
+       physical = map->stripes[0].physical;
+
+       if (!device->zone_info->max_active_zones)
+               goto out;
+
+       btrfs_dev_clear_active_zone(device, physical);
+
+       spin_lock(&fs_info->zone_active_bgs_lock);
+       ASSERT(!list_empty(&block_group->active_bg_list));
+       list_del_init(&block_group->active_bg_list);
+       spin_unlock(&fs_info->zone_active_bgs_lock);
+
+       btrfs_put_block_group(block_group);
+
+out:
+       btrfs_put_block_group(block_group);
+}
+
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
+{
+       struct btrfs_fs_info *fs_info = bg->fs_info;
+
+       spin_lock(&fs_info->relocation_bg_lock);
+       if (fs_info->data_reloc_bg == bg->start)
+               fs_info->data_reloc_bg = 0;
+       spin_unlock(&fs_info->relocation_bg_lock);
+}
index 4b29970..e53ab7b 100644 (file)
@@ -23,8 +23,11 @@ struct btrfs_zoned_device_info {
        u64 zone_size;
        u8  zone_size_shift;
        u32 nr_zones;
+       unsigned int max_active_zones;
+       atomic_t active_zones_left;
        unsigned long *seq_zones;
        unsigned long *empty_zones;
+       unsigned long *active_zones;
        struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
 };
 
@@ -40,7 +43,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
                               u64 *bytenr_ret);
 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
                          u64 *bytenr_ret);
-void btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
+int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
                                 u64 hole_end, u64 num_bytes);
@@ -66,6 +69,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
                                  u64 physical_start, u64 physical_pos);
 struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
                                            u64 logical, u64 length);
+bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+int btrfs_zone_finish(struct btrfs_block_group *block_group);
+bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+                            int raid_index);
+void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+                            u64 length);
+void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                                     struct blk_zone *zone)
@@ -113,8 +123,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror,
        return 0;
 }
 
-static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
-{ }
+static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
+{
+       return 0;
+}
 
 static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 {
@@ -199,6 +211,27 @@ static inline struct btrfs_device *btrfs_zoned_get_device(
        return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+{
+       return true;
+}
+
+static inline int btrfs_zone_finish(struct btrfs_block_group *block_group)
+{
+       return 0;
+}
+
+static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+                                          int raid_index)
+{
+       return true;
+}
+
+static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
+                                          u64 logical, u64 length) { }
+
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
 #endif
 
 static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
index ed0cab8..9abc88d 100644 (file)
@@ -1782,12 +1782,13 @@ EXPORT_SYMBOL(generic_update_time);
  * This does the actual work of updating an inodes time or version.  Must have
  * had called mnt_want_write() before calling this.
  */
-static int update_time(struct inode *inode, struct timespec64 *time, int flags)
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
 {
        if (inode->i_op->update_time)
                return inode->i_op->update_time(inode, time, flags);
        return generic_update_time(inode, time, flags);
 }
+EXPORT_SYMBOL(inode_update_time);
 
 /**
  *     atime_needs_update      -       update the access time
@@ -1857,7 +1858,7 @@ void touch_atime(const struct path *path)
         * of the fs read only, e.g. subvolumes in Btrfs.
         */
        now = current_time(inode);
-       update_time(inode, &now, S_ATIME);
+       inode_update_time(inode, &now, S_ATIME);
        __mnt_drop_write(mnt);
 skip_update:
        sb_end_write(inode->i_sb);
@@ -2002,7 +2003,7 @@ int file_update_time(struct file *file)
        if (__mnt_want_write_file(file))
                return 0;
 
-       ret = update_time(inode, &now, sync_it);
+       ret = inode_update_time(inode, &now, sync_it);
        __mnt_drop_write_file(file);
 
        return ret;
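A minimal sketch of an in-kernel caller using the newly exported helper to bump timestamps on an inode it already holds; the function name is made up, and taking write access and any needed locks is the caller's responsibility:

static int example_touch_inode(struct inode *inode)
{
        struct timespec64 now = current_time(inode);

        return inode_update_time(inode, &now, S_MTIME | S_CTIME);
}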
index 0dcb902..f3cfca5 100644 (file)
@@ -2496,6 +2496,8 @@ enum file_time_flags {
 
 extern bool atime_needs_update(const struct path *, struct inode *);
 extern void touch_atime(const struct path *);
+int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
+
 static inline void file_accessed(struct file *file)
 {
        if (!(file->f_flags & O_NOATIME))
index d7d3cfe..7386199 100644 (file)
@@ -771,10 +771,16 @@ struct btrfs_ioctl_received_subvol_args {
  */
 #define BTRFS_SEND_FLAG_OMIT_END_CMD           0x4
 
+/*
+ * Read the protocol version from the version field in the structure
+ */
+#define BTRFS_SEND_FLAG_VERSION                        0x8
+
 #define BTRFS_SEND_FLAG_MASK \
        (BTRFS_SEND_FLAG_NO_FILE_DATA | \
         BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \
-        BTRFS_SEND_FLAG_OMIT_END_CMD)
+        BTRFS_SEND_FLAG_OMIT_END_CMD | \
+        BTRFS_SEND_FLAG_VERSION)
 
 struct btrfs_ioctl_send_args {
        __s64 send_fd;                  /* in */
@@ -782,7 +788,8 @@ struct btrfs_ioctl_send_args {
        __u64 __user *clone_sources;    /* in */
        __u64 parent_root;              /* in */
        __u64 flags;                    /* in */
-       __u64 reserved[4];              /* in */
+       __u32 version;                  /* in */
+       __u8  reserved[28];             /* in */
 };
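A hypothetical userspace sketch of how the new flag and field would be used once a stream version beyond 1 exists; the version number 2 is an assumption, and the kernel is expected to reject versions it does not support:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int start_send(int subvol_fd, int pipe_fd)
{
        struct btrfs_ioctl_send_args args;

        memset(&args, 0, sizeof(args));
        args.send_fd = pipe_fd;
        args.flags = BTRFS_SEND_FLAG_VERSION;   /* version field is valid */
        args.version = 2;                       /* hypothetical v2 stream */

        return ioctl(subvol_fd, BTRFS_IOC_SEND, &args);
}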
 
 /*