Merge tag 'for-6.1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 7 Oct 2022 00:36:48 +0000 (17:36 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 7 Oct 2022 00:36:48 +0000 (17:36 -0700)
Pull btrfs updates from David Sterba:
 "There's a bunch of performance improvements, most notably the FIEMAP
  speedup, the new block group tree to speed up mount on large
  filesystems, more io_uring integration, some sysfs exports and the
  usual fixes and core updates.

  Summary:

  Performance:

   - outstanding FIEMAP speed improvement (a minimal userspace FIEMAP
     caller is sketched after this list for reference)
      - algorithmic change in how extents are enumerated leads to an
        orders-of-magnitude speed boost (uncached and cached)
      - extent sharing check speedup (2.2x uncached, 3x cached)
      - add more cancellation points, allowing seeks in files with a
        large number of extents to be interrupted
      - more efficient hole and data seeking (4x uncached, 1.3x cached)
      - sample results:
    256M, 32K extents:   4s ->  29ms  (~150x)
    512M, 64K extents:  30s ->  59ms  (~550x)
    1G,  128K extents: 225s -> 120ms (~1800x)

   - improved inode logging, especially for directories (on dbench
     workload throughput +25%, max latency -21%)

   - improved buffered IO: removed redundant extent state tracking,
     lowering memory consumption and avoiding rb tree traversals

   - add a sysfs tunable to let qgroups temporarily skip exact accounting
     when deleting a snapshot, leading to a speedup but requiring a
     rescan afterwards; this will be used by snapper

   - support buffered writes through io_uring; until now this worked
     only for direct IO, but with the no-wait semantics implemented in
     the buffered write path it now works too, improving IOPS (2x),
     throughput (2.2x) and latency (workload dependent, 2x to 150x); a
     minimal userspace sketch follows the summary below

   - small performance improvements when dropping and searching for
     extent maps as well as when flushing delalloc in COW mode
     (throughput +5MB/s)
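
   For reference, the interface being sped up here is the FS_IOC_FIEMAP
   ioctl. A minimal userspace caller, illustrative only and not part of
   this series (it reports at most the first 32 extents; a real caller
   loops until it sees FIEMAP_EXTENT_LAST), could look like this:

	/* fiemap-demo.c: list the extents of a file via FS_IOC_FIEMAP */
	#include <fcntl.h>
	#include <linux/fiemap.h>
	#include <linux/fs.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	#define DEMO_EXTENTS 32

	int main(int argc, char **argv)
	{
		struct fiemap *fm;
		unsigned int i;
		int fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;

		fm = calloc(1, sizeof(*fm) +
			       DEMO_EXTENTS * sizeof(struct fiemap_extent));
		if (!fm)
			return 1;
		fm->fm_length = FIEMAP_MAX_OFFSET;	/* whole file */
		fm->fm_extent_count = DEMO_EXTENTS;

		if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
			perror("FS_IOC_FIEMAP");
			return 1;
		}

		for (i = 0; i < fm->fm_mapped_extents; i++) {
			struct fiemap_extent *fe = &fm->fm_extents[i];

			printf("logical %llu len %llu%s\n",
			       (unsigned long long)fe->fe_logical,
			       (unsigned long long)fe->fe_length,
			       (fe->fe_flags & FIEMAP_EXTENT_SHARED) ?
					" (shared)" : "");
		}
		free(fm);
		close(fd);
		return 0;
	}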

  User visible changes:

   - new incompatible feature block-group-tree adding a dedicated tree
     for tracking block groups; this allows a much faster load during
     mount and avoids the seeking caused by block group items being
     scattered across the extent tree
      - this reduces mount time for many-terabyte sized filesystems
      - a conversion tool will be provided so existing filesystems can
        also be updated in place
      - to reduce the test matrix and feature combinations, it requires
        no-holes and free-space-tree (mkfs defaults since 5.15)

   - improved reporting of super block corruption detected by scrub

   - scrub also tries to repair super block and does not wait until next
     commit

   - discard stats and tunables are exported in sysfs
     (/sys/fs/btrfs/FSID/discard); a minimal reader is sketched after
     this list

   - qgroup status is exported in sysfs
     (/sys/fs/btrfs/FSID/qgroups/)

   - verify that super block was not modified when thawing filesystem
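
   The new sysfs directories above can be read with plain file IO; a
   small illustrative reader (the FSID path component is a placeholder
   and no attribute file names are assumed) dumps everything under the
   discard directory:

	/* discard-stats.c: print each attribute under a btrfs discard dir */
	#include <dirent.h>
	#include <limits.h>
	#include <stdio.h>

	int main(void)
	{
		/* Placeholder path: substitute the filesystem UUID for FSID. */
		const char *dir = "/sys/fs/btrfs/FSID/discard";
		char path[PATH_MAX], line[256];
		struct dirent *de;
		DIR *d = opendir(dir);

		if (!d)
			return 1;
		while ((de = readdir(d)) != NULL) {
			FILE *f;

			if (de->d_name[0] == '.')
				continue;
			snprintf(path, sizeof(path), "%s/%s", dir, de->d_name);
			f = fopen(path, "r");
			if (!f)
				continue;
			if (fgets(line, sizeof(line), f))
				printf("%s: %s", de->d_name, line);
			fclose(f);
		}
		closedir(d);
		return 0;
	}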

  Fixes:

   - FIEMAP fixes
      - fix extent sharing status so it does not depend on the cached
        status of merged extent ranges
      - flush delalloc so compressed extents are reported correctly

   - fix alignment of VMA for memory mapped files on THP

   - send: fix failures when processing inodes with no links (orphan
     files and directories)

   - fix race between quota enable and quota rescan ioctl

   - handle more corner cases for read-only compat feature verification

   - fix missed extent on fsync after dropping extent maps

  Core:

   - lockdep annotations to validate various transaction states and
     state transitions

   - preliminary support for fs-verity in send

   - more effective memory use in scrub for subpage, where the sector
     size is smaller than the page size

   - block group caching progress logic has been removed, load is now
     synchronous

   - simplify end IO callbacks and bio handling, use chained bios
     instead of own tracking

   - add no-wait semantics to several functions (tree search, nocow,
     flushing, buffered write); a sketch of the nowait search pattern
     follows this list

   - cleanups and refactoring
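
   The nowait bit in struct btrfs_path is currently only allowed for
   read-only searches (btrfs_search_slot asserts no COW). A hedged
   in-kernel sketch of the calling pattern, with a made-up function name
   and error handling reduced to the -EAGAIN fallback:

	#include "ctree.h"

	/* Try a read-only btree lookup without blocking on locks or IO. */
	static int lookup_item_nowait(struct btrfs_root *root,
				      const struct btrfs_key *key)
	{
		struct btrfs_path *path;
		int ret;

		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;

		/* Read-only search: no transaction, no insertions, no COW. */
		path->nowait = 1;
		ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		if (ret == -EAGAIN) {
			/*
			 * A lock or tree block read would have blocked; retry
			 * later from a context that is allowed to sleep.
			 */
		}

		btrfs_free_path(path);
		return ret;
	}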

  MM changes:

   - export balance_dirty_pages_ratelimited_flags"
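
A minimal userspace sketch of the io_uring buffered write mentioned in the
Performance section above; the file path and buffer size are illustrative,
and the absence of O_DIRECT is what makes this a buffered write:

	/* uring-buffered-write.c: one buffered write submitted via io_uring */
	#include <fcntl.h>
	#include <liburing.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		char buf[4096];
		int fd;

		memset(buf, 'a', sizeof(buf));
		/* Buffered IO: note there is no O_DIRECT here. */
		fd = open("/mnt/btrfs/testfile", O_WRONLY | O_CREAT, 0644);
		if (fd < 0 || io_uring_queue_init(8, &ring, 0) < 0)
			return 1;

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_write(sqe, fd, buf, sizeof(buf), 0);
		io_uring_submit(&ring);

		if (io_uring_wait_cqe(&ring, &cqe) == 0) {
			printf("write completed: %d\n", cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		close(fd);
		return 0;
	}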

* tag 'for-6.1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (177 commits)
  btrfs: set generation before calling btrfs_clean_tree_block in btrfs_init_new_buffer
  btrfs: drop extent map range more efficiently
  btrfs: avoid pointless extent map tree search when flushing delalloc
  btrfs: remove unnecessary next extent map search
  btrfs: remove unnecessary NULL pointer checks when searching extent maps
  btrfs: assert tree is locked when clearing extent map from logging
  btrfs: remove unnecessary extent map initializations
  btrfs: remove the refcount warning/check at free_extent_map()
  btrfs: add helper to replace extent map range with a new extent map
  btrfs: move open coded extent map tree deletion out of inode eviction
  btrfs: use cond_resched_rwlock_write() during inode eviction
  btrfs: use extent_map_end() at btrfs_drop_extent_map_range()
  btrfs: move btrfs_drop_extent_cache() to extent_map.c
  btrfs: fix missed extent on fsync after dropping extent maps
  btrfs: remove stale prototype of btrfs_write_inode
  btrfs: enable nowait async buffered writes
  btrfs: assert nowait mode is not used for some btree search functions
  btrfs: make btrfs_buffered_write nowait compatible
  btrfs: plumb NOWAIT through the write path
  btrfs: make lock_and_cleanup_extent_if_need nowait compatible
  ...

70 files changed:
fs/btrfs/Makefile
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/block-group.c
fs/btrfs/block-group.h
fs/btrfs/block-rsv.c
fs/btrfs/block-rsv.h
fs/btrfs/btrfs_inode.h
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delalloc-space.c
fs/btrfs/delalloc-space.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-inode.h
fs/btrfs/dev-replace.c
fs/btrfs/dev-replace.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-io-tree.c [new file with mode: 0644]
fs/btrfs/extent-io-tree.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/free-space-cache.h
fs/btrfs/free-space-tree.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/locking.c
fs/btrfs/locking.h
fs/btrfs/misc.h
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/props.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/raid56.c
fs/btrfs/raid56.h
fs/btrfs/reflink.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/send.h
fs/btrfs/space-info.c
fs/btrfs/space-info.h
fs/btrfs/super.c
fs/btrfs/sysfs.c
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/tests/extent-io-tests.c
fs/btrfs/tests/free-space-tests.c
fs/btrfs/tests/inode-tests.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/tree-log.h
fs/btrfs/verity.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/zoned.c
fs/verity/fsverity_private.h
include/linux/fsverity.h
include/trace/events/btrfs.h
include/uapi/linux/btrfs.h
include/uapi/linux/btrfs_tree.h
mm/page-writeback.c

index 99f9995..fa9ddcc 100644 (file)
@@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
           uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
           block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
-          subpage.o tree-mod-log.o
+          subpage.o tree-mod-log.o extent-io-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
index d385357..dce3a16 100644 (file)
@@ -1511,16 +1511,118 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-/**
- * Check if an extent is shared or not
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+                                       struct btrfs_root *root,
+                                       u64 bytenr, int level, bool *is_shared)
+{
+       struct btrfs_backref_shared_cache_entry *entry;
+
+       if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+               return false;
+
+       /*
+        * Level -1 is used for the data extent, which is not reliable to cache
+        * because its reference count can increase or decrease without us
+        * realizing. We cache results only for extent buffers that lead from
+        * the root node down to the leaf with the file extent item.
+        */
+       ASSERT(level >= 0);
+
+       entry = &cache->entries[level];
+
+       /* Unused cache entry or being used for some other extent buffer. */
+       if (entry->bytenr != bytenr)
+               return false;
+
+       /*
+        * We cached a false result, but the last snapshot generation of the
+        * root changed, so we now have a snapshot. Don't trust the result.
+        */
+       if (!entry->is_shared &&
+           entry->gen != btrfs_root_last_snapshot(&root->root_item))
+               return false;
+
+       /*
+        * If we cached a true result and the last generation used for dropping
+        * a root changed, we can not trust the result, because the dropped root
+        * could be a snapshot sharing this extent buffer.
+        */
+       if (entry->is_shared &&
+           entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
+               return false;
+
+       *is_shared = entry->is_shared;
+
+       return true;
+}
+
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */
+static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
+                                      struct btrfs_root *root,
+                                      u64 bytenr, int level, bool is_shared)
+{
+       struct btrfs_backref_shared_cache_entry *entry;
+       u64 gen;
+
+       if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
+               return;
+
+       /*
+        * Level -1 is used for the data extent, which is not reliable to cache
+        * because its reference count can increase or decrease without us
+        * realizing. We cache results only for extent buffers that lead from
+        * the root node down to the leaf with the file extent item.
+        */
+       ASSERT(level >= 0);
+
+       if (is_shared)
+               gen = btrfs_get_last_root_drop_gen(root->fs_info);
+       else
+               gen = btrfs_root_last_snapshot(&root->root_item);
+
+       entry = &cache->entries[level];
+       entry->bytenr = bytenr;
+       entry->is_shared = is_shared;
+       entry->gen = gen;
+
+       /*
+        * If we found an extent buffer is shared, set the cache result for all
+        * extent buffers below it to true. As nodes in the path are COWed,
+        * their sharedness is moved to their children, and if a leaf is COWed,
+        * then the sharedness of a data extent becomes direct, the refcount of
+        * data extent is increased in the extent item at the extent tree.
+        */
+       if (is_shared) {
+               for (int i = 0; i < level; i++) {
+                       entry = &cache->entries[i];
+                       entry->is_shared = is_shared;
+                       entry->gen = gen;
+               }
+       }
+}
+
+/*
+ * Check if a data extent is shared or not.
  *
- * @root:   root inode belongs to
- * @inum:   inode number of the inode whose extent we are checking
- * @bytenr: logical bytenr of the extent we are checking
- * @roots:  list of roots this extent is shared among
- * @tmp:    temporary list used for iteration
+ * @root:        The root the inode belongs to.
+ * @inum:        Number of the inode whose extent we are checking.
+ * @bytenr:      Logical bytenr of the extent we are checking.
+ * @extent_gen:  Generation of the extent (file extent item) or 0 if it is
+ *               not known.
+ * @roots:       List of roots this extent is shared among.
+ * @tmp:         Temporary list used for iteration.
+ * @cache:       A backref lookup result cache.
  *
- * btrfs_check_shared uses the backref walking code but will short
+ * btrfs_is_data_extent_shared uses the backref walking code but will short
  * circuit as soon as it finds a root or inode that doesn't match the
  * one passed in. This provides a significant performance benefit for
  * callers (such as fiemap) which want to know whether the extent is
@@ -1531,8 +1633,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
  *
  * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
  */
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
-               struct ulist *roots, struct ulist *tmp)
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+                               u64 extent_gen,
+                               struct ulist *roots, struct ulist *tmp,
+                               struct btrfs_backref_shared_cache *cache)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_trans_handle *trans;
@@ -1545,6 +1649,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
                .inum = inum,
                .share_count = 0,
        };
+       int level;
 
        ulist_init(roots);
        ulist_init(tmp);
@@ -1561,22 +1666,52 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
                btrfs_get_tree_mod_seq(fs_info, &elem);
        }
 
+       /* -1 means we are in the bytenr of the data extent. */
+       level = -1;
        ULIST_ITER_INIT(&uiter);
        while (1) {
+               bool is_shared;
+               bool cached;
+
                ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
                                        roots, NULL, &shared, false);
                if (ret == BACKREF_FOUND_SHARED) {
                        /* this is the only condition under which we return 1 */
                        ret = 1;
+                       if (level >= 0)
+                               store_backref_shared_cache(cache, root, bytenr,
+                                                          level, true);
                        break;
                }
                if (ret < 0 && ret != -ENOENT)
                        break;
                ret = 0;
+               /*
+                * If our data extent is not shared through reflinks and it was
+                * created in a generation after the last one used to create a
+                * snapshot of the inode's root, then it can not be shared
+                * indirectly through subtrees, as that can only happen with
+                * snapshots. In this case bail out, no need to check for the
+                * sharedness of extent buffers.
+                */
+               if (level == -1 &&
+                   extent_gen > btrfs_root_last_snapshot(&root->root_item))
+                       break;
+
+               if (level >= 0)
+                       store_backref_shared_cache(cache, root, bytenr,
+                                                  level, false);
                node = ulist_next(tmp, &uiter);
                if (!node)
                        break;
                bytenr = node->val;
+               level++;
+               cached = lookup_backref_shared_cache(cache, root, bytenr, level,
+                                                    &is_shared);
+               if (cached) {
+                       ret = (is_shared ? 1 : 0);
+                       break;
+               }
                shared.share_count = 0;
                cond_resched();
        }
index 2759de7..52ae695 100644 (file)
@@ -17,6 +17,20 @@ struct inode_fs_paths {
        struct btrfs_data_container     *fspath;
 };
 
+struct btrfs_backref_shared_cache_entry {
+       u64 bytenr;
+       u64 gen;
+       bool is_shared;
+};
+
+struct btrfs_backref_shared_cache {
+       /*
+        * A path from a root to a leaf that has a file extent item pointing to
+        * a given data extent should never exceed the maximum b+tree height.
+        */
+       struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
+};
+
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
                void *ctx);
 
@@ -62,8 +76,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
                          u64 start_off, struct btrfs_path *path,
                          struct btrfs_inode_extref **ret_extref,
                          u64 *found_off);
-int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
-               struct ulist *roots, struct ulist *tmp_ulist);
+int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
+                               u64 extent_gen,
+                               struct ulist *roots, struct ulist *tmp,
+                               struct btrfs_backref_shared_cache *cache);
 
 int __init btrfs_prelim_ref_init(void);
 void __cold btrfs_prelim_ref_exit(void);
index e0375ba..32c415c 100644 (file)
@@ -593,8 +593,6 @@ next:
 
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
-                               if (wakeup)
-                                       caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
@@ -618,9 +616,6 @@ next:
                        key.objectid = last;
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
-
-                       if (wakeup)
-                               caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
@@ -655,7 +650,6 @@ next:
 
        total_found += add_new_free_space(block_group, last,
                                block_group->start + block_group->length);
-       caching_ctl->progress = (u64)-1;
 
 out:
        btrfs_free_path(path);
@@ -725,8 +719,6 @@ done:
        }
 #endif
 
-       caching_ctl->progress = (u64)-1;
-
        up_read(&fs_info->commit_root_sem);
        btrfs_free_excluded_extents(block_group);
        mutex_unlock(&caching_ctl->mutex);
@@ -755,7 +747,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
-       caching_ctl->progress = cache->start;
        refcount_set(&caching_ctl->count, 2);
        btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 
@@ -772,7 +763,6 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
        WARN_ON(cache->caching_ctl);
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_STARTED;
-       cache->has_caching_ctl = 1;
        spin_unlock(&cache->lock);
 
        write_lock(&fs_info->block_group_cache_lock);
@@ -784,8 +774,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
 
        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 out:
        if (wait && caching_ctl)
                ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
        if (caching_ctl)
                btrfs_put_caching_control(caching_ctl);
 
@@ -988,32 +980,31 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                kobject_put(kobj);
        }
 
-       if (block_group->has_caching_ctl)
-               caching_ctl = btrfs_get_caching_control(block_group);
        if (block_group->cached == BTRFS_CACHE_STARTED)
                btrfs_wait_block_group_cache_done(block_group);
-       if (block_group->has_caching_ctl) {
-               write_lock(&fs_info->block_group_cache_lock);
-               if (!caching_ctl) {
-                       struct btrfs_caching_control *ctl;
-
-                       list_for_each_entry(ctl,
-                                   &fs_info->caching_block_groups, list)
-                               if (ctl->block_group == block_group) {
-                                       caching_ctl = ctl;
-                                       refcount_inc(&caching_ctl->count);
-                                       break;
-                               }
-               }
-               if (caching_ctl)
-                       list_del_init(&caching_ctl->list);
-               write_unlock(&fs_info->block_group_cache_lock);
-               if (caching_ctl) {
-                       /* Once for the caching bgs list and once for us. */
-                       btrfs_put_caching_control(caching_ctl);
-                       btrfs_put_caching_control(caching_ctl);
+
+       write_lock(&fs_info->block_group_cache_lock);
+       caching_ctl = btrfs_get_caching_control(block_group);
+       if (!caching_ctl) {
+               struct btrfs_caching_control *ctl;
+
+               list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
+                       if (ctl->block_group == block_group) {
+                               caching_ctl = ctl;
+                               refcount_inc(&caching_ctl->count);
+                               break;
+                       }
                }
        }
+       if (caching_ctl)
+               list_del_init(&caching_ctl->list);
+       write_unlock(&fs_info->block_group_cache_lock);
+
+       if (caching_ctl) {
+               /* Once for the caching bgs list and once for us. */
+               btrfs_put_caching_control(caching_ctl);
+               btrfs_put_caching_control(caching_ctl);
+       }
 
        spin_lock(&trans->transaction->dirty_bgs_lock);
        WARN_ON(!list_empty(&block_group->dirty_list));
@@ -1034,12 +1025,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                        < block_group->zone_unusable);
                WARN_ON(block_group->space_info->disk_total
                        < block_group->length * factor);
-               WARN_ON(block_group->zone_is_active &&
+               WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+                                &block_group->runtime_flags) &&
                        block_group->space_info->active_total_bytes
                        < block_group->length);
        }
        block_group->space_info->total_bytes -= block_group->length;
-       if (block_group->zone_is_active)
+       if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
                block_group->space_info->active_total_bytes -= block_group->length;
        block_group->space_info->bytes_readonly -=
                (block_group->length - block_group->zone_unusable);
@@ -1069,7 +1061,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                goto out;
 
        spin_lock(&block_group->lock);
-       block_group->removed = 1;
+       set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+
        /*
         * At this point trimming or scrub can't start on this block group,
         * because we removed the block group from the rbtree
@@ -1304,6 +1297,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
                return;
 
+       if (btrfs_fs_closing(fs_info))
+               return;
+
        /*
         * Long running balances can keep us blocked here for eternity, so
         * simply skip deletion if we're unable to get the mutex.
@@ -1543,6 +1539,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
                return;
 
+       if (btrfs_fs_closing(fs_info))
+               return;
+
        if (!btrfs_should_reclaim(fs_info))
                return;
 
@@ -1890,16 +1889,6 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
        return 0;
 }
 
-static void link_block_group(struct btrfs_block_group *cache)
-{
-       struct btrfs_space_info *space_info = cache->space_info;
-       int index = btrfs_bg_flags_to_raid_index(cache->flags);
-
-       down_write(&space_info->groups_sem);
-       list_add_tail(&cache->list, &space_info->block_groups[index]);
-       up_write(&space_info->groups_sem);
-}
-
 static struct btrfs_block_group *btrfs_create_block_group_cache(
                struct btrfs_fs_info *fs_info, u64 start)
 {
@@ -1937,7 +1926,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
        btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
        atomic_set(&cache->frozen, 0);
        mutex_init(&cache->free_space_lock);
-       btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
+       cache->full_stripe_locks_root.root = RB_ROOT;
+       mutex_init(&cache->full_stripe_locks_root.lock);
 
        return cache;
 }
@@ -2002,7 +1992,6 @@ static int read_one_block_group(struct btrfs_fs_info *info,
                                int need_clear)
 {
        struct btrfs_block_group *cache;
-       struct btrfs_space_info *space_info;
        const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
        int ret;
 
@@ -2078,11 +2067,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
                /* Should not have any excluded extents. Just in case, though. */
                btrfs_free_excluded_extents(cache);
        } else if (cache->length == cache->used) {
-               cache->last_byte_to_unpin = (u64)-1;
                cache->cached = BTRFS_CACHE_FINISHED;
                btrfs_free_excluded_extents(cache);
        } else if (cache->used == 0) {
-               cache->last_byte_to_unpin = (u64)-1;
                cache->cached = BTRFS_CACHE_FINISHED;
                add_new_free_space(cache, cache->start,
                                   cache->start + cache->length);
@@ -2095,14 +2082,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
                goto error;
        }
        trace_btrfs_add_block_group(info, cache, 0);
-       btrfs_update_space_info(info, cache->flags, cache->length,
-                               cache->used, cache->bytes_super,
-                               cache->zone_unusable, cache->zone_is_active,
-                               &space_info);
-
-       cache->space_info = space_info;
-
-       link_block_group(cache);
+       btrfs_add_bg_to_space_info(info, cache);
 
        set_avail_alloc_bits(info, cache->flags);
        if (btrfs_chunk_writeable(info, cache->start)) {
@@ -2126,7 +2106,6 @@ error:
 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
 {
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-       struct btrfs_space_info *space_info;
        struct rb_node *node;
        int ret = 0;
 
@@ -2146,7 +2125,6 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
                /* Fill dummy cache as FULL */
                bg->length = em->len;
                bg->flags = map->type;
-               bg->last_byte_to_unpin = (u64)-1;
                bg->cached = BTRFS_CACHE_FINISHED;
                bg->used = em->len;
                bg->flags = map->type;
@@ -2167,10 +2145,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
                        break;
                }
 
-               btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
-                                       0, 0, false, &space_info);
-               bg->space_info = space_info;
-               link_block_group(bg);
+               btrfs_add_bg_to_space_info(fs_info, bg);
 
                set_avail_alloc_bits(fs_info, bg->flags);
        }
@@ -2190,7 +2165,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
        int need_clear = 0;
        u64 cache_gen;
 
-       if (!root)
+       /*
+        * Either no extent root (with ibadroots rescue option) or we have
+        * unsupported RO options. The fs can never be mounted read-write, so no
+        * need to waste time searching block group items.
+        *
+        * This also allows new extent tree related changes to be RO compat,
+        * no need for a full incompat flag.
+        */
+       if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
+                     ~BTRFS_FEATURE_COMPAT_RO_SUPP))
                return fill_dummy_bgs(info);
 
        key.objectid = 0;
@@ -2425,7 +2409,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
                ret = insert_block_group_item(trans, block_group);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
-               if (!block_group->chunk_item_inserted) {
+               if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+                             &block_group->runtime_flags)) {
                        mutex_lock(&fs_info->chunk_mutex);
                        ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
                        mutex_unlock(&fs_info->chunk_mutex);
@@ -2494,7 +2479,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
        set_free_space_tree_thresholds(cache);
        cache->used = bytes_used;
        cache->flags = type;
-       cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
        cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
 
@@ -2519,14 +2503,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 
        btrfs_free_excluded_extents(cache);
 
-#ifdef CONFIG_BTRFS_DEBUG
-       if (btrfs_should_fragment_free_space(cache)) {
-               u64 new_bytes_used = size - bytes_used;
-
-               bytes_used += new_bytes_used >> 1;
-               fragment_free_space(cache);
-       }
-#endif
        /*
         * Ensure the corresponding space_info object is created and
         * assigned to our block group. We want our bg to be added to the rbtree
@@ -2547,12 +2523,17 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
         * the rbtree, update the space info's counters.
         */
        trace_btrfs_add_block_group(fs_info, cache, 1);
-       btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
-                               cache->bytes_super, cache->zone_unusable,
-                               cache->zone_is_active, &cache->space_info);
+       btrfs_add_bg_to_space_info(fs_info, cache);
        btrfs_update_global_block_rsv(fs_info);
 
-       link_block_group(cache);
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(cache)) {
+               u64 new_bytes_used = size - bytes_used;
+
+               cache->space_info->bytes_used += new_bytes_used >> 1;
+               fragment_free_space(cache);
+       }
+#endif
 
        list_add_tail(&cache->bg_list, &trans->new_bgs);
        trans->delayed_ref_updates++;
@@ -2869,7 +2850,7 @@ again:
        cache_size *= fs_info->sectorsize;
 
        ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
-                                         cache_size);
+                                         cache_size, false);
        if (ret)
                goto out_put;
 
@@ -3965,35 +3946,24 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
 {
        struct btrfs_block_group *block_group;
-       u64 last = 0;
 
-       while (1) {
-               struct inode *inode;
+       block_group = btrfs_lookup_first_block_group(info, 0);
+       while (block_group) {
+               btrfs_wait_block_group_cache_done(block_group);
+               spin_lock(&block_group->lock);
+               if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
+                                      &block_group->runtime_flags)) {
+                       struct inode *inode = block_group->inode;
 
-               block_group = btrfs_lookup_first_block_group(info, last);
-               while (block_group) {
-                       btrfs_wait_block_group_cache_done(block_group);
-                       spin_lock(&block_group->lock);
-                       if (block_group->iref)
-                               break;
+                       block_group->inode = NULL;
                        spin_unlock(&block_group->lock);
-                       block_group = btrfs_next_block_group(block_group);
-               }
-               if (!block_group) {
-                       if (last == 0)
-                               break;
-                       last = 0;
-                       continue;
-               }
 
-               inode = block_group->inode;
-               block_group->iref = 0;
-               block_group->inode = NULL;
-               spin_unlock(&block_group->lock);
-               ASSERT(block_group->io_ctl.inode == NULL);
-               iput(inode);
-               last = block_group->start + block_group->length;
-               btrfs_put_block_group(block_group);
+                       ASSERT(block_group->io_ctl.inode == NULL);
+                       iput(inode);
+               } else {
+                       spin_unlock(&block_group->lock);
+               }
+               block_group = btrfs_next_block_group(block_group);
        }
 }
 
@@ -4129,7 +4099,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
 
        spin_lock(&block_group->lock);
        cleanup = (atomic_dec_and_test(&block_group->frozen) &&
-                  block_group->removed);
+                  test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
        spin_unlock(&block_group->lock);
 
        if (cleanup) {
@@ -4150,7 +4120,7 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
                 * tasks trimming this block group have left 1 entry each one.
                 * Free them if any.
                 */
-               __btrfs_remove_free_space_cache(block_group->free_space_ctl);
+               btrfs_remove_free_space_cache(block_group);
        }
 }
 
index 6b3cdc4..8fb14b9 100644 (file)
@@ -46,19 +46,44 @@ enum btrfs_chunk_alloc_enum {
        CHUNK_ALLOC_FORCE_FOR_EXTENT,
 };
 
+/* Block group flags set at runtime */
+enum btrfs_block_group_flags {
+       BLOCK_GROUP_FLAG_IREF,
+       BLOCK_GROUP_FLAG_REMOVED,
+       BLOCK_GROUP_FLAG_TO_COPY,
+       BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
+       BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
+       BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+       BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+};
+
+enum btrfs_caching_type {
+       BTRFS_CACHE_NO,
+       BTRFS_CACHE_STARTED,
+       BTRFS_CACHE_FINISHED,
+       BTRFS_CACHE_ERROR,
+};
+
 struct btrfs_caching_control {
        struct list_head list;
        struct mutex mutex;
        wait_queue_head_t wait;
        struct btrfs_work work;
        struct btrfs_block_group *block_group;
-       u64 progress;
        refcount_t count;
 };
 
 /* Once caching_thread() finds this much free space, it will wake up waiters. */
 #define CACHING_CTL_WAKE_UP SZ_2M
 
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+       struct rb_root root;
+       struct mutex lock;
+};
+
 struct btrfs_block_group {
        struct btrfs_fs_info *fs_info;
        struct inode *inode;
@@ -95,23 +120,15 @@ struct btrfs_block_group {
 
        /* For raid56, this is a full stripe, without parity */
        unsigned long full_stripe_len;
+       unsigned long runtime_flags;
 
        unsigned int ro;
-       unsigned int iref:1;
-       unsigned int has_caching_ctl:1;
-       unsigned int removed:1;
-       unsigned int to_copy:1;
-       unsigned int relocating_repair:1;
-       unsigned int chunk_item_inserted:1;
-       unsigned int zone_is_active:1;
-       unsigned int zoned_data_reloc_ongoing:1;
 
        int disk_cache_state;
 
        /* Cache tracking stuff */
        int cached;
        struct btrfs_caching_control *caching_ctl;
-       u64 last_byte_to_unpin;
 
        struct btrfs_space_info *space_info;
 
@@ -305,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
-void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
-                               struct btrfs_caching_control *caching_ctl);
 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
                       struct block_device *bdev, u64 physical, u64 **logical,
                       int *naddrs, int *stripe_len);
index 06be064..ec96285 100644 (file)
@@ -286,7 +286,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
         */
        if (block_rsv == delayed_rsv)
                target = global_rsv;
-       else if (block_rsv != global_rsv && !delayed_rsv->full)
+       else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
                target = delayed_rsv;
 
        if (target && block_rsv->space_info != target->space_info)
@@ -424,6 +424,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
        case BTRFS_CSUM_TREE_OBJECTID:
        case BTRFS_EXTENT_TREE_OBJECTID:
        case BTRFS_FREE_SPACE_TREE_OBJECTID:
+       case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
                root->block_rsv = &fs_info->delayed_refs_rsv;
                break;
        case BTRFS_ROOT_TREE_OBJECTID:
index 0c18370..578c349 100644 (file)
@@ -92,4 +92,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
        btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
 }
 
+/*
+ * Fast path to check if the reserve is full, may be carefully used outside of
+ * locks.
+ */
+static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
+{
+       return data_race(rsv->full);
+}
+
 #endif /* BTRFS_BLOCK_RSV_H */
index b160b8e..54c2ccb 100644 (file)
@@ -65,6 +65,8 @@ enum {
         * on the same file.
         */
        BTRFS_INODE_VERITY_IN_PROGRESS,
+       /* Set when this inode is a free space inode. */
+       BTRFS_INODE_FREE_SPACE_INODE,
 };
 
 /* in memory btrfs inode */
@@ -94,7 +96,8 @@ struct btrfs_inode {
        /* special utility tree used to record which mirrors have already been
         * tried when checksums fail for a given block
         */
-       struct extent_io_tree io_failure_tree;
+       struct rb_root io_failure_tree;
+       spinlock_t io_failure_lock;
 
        /*
         * Keep track of where the inode has extent items mapped in order to
@@ -250,11 +253,6 @@ struct btrfs_inode {
        struct inode vfs_inode;
 };
 
-static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
-{
-       return inode->root->fs_info->sectorsize;
-}
-
 static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 {
        return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -272,13 +270,6 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
        return (unsigned long)h;
 }
 
-static inline void btrfs_insert_inode_hash(struct inode *inode)
-{
-       unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
-
-       __insert_inode_hash(inode, h);
-}
-
 #if BITS_PER_LONG == 32
 
 /*
@@ -312,13 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
 
 static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
 {
-       struct btrfs_root *root = inode->root;
-
-       if (root == root->fs_info->tree_root &&
-           btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
-               return true;
-
-       return false;
+       return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
 }
 
 static inline bool is_data_inode(struct inode *inode)
index e84d22c..54caa00 100644 (file)
@@ -152,9 +152,7 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
        }
 
        /* Do io completion on the original bio */
-       if (cb->status != BLK_STS_OK)
-               cb->orig_bio->bi_status = cb->status;
-       bio_endio(cb->orig_bio);
+       btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
 
        /* Finally free the cb struct */
        kfree(cb->compressed_pages);
@@ -166,16 +164,15 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
  * before decompressing it into the original bio and freeing the uncompressed
  * pages.
  */
-static void end_compressed_bio_read(struct bio *bio)
+static void end_compressed_bio_read(struct btrfs_bio *bbio)
 {
-       struct compressed_bio *cb = bio->bi_private;
+       struct compressed_bio *cb = bbio->private;
        struct inode *inode = cb->inode;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_inode *bi = BTRFS_I(inode);
        bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
                    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-       blk_status_t status = bio->bi_status;
-       struct btrfs_bio *bbio = btrfs_bio(bio);
+       blk_status_t status = bbio->bio.bi_status;
        struct bvec_iter iter;
        struct bio_vec bv;
        u32 offset;
@@ -186,9 +183,8 @@ static void end_compressed_bio_read(struct bio *bio)
                if (!status &&
                    (!csum || !btrfs_check_data_csum(inode, bbio, offset,
                                                     bv.bv_page, bv.bv_offset))) {
-                       clean_io_failure(fs_info, &bi->io_failure_tree,
-                                        &bi->io_tree, start, bv.bv_page,
-                                        btrfs_ino(bi), bv.bv_offset);
+                       btrfs_clean_io_failure(bi, start, bv.bv_page,
+                                              bv.bv_offset);
                } else {
                        int ret;
 
@@ -209,7 +205,7 @@ static void end_compressed_bio_read(struct bio *bio)
        if (refcount_dec_and_test(&cb->pending_ios))
                finish_compressed_bio_read(cb);
        btrfs_bio_free_csum(bbio);
-       bio_put(bio);
+       bio_put(&bbio->bio);
 }
 
 /*
@@ -301,20 +297,20 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
  * This also calls the writeback end hooks for the file pages so that metadata
  * and checksums can be updated in the file.
  */
-static void end_compressed_bio_write(struct bio *bio)
+static void end_compressed_bio_write(struct btrfs_bio *bbio)
 {
-       struct compressed_bio *cb = bio->bi_private;
+       struct compressed_bio *cb = bbio->private;
 
-       if (bio->bi_status)
-               cb->status = bio->bi_status;
+       if (bbio->bio.bi_status)
+               cb->status = bbio->bio.bi_status;
 
        if (refcount_dec_and_test(&cb->pending_ios)) {
                struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
 
-               btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+               btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
                queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
        }
-       bio_put(bio);
+       bio_put(&bbio->bio);
 }
 
 /*
@@ -335,7 +331,8 @@ static void end_compressed_bio_write(struct bio *bio)
 
 
 static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
-                                       blk_opf_t opf, bio_end_io_t endio_func,
+                                       blk_opf_t opf,
+                                       btrfs_bio_end_io_t endio_func,
                                        u64 *next_stripe_start)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
@@ -344,12 +341,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
        struct bio *bio;
        int ret;
 
-       bio = btrfs_bio_alloc(BIO_MAX_VECS);
-
+       bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
        bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
-       bio->bi_opf = opf;
-       bio->bi_private = cb;
-       bio->bi_end_io = endio_func;
 
        em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
        if (IS_ERR(em)) {
@@ -478,8 +471,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
                        if (!skip_sum) {
                                ret = btrfs_csum_one_bio(inode, bio, start, true);
                                if (ret) {
-                                       bio->bi_status = ret;
-                                       bio_endio(bio);
+                                       btrfs_bio_end_io(btrfs_bio(bio), ret);
                                        break;
                                }
                        }
@@ -596,7 +588,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                }
 
                page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
-               lock_extent(tree, cur, page_end);
+               lock_extent(tree, cur, page_end, NULL);
                read_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
                read_unlock(&em_tree->lock);
@@ -610,7 +602,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                    (cur + fs_info->sectorsize > extent_map_end(em)) ||
                    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
                        free_extent_map(em);
-                       unlock_extent(tree, cur, page_end);
+                       unlock_extent(tree, cur, page_end, NULL);
                        unlock_page(page);
                        put_page(page);
                        break;
@@ -630,7 +622,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                add_size = min(em->start + em->len, page_end + 1) - cur;
                ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
                if (ret != add_size) {
-                       unlock_extent(tree, cur, page_end);
+                       unlock_extent(tree, cur, page_end, NULL);
                        unlock_page(page);
                        put_page(page);
                        break;
@@ -799,8 +791,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
                        ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
                        if (ret) {
-                               comp_bio->bi_status = ret;
-                               bio_endio(comp_bio);
+                               btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
                                break;
                        }
 
@@ -826,8 +817,7 @@ fail:
        kfree(cb);
 out:
        free_extent_map(em);
-       bio->bi_status = ret;
-       bio_endio(bio);
+       btrfs_bio_end_io(btrfs_bio(bio), ret);
        return;
 }
 
index ebfa35f..b39b339 100644 (file)
@@ -1447,6 +1447,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
                        return 0;
                }
 
+               if (p->nowait) {
+                       free_extent_buffer(tmp);
+                       return -EAGAIN;
+               }
+
                if (unlock_up)
                        btrfs_unlock_up_safe(p, level + 1);
 
@@ -1467,6 +1472,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
                        ret = -EAGAIN;
 
                goto out;
+       } else if (p->nowait) {
+               return -EAGAIN;
        }
 
        if (unlock_up) {
@@ -1634,7 +1641,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
                 * We don't know the level of the root node until we actually
                 * have it read locked
                 */
-               b = btrfs_read_lock_root_node(root);
+               if (p->nowait) {
+                       b = btrfs_try_read_lock_root_node(root);
+                       if (IS_ERR(b))
+                               return b;
+               } else {
+                       b = btrfs_read_lock_root_node(root);
+               }
                level = btrfs_header_level(b);
                if (level > write_lock_level)
                        goto out;
@@ -1910,6 +1923,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        WARN_ON(p->nodes[0] != NULL);
        BUG_ON(!cow && ins_len);
 
+       /*
+        * For now only allow nowait for read only operations.  There's no
+        * strict reason why we can't, we just only need it for reads so it's
+        * only implemented for reads.
+        */
+       ASSERT(!p->nowait || !cow);
+
        if (ins_len < 0) {
                lowest_unlock = 2;
 
@@ -1936,7 +1956,12 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
        if (p->need_commit_sem) {
                ASSERT(p->search_commit_root);
-               down_read(&fs_info->commit_root_sem);
+               if (p->nowait) {
+                       if (!down_read_trylock(&fs_info->commit_root_sem))
+                               return -EAGAIN;
+               } else {
+                       down_read(&fs_info->commit_root_sem);
+               }
        }
 
 again:
@@ -2082,7 +2107,15 @@ cow_done:
                                btrfs_tree_lock(b);
                                p->locks[level] = BTRFS_WRITE_LOCK;
                        } else {
-                               btrfs_tree_read_lock(b);
+                               if (p->nowait) {
+                                       if (!btrfs_try_tree_read_lock(b)) {
+                                               free_extent_buffer(b);
+                                               ret = -EAGAIN;
+                                               goto done;
+                                       }
+                               } else {
+                                       btrfs_tree_read_lock(b);
+                               }
                                p->locks[level] = BTRFS_READ_LOCK;
                        }
                        p->nodes[level] = b;
@@ -2131,6 +2164,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
 
        lowest_level = p->lowest_level;
        WARN_ON(p->nodes[0] != NULL);
+       ASSERT(!p->nowait);
 
        if (p->search_commit_root) {
                BUG_ON(time_seq);
@@ -4432,6 +4466,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
        int ret = 1;
        int keep_locks = path->keep_locks;
 
+       ASSERT(!path->nowait);
        path->keep_locks = 1;
 again:
        cur = btrfs_read_lock_root_node(root);
@@ -4612,6 +4647,8 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
        int ret;
        int i;
 
+       ASSERT(!path->nowait);
+
        nritems = btrfs_header_nritems(path->nodes[0]);
        if (nritems == 0)
                return 1;
index df8c99c..727595e 100644 (file)
@@ -42,7 +42,6 @@ struct btrfs_delayed_ref_root;
 struct btrfs_space_info;
 struct btrfs_block_group;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
@@ -50,6 +49,11 @@ struct btrfs_ordered_sum;
 struct btrfs_ref;
 struct btrfs_bio;
 struct btrfs_ioctl_encoded_io_args;
+struct btrfs_device;
+struct btrfs_fs_devices;
+struct btrfs_balance_control;
+struct btrfs_delayed_root;
+struct reloc_control;
 
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
@@ -280,14 +284,9 @@ struct btrfs_super_block {
        /* the UUID written into btree blocks */
        u8 metadata_uuid[BTRFS_FSID_SIZE];
 
-       /* Extent tree v2 */
-       __le64 block_group_root;
-       __le64 block_group_root_generation;
-       u8 block_group_root_level;
-
        /* future expansion */
-       u8 reserved8[7];
-       __le64 reserved[25];
+       u8 reserved8[8];
+       __le64 reserved[27];
        u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
        struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 
@@ -307,7 +306,8 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
 #define BTRFS_FEATURE_COMPAT_RO_SUPP                   \
        (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |      \
         BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
-        BTRFS_FEATURE_COMPAT_RO_VERITY)
+        BTRFS_FEATURE_COMPAT_RO_VERITY |               \
+        BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
 
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET       0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR     0ULL
@@ -443,9 +443,10 @@ struct btrfs_path {
         * header (ie. sizeof(struct btrfs_item) is not included).
         */
        unsigned int search_for_extension:1;
+       /* Stop search if any locks need to be taken (for read) */
+       unsigned int nowait:1;
 };
-#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
-                                       sizeof(struct btrfs_item))
+
 struct btrfs_dev_replace {
        u64 replace_state;      /* see #define above */
        time64_t time_started;  /* seconds since 1-Jan-1970 */
@@ -502,21 +503,6 @@ struct btrfs_free_cluster {
        struct list_head block_group_list;
 };
 
-enum btrfs_caching_type {
-       BTRFS_CACHE_NO,
-       BTRFS_CACHE_STARTED,
-       BTRFS_CACHE_FINISHED,
-       BTRFS_CACHE_ERROR,
-};
-
-/*
- * Tree to record all locked full stripes of a RAID5/6 block group
- */
-struct btrfs_full_stripe_locks_tree {
-       struct rb_root root;
-       struct mutex lock;
-};
-
 /* Discard control. */
 /*
  * Async discard uses multiple lists to differentiate the discard filter
@@ -548,42 +534,6 @@ struct btrfs_discard_ctl {
        atomic64_t discard_bytes_saved;
 };
 
-void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
-
-/* fs_info */
-struct reloc_control;
-struct btrfs_device;
-struct btrfs_fs_devices;
-struct btrfs_balance_control;
-struct btrfs_delayed_root;
-
-/*
- * Block group or device which contains an active swapfile. Used for preventing
- * unsafe operations while a swapfile is active.
- *
- * These are sorted on (ptr, inode) (note that a block group or device can
- * contain more than one swapfile). We compare the pointer values because we
- * don't actually care what the object is, we just need a quick check whether
- * the object exists in the rbtree.
- */
-struct btrfs_swapfile_pin {
-       struct rb_node node;
-       void *ptr;
-       struct inode *inode;
-       /*
-        * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
-        * points to a struct btrfs_device.
-        */
-       bool is_block_group;
-       /*
-        * Only used when 'is_block_group' is true and it is the number of
-        * extents used by a swapfile for this block group ('ptr' field).
-        */
-       int bg_extent_count;
-};
-
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
 enum {
        BTRFS_FS_CLOSING_START,
        BTRFS_FS_CLOSING_DONE,
@@ -890,6 +840,7 @@ struct btrfs_fs_info {
 
        struct kobject *space_info_kobj;
        struct kobject *qgroups_kobj;
+       struct kobject *discard_kobj;
 
        /* used to keep from writing metadata until there is a nice batch */
        struct percpu_counter dirty_metadata_bytes;
@@ -1005,6 +956,7 @@ struct btrfs_fs_info {
        struct completion qgroup_rescan_completion;
        struct btrfs_work qgroup_rescan_work;
        bool qgroup_rescan_running;     /* protected by qgroup_rescan_lock */
+       u8 qgroup_drop_subtree_thres;
 
        /* filesystem state */
        unsigned long fs_state;
@@ -1092,6 +1044,23 @@ struct btrfs_fs_info {
        /* Updates are not protected by any lock */
        struct btrfs_commit_stats commit_stats;
 
+       /*
+        * Last generation where we dropped a non-relocation root.
+        * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
+        * to change it and to read it, respectively.
+        */
+       u64 last_root_drop_gen;
+
+       /*
+        * Annotations for transaction events (structures are empty when
+        * compiled without lockdep).
+        */
+       struct lockdep_map btrfs_trans_num_writers_map;
+       struct lockdep_map btrfs_trans_num_extwriters_map;
+       struct lockdep_map btrfs_state_change_map[4];
+       struct lockdep_map btrfs_trans_pending_ordered_map;
+       struct lockdep_map btrfs_ordered_extent_map;
+
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
        spinlock_t ref_verify_lock;
        struct rb_root block_tree;
@@ -1099,7 +1068,6 @@ struct btrfs_fs_info {
 
 #ifdef CONFIG_BTRFS_DEBUG
        struct kobject *debug_kobj;
-       struct kobject *discard_debug_kobj;
        struct list_head allocated_roots;
 
        spinlock_t eb_leak_lock;
@@ -1107,12 +1075,85 @@ struct btrfs_fs_info {
 #endif
 };
 
+static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
+                                               u64 gen)
+{
+       WRITE_ONCE(fs_info->last_root_drop_gen, gen);
+}
+
+static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
+{
+       return READ_ONCE(fs_info->last_root_drop_gen);
+}
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
        return sb->s_fs_info;
 }
 
 /*
+ * Take the number of bytes to be checksummed and figure out how many leaves
+ * it would require to store the csums for that many bytes.
+ */
+static inline u64 btrfs_csum_bytes_to_leaves(
+                       const struct btrfs_fs_info *fs_info, u64 csum_bytes)
+{
+       const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
+
+       return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
+}
+
+/*
+ * Use this if we would be adding new items, as we could split nodes as we cow
+ * down the tree.
+ */
+static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
+                                                 unsigned num_items)
+{
+       return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
+}
+
+/*
+ * Doing a truncate or a modification won't result in new nodes or leaves, just
+ * what we need for COW.
+ */
+static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
+                                                unsigned num_items)
+{
+       return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
+}
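
    For a concrete sense of the numbers these helpers produce, a minimal illustrative
    sketch follows (not part of the diff; it assumes the mkfs defaults of a 16 KiB
    nodesize, 4 KiB sectors and crc32c checksums, and the example_* names are
    hypothetical):

    /* Illustrative only: metadata reserved for inserting a single new item. */
    static u64 example_insert_rsv(struct btrfs_fs_info *fs_info)
    {
            /* 16K nodesize * BTRFS_MAX_LEVEL (8) * 2 = 256 KiB per item. */
            return btrfs_calc_insert_metadata_size(fs_info, 1);
    }

    /* Illustrative only: leaves needed to store csums for 1 MiB of data. */
    static u64 example_csum_leaves(const struct btrfs_fs_info *fs_info)
    {
            /* 256 csums of 4 bytes each easily fit in one leaf, so this is 1. */
            return btrfs_csum_bytes_to_leaves(fs_info, SZ_1M);
    }
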
+
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
+                                       sizeof(struct btrfs_item))
+
+static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
+{
+       return fs_info->zone_size > 0;
+}
+
+/*
+ * Count how many extents of size fs_info->max_extent_size are needed to cover @size
+ */
+static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
+{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+       if (!fs_info)
+               return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+#endif
+
+       return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
+}
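
    A quick worked example for the helper above (illustrative only;
    BTRFS_MAX_EXTENT_SIZE is 128 MiB, and fs_info->max_extent_size may be smaller
    on zoned devices):

    /* Illustrative only: a 300 MiB delalloc range against a 128 MiB cap. */
    static u32 example_extent_count(struct btrfs_fs_info *fs_info)
    {
            /* DIV_ROUND_UP(300 MiB, 128 MiB) = 3 extents worth of accounting. */
            return count_max_extents(fs_info, 300ULL * SZ_1M);
    }
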
+
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+                       enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+                                enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
+void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
+                         enum btrfs_exclusive_operation op);
+
+/*
  * The state of btrfs root
  */
 enum {
@@ -1174,6 +1215,82 @@ enum {
        BTRFS_ROOT_RESET_LOCKDEP_CLASS,
 };
 
+enum btrfs_lockdep_trans_states {
+       BTRFS_LOCKDEP_TRANS_COMMIT_START,
+       BTRFS_LOCKDEP_TRANS_UNBLOCKED,
+       BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
+       BTRFS_LOCKDEP_TRANS_COMPLETED,
+};
+
+/*
+ * Lockdep annotation for wait events.
+ *
+ * @owner:  The struct where the lockdep map is defined
+ * @lock:   The lockdep map corresponding to a wait event
+ *
+ * This macro is used to annotate a wait event. In this case a thread acquires
+ * the lockdep map as writer (exclusive lock) because it has to block until all
+ * the threads that hold the lock as readers signal the condition for the wait
+ * event and release their locks.
+ */
+#define btrfs_might_wait_for_event(owner, lock)                                        \
+       do {                                                                    \
+               rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_);             \
+               rwsem_release(&owner->lock##_map, _THIS_IP_);                   \
+       } while (0)
+
+/*
+ * Protection for the resource/condition of a wait event.
+ *
+ * @owner:  The struct where the lockdep map is defined
+ * @lock:   The lockdep map corresponding to a wait event
+ *
+ * Many threads can modify the condition for the wait event at the same time
+ * and signal the threads that block on the wait event. The threads that modify
+ * the condition and do the signaling acquire the lock as readers (shared
+ * lock).
+ */
+#define btrfs_lockdep_acquire(owner, lock)                                     \
+       rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
+
+/*
+ * Used after signaling the condition for a wait event to release the lockdep
+ * map held by a reader thread.
+ */
+#define btrfs_lockdep_release(owner, lock)                                     \
+       rwsem_release(&owner->lock##_map, _THIS_IP_)
+
+/*
+ * Macros for the transaction states wait events, similar to the generic wait
+ * event macros.
+ */
+#define btrfs_might_wait_for_state(owner, i)                                   \
+       do {                                                                    \
+               rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
+               rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_);    \
+       } while (0)
+
+#define btrfs_trans_state_lockdep_acquire(owner, i)                            \
+       rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
+
+#define btrfs_trans_state_lockdep_release(owner, i)                            \
+       rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
+
+/* Initialization of the lockdep map */
+#define btrfs_lockdep_init_map(owner, lock)                                    \
+       do {                                                                    \
+               static struct lock_class_key lock##_key;                        \
+               lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0);    \
+       } while (0)
+
+/* Initialization of the transaction states lockdep maps. */
+#define btrfs_state_lockdep_init_map(owner, lock, state)                       \
+       do {                                                                    \
+               static struct lock_class_key lock##_key;                        \
+               lockdep_init_map(&owner->btrfs_state_change_map[state], #lock,  \
+                                &lock##_key, 0);                               \
+       } while (0)
+
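
    The pattern these macros encode, shown as a hedged sketch of the transaction
    writer wait (the real call sites live in transaction.c; cur_trans, num_writers
    and writer_wait are existing btrfs_transaction fields, while the example_*
    wrappers are hypothetical):

    /* Signaling side: the thread joined the transaction earlier with
     * btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); it drops the
     * annotation just before changing the condition and waking the waiter. */
    static void example_writer_done(struct btrfs_fs_info *fs_info,
                                    struct btrfs_transaction *cur_trans)
    {
            btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
            if (atomic_dec_and_test(&cur_trans->num_writers))
                    wake_up(&cur_trans->writer_wait);
    }

    /* Waiting side: the write acquire/release pair makes lockdep warn if this
     * wait could deadlock against anything a writer thread still holds. */
    static void example_wait_writers(struct btrfs_fs_info *fs_info,
                                     struct btrfs_transaction *cur_trans)
    {
            btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
            wait_event(cur_trans->writer_wait,
                       atomic_read(&cur_trans->num_writers) == 1);
    }

    The maps themselves are set up once per fs_info with btrfs_lockdep_init_map(),
    as the disk-io.c hunk near the end of this diff shows.
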
 static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
 {
        clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
@@ -2391,17 +2508,6 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                   num_devices, 64);
 
-/*
- * For extent tree v2 we overload the extent root with the block group root, as
- * we will have multiple extent roots.
- */
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
-                        extent_root, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
-                        extent_root_gen, 64);
-BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
-                        struct btrfs_root_backup, extent_root_level, 8);
-
 /* struct btrfs_balance_item */
 BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
 
@@ -2534,13 +2640,6 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
 BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
                         uuid_tree_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
-                        block_group_root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
-                        struct btrfs_super_block,
-                        block_group_root_generation, 64);
-BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
-                        block_group_root_level, 8);
 
 int btrfs_super_csum_size(const struct btrfs_super_block *s);
 const char *btrfs_super_csum_name(u16 csum_type);
@@ -2761,45 +2860,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
                                     enum btrfs_inline_ref_type is_data);
 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
 
-static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums,
-                                u64 offset)
-{
-       u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
-
-       return csums + offset_in_sectors * fs_info->csum_size;
-}
-
-/*
- * Take the number of bytes to be checksummed and figure out how many leaves
- * it would require to store the csums for that many bytes.
- */
-static inline u64 btrfs_csum_bytes_to_leaves(
-                       const struct btrfs_fs_info *fs_info, u64 csum_bytes)
-{
-       const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
-
-       return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
-}
-
-/*
- * Use this if we would be adding new items, as we could split nodes as we cow
- * down the tree.
- */
-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
-                                                 unsigned num_items)
-{
-       return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
-}
-
-/*
- * Doing a truncate or a modification won't result in new nodes or leaves, just
- * what we need for COW.
- */
-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
-                                                unsigned num_items)
-{
-       return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
-}
 
 int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
                              u64 start, u64 num_bytes);
@@ -3257,12 +3317,9 @@ int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, u64 bytenr, u64 len);
 blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root,
-                            u64 objectid, u64 pos,
-                            u64 disk_offset, u64 disk_num_bytes,
-                            u64 num_bytes, u64 offset, u64 ram_bytes,
-                            u8 compression, u8 encryption, u16 other_encoding);
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 objectid, u64 pos,
+                            u64 num_bytes);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid,
@@ -3273,7 +3330,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
                                u64 offset, bool one_ordered);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
-                            struct list_head *list, int search_commit);
+                            struct list_head *list, int search_commit,
+                            bool nowait);
 void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
                                     const struct btrfs_path *path,
                                     struct btrfs_file_extent_item *fi,
@@ -3299,11 +3357,9 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
                                    u64 start, u64 end);
 int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
                          u32 bio_offset, struct page *page, u32 pgoff);
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
-                                          u64 start, u64 len);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
                              u64 *orig_start, u64 *orig_block_len,
-                             u64 *ram_bytes, bool strict);
+                             u64 *ram_bytes, bool nowait, bool strict);
 
 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
                                struct btrfs_inode *inode);
@@ -3358,7 +3414,6 @@ void btrfs_split_delalloc_extent(struct inode *inode,
 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
 void btrfs_evict_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 void btrfs_free_inode(struct inode *inode);
@@ -3439,15 +3494,6 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
                                struct btrfs_ioctl_space_info *space);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
                               struct btrfs_ioctl_balance_args *bargs);
-bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
-                       enum btrfs_exclusive_operation type);
-bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
-                                enum btrfs_exclusive_operation type);
-void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
-void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
-                         enum btrfs_exclusive_operation op);
-
 
 /* file.c */
 int __init btrfs_auto_defrag_init(void);
@@ -3457,8 +3503,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
-                            int skip_pinned);
 extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_inode *inode,
@@ -3478,8 +3522,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
                      struct extent_state **cached, bool noreserve);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
-                          size_t *write_bytes);
+                          size_t *write_bytes, bool nowait);
 void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+                                 u64 *delalloc_start_ret, u64 *delalloc_end_ret);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -3745,7 +3791,7 @@ const char * __attribute_const__ btrfs_decode_error(int errno);
 __cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                               const char *function,
-                              unsigned int line, int errno);
+                              unsigned int line, int errno, bool first_hit);
 
 /*
  * Call btrfs_abort_transaction as early as possible when an error condition is
@@ -3753,9 +3799,11 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
  */
 #define btrfs_abort_transaction(trans, errno)          \
 do {                                                           \
+       bool first = false;                                     \
        /* Report first abort since mount */                    \
        if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,     \
                        &((trans)->fs_info->fs_state))) {       \
+               first = true;                                   \
                if ((errno) != -EIO && (errno) != -EROFS) {             \
                        WARN(1, KERN_DEBUG                              \
                        "BTRFS: Transaction aborted (error %d)\n",      \
@@ -3767,7 +3815,7 @@ do {                                                              \
                }                                               \
        }                                                       \
        __btrfs_abort_transaction((trans), __func__,            \
-                                 __LINE__, (errno));           \
+                                 __LINE__, (errno), first);    \
 } while (0)
 
 #ifdef CONFIG_PRINTK_INDEX
@@ -3984,16 +4032,9 @@ int btrfs_scrub_cancel(struct btrfs_fs_info *info);
 int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress);
-static inline void btrfs_init_full_stripe_locks_tree(
-                       struct btrfs_full_stripe_locks_tree *locks_root)
-{
-       locks_root->root = RB_ROOT;
-       mutex_init(&locks_root->lock);
-}
 
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
 
 static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
@@ -4020,6 +4061,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
 
 extern const struct fsverity_operations btrfs_verityops;
 int btrfs_drop_verity_items(struct btrfs_inode *inode);
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size);
 
 BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item,
                   encryption, 8);
@@ -4037,6 +4079,12 @@ static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
        return 0;
 }
 
+static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
+                                             size_t buf_size)
+{
+       return -EPERM;
+}
+
 #endif
 
 /* Sanity test specific functions */
@@ -4053,24 +4101,6 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
 }
 #endif
 
-static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
-{
-       return fs_info->zone_size > 0;
-}
-
-/*
- * Count how many fs_info->max_extent_size cover the @size
- */
-static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
-{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-       if (!fs_info)
-               return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
-#endif
-
-       return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
-}
-
 static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
 {
        return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
index 1e8f17f..118b2e2 100644 (file)
@@ -127,9 +127,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 }
 
 int btrfs_check_data_free_space(struct btrfs_inode *inode,
-                       struct extent_changeset **reserved, u64 start, u64 len)
+                               struct extent_changeset **reserved, u64 start,
+                               u64 len, bool noflush)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
        int ret;
 
        /* align the range */
@@ -137,7 +139,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
              round_down(start, fs_info->sectorsize);
        start = round_down(start, fs_info->sectorsize);
 
-       ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+       if (noflush)
+               flush = BTRFS_RESERVE_NO_FLUSH;
+       else if (btrfs_is_free_space_inode(inode))
+               flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
+
+       ret = btrfs_reserve_data_bytes(fs_info, len, flush);
        if (ret < 0)
                return ret;
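
    A hedged sketch of how the new noflush argument is meant to be used by a
    nowait writer (the wrapper below is hypothetical; the actual io_uring/nowait
    call sites are elsewhere in this series):

    /* Illustrative only: reserve data space without blocking on flushing. */
    static int example_reserve_nowait(struct btrfs_inode *inode,
                                      struct extent_changeset **reserved,
                                      u64 pos, u64 len)
    {
            /* noflush == true selects BTRFS_RESERVE_NO_FLUSH above. */
            return btrfs_check_data_free_space(inode, reserved, pos, len, true);
    }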
 
@@ -454,7 +461,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
 {
        int ret;
 
-       ret = btrfs_check_data_free_space(inode, reserved, start, len);
+       ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
        if (ret < 0)
                return ret;
        ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
index 28bf5c3..e07d460 100644 (file)
@@ -7,7 +7,8 @@ struct extent_changeset;
 
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
 int btrfs_check_data_free_space(struct btrfs_inode *inode,
-                       struct extent_changeset **reserved, u64 start, u64 len);
+                       struct extent_changeset **reserved, u64 start, u64 len,
+                       bool noflush);
 void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
                        struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct btrfs_inode *inode,
index e7f3487..cac5169 100644 (file)
@@ -302,15 +302,21 @@ static inline void btrfs_release_prepared_delayed_node(
        __btrfs_release_delayed_node(node, 1);
 }
 
-static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
+                                          struct btrfs_delayed_node *node,
+                                          enum btrfs_delayed_item_type type)
 {
        struct btrfs_delayed_item *item;
+
        item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
        if (item) {
                item->data_len = data_len;
-               item->ins_or_del = 0;
+               item->type = type;
                item->bytes_reserved = 0;
-               item->delayed_node = NULL;
+               item->delayed_node = node;
+               RB_CLEAR_NODE(&item->rb_node);
+               INIT_LIST_HEAD(&item->log_list);
+               item->logged = false;
                refcount_set(&item->refs, 1);
        }
        return item;
@@ -319,72 +325,32 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
 /*
  * __btrfs_lookup_delayed_item - look up the delayed item by key
  * @delayed_node: pointer to the delayed node
- * @key:         the key to look up
- * @prev:        used to store the prev item if the right item isn't found
- * @next:        used to store the next item if the right item isn't found
+ * @index:       the dir index value to look up (offset of a dir index key)
 *
 * Note: if no delayed item with the given @index exists, NULL is returned.
  */
 static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
                                struct rb_root *root,
-                               struct btrfs_key *key,
-                               struct btrfs_delayed_item **prev,
-                               struct btrfs_delayed_item **next)
+                               u64 index)
 {
-       struct rb_node *node, *prev_node = NULL;
+       struct rb_node *node = root->rb_node;
        struct btrfs_delayed_item *delayed_item = NULL;
-       int ret = 0;
-
-       node = root->rb_node;
 
        while (node) {
                delayed_item = rb_entry(node, struct btrfs_delayed_item,
                                        rb_node);
-               prev_node = node;
-               ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
-               if (ret < 0)
+               if (delayed_item->index < index)
                        node = node->rb_right;
-               else if (ret > 0)
+               else if (delayed_item->index > index)
                        node = node->rb_left;
                else
                        return delayed_item;
        }
 
-       if (prev) {
-               if (!prev_node)
-                       *prev = NULL;
-               else if (ret < 0)
-                       *prev = delayed_item;
-               else if ((node = rb_prev(prev_node)) != NULL) {
-                       *prev = rb_entry(node, struct btrfs_delayed_item,
-                                        rb_node);
-               } else
-                       *prev = NULL;
-       }
-
-       if (next) {
-               if (!prev_node)
-                       *next = NULL;
-               else if (ret > 0)
-                       *next = delayed_item;
-               else if ((node = rb_next(prev_node)) != NULL) {
-                       *next = rb_entry(node, struct btrfs_delayed_item,
-                                        rb_node);
-               } else
-                       *next = NULL;
-       }
        return NULL;
 }
 
-static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
-                                       struct btrfs_delayed_node *delayed_node,
-                                       struct btrfs_key *key)
-{
-       return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
-                                          NULL, NULL);
-}
-
 static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
                                    struct btrfs_delayed_item *ins)
 {
@@ -392,15 +358,13 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
        struct rb_node *parent_node = NULL;
        struct rb_root_cached *root;
        struct btrfs_delayed_item *item;
-       int cmp;
        bool leftmost = true;
 
-       if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+       if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
                root = &delayed_node->ins_root;
-       else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
-               root = &delayed_node->del_root;
        else
-               BUG();
+               root = &delayed_node->del_root;
+
        p = &root->rb_root.rb_node;
        node = &ins->rb_node;
 
@@ -409,11 +373,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
                item = rb_entry(parent_node, struct btrfs_delayed_item,
                                 rb_node);
 
-               cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
-               if (cmp < 0) {
+               if (item->index < ins->index) {
                        p = &(*p)->rb_right;
                        leftmost = false;
-               } else if (cmp > 0) {
+               } else if (item->index > ins->index) {
                        p = &(*p)->rb_left;
                } else {
                        return -EEXIST;
@@ -422,14 +385,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
 
        rb_link_node(node, parent_node, p);
        rb_insert_color_cached(node, root, leftmost);
-       ins->delayed_node = delayed_node;
-
-       /* Delayed items are always for dir index items. */
-       ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY);
 
-       if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM &&
-           ins->key.offset >= delayed_node->index_cnt)
-               delayed_node->index_cnt = ins->key.offset + 1;
+       if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
+           ins->index >= delayed_node->index_cnt)
+               delayed_node->index_cnt = ins->index + 1;
 
        delayed_node->count++;
        atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
@@ -451,21 +410,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
        struct rb_root_cached *root;
        struct btrfs_delayed_root *delayed_root;
 
-       /* Not associated with any delayed_node */
-       if (!delayed_item->delayed_node)
+       /* Not inserted, ignore it. */
+       if (RB_EMPTY_NODE(&delayed_item->rb_node))
                return;
+
        delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
 
        BUG_ON(!delayed_root);
-       BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
-              delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
 
-       if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+       if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
                root = &delayed_item->delayed_node->ins_root;
        else
                root = &delayed_item->delayed_node->del_root;
 
        rb_erase_cached(&delayed_item->rb_node, root);
+       RB_CLEAR_NODE(&delayed_item->rb_node);
        delayed_item->delayed_node->count--;
 
        finish_one_item(delayed_root);
@@ -520,12 +479,11 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
 }
 
 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
-                                              struct btrfs_root *root,
                                               struct btrfs_delayed_item *item)
 {
        struct btrfs_block_rsv *src_rsv;
        struct btrfs_block_rsv *dst_rsv;
-       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        u64 num_bytes;
        int ret;
 
@@ -545,14 +503,14 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
        ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
        if (!ret) {
                trace_btrfs_space_reservation(fs_info, "delayed_item",
-                                             item->key.objectid,
+                                             item->delayed_node->inode_id,
                                              num_bytes, 1);
                /*
                 * For insertions we track reserved metadata space by accounting
                 * for the number of leaves that will be used, based on the delayed
                 * node's index_items_size field.
                 */
-               if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM)
+               if (item->type == BTRFS_DELAYED_DELETION_ITEM)
                        item->bytes_reserved = num_bytes;
        }
 
@@ -574,8 +532,8 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
         * to release/reserve qgroup space.
         */
        trace_btrfs_space_reservation(fs_info, "delayed_item",
-                                     item->key.objectid, item->bytes_reserved,
-                                     0);
+                                     item->delayed_node->inode_id,
+                                     item->bytes_reserved, 0);
        btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
 }
 
@@ -688,6 +646,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_item *next;
        const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
        struct btrfs_item_batch batch;
+       struct btrfs_key first_key;
+       const u32 first_data_size = first_item->data_len;
        int total_size;
        char *ins_data = NULL;
        int ret;
@@ -716,9 +676,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
        ASSERT(first_item->bytes_reserved == 0);
 
        list_add_tail(&first_item->tree_list, &item_list);
-       batch.total_data_size = first_item->data_len;
+       batch.total_data_size = first_data_size;
        batch.nr = 1;
-       total_size = first_item->data_len + sizeof(struct btrfs_item);
+       total_size = first_data_size + sizeof(struct btrfs_item);
        curr = first_item;
 
        while (true) {
@@ -732,8 +692,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                 * We cannot allow gaps in the key space if we're doing log
                 * replay.
                 */
-               if (continuous_keys_only &&
-                   (next->key.offset != curr->key.offset + 1))
+               if (continuous_keys_only && (next->index != curr->index + 1))
                        break;
 
                ASSERT(next->bytes_reserved == 0);
@@ -750,8 +709,11 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
        }
 
        if (batch.nr == 1) {
-               batch.keys = &first_item->key;
-               batch.data_sizes = &first_item->data_len;
+               first_key.objectid = node->inode_id;
+               first_key.type = BTRFS_DIR_INDEX_KEY;
+               first_key.offset = first_item->index;
+               batch.keys = &first_key;
+               batch.data_sizes = &first_data_size;
        } else {
                struct btrfs_key *ins_keys;
                u32 *ins_sizes;
@@ -768,7 +730,9 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
                batch.keys = ins_keys;
                batch.data_sizes = ins_sizes;
                list_for_each_entry(curr, &item_list, tree_list) {
-                       ins_keys[i] = curr->key;
+                       ins_keys[i].objectid = node->inode_id;
+                       ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
+                       ins_keys[i].offset = curr->index;
                        ins_sizes[i] = curr->data_len;
                        i++;
                }
@@ -864,6 +828,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
                                    struct btrfs_path *path,
                                    struct btrfs_delayed_item *item)
 {
+       const u64 ino = item->delayed_node->inode_id;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_delayed_item *curr, *next;
        struct extent_buffer *leaf = path->nodes[0];
@@ -902,7 +867,9 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
 
                slot++;
                btrfs_item_key_to_cpu(leaf, &key, slot);
-               if (btrfs_comp_cpu_keys(&next->key, &key) != 0)
+               if (key.objectid != ino ||
+                   key.type != BTRFS_DIR_INDEX_KEY ||
+                   key.offset != next->index)
                        break;
                nitems++;
                curr = next;
@@ -920,9 +887,8 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
                 * Check btrfs_delayed_item_reserve_metadata() to see why we
                 * don't need to release/reserve qgroup space.
                 */
-               trace_btrfs_space_reservation(fs_info, "delayed_item",
-                                             item->key.objectid, total_reserved_size,
-                                             0);
+               trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
+                                             total_reserved_size, 0);
                btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
                                        total_reserved_size, NULL);
        }
@@ -940,8 +906,12 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      struct btrfs_delayed_node *node)
 {
+       struct btrfs_key key;
        int ret = 0;
 
+       key.objectid = node->inode_id;
+       key.type = BTRFS_DIR_INDEX_KEY;
+
        while (ret == 0) {
                struct btrfs_delayed_item *item;
 
@@ -952,7 +922,8 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
                        break;
                }
 
-               ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1);
+               key.offset = item->index;
+               ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0) {
                        /*
                         * There's no matching item in the leaf. This means we
@@ -1457,16 +1428,15 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
        if (IS_ERR(delayed_node))
                return PTR_ERR(delayed_node);
 
-       delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+       delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
+                                               delayed_node,
+                                               BTRFS_DELAYED_INSERTION_ITEM);
        if (!delayed_item) {
                ret = -ENOMEM;
                goto release_node;
        }
 
-       delayed_item->key.objectid = btrfs_ino(dir);
-       delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
-       delayed_item->key.offset = index;
-       delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM;
+       delayed_item->index = index;
 
        dir_item = (struct btrfs_dir_item *)delayed_item->data;
        dir_item->location = *disk_key;
@@ -1490,8 +1460,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
        }
 
        if (reserve_leaf_space) {
-               ret = btrfs_delayed_item_reserve_metadata(trans, dir->root,
-                                                         delayed_item);
+               ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
                /*
                 * Space was reserved for a dir index item insertion when we
                 * started the transaction, so getting a failure here should be
@@ -1538,12 +1507,12 @@ release_node:
 
 static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
                                               struct btrfs_delayed_node *node,
-                                              struct btrfs_key *key)
+                                              u64 index)
 {
        struct btrfs_delayed_item *item;
 
        mutex_lock(&node->mutex);
-       item = __btrfs_lookup_delayed_insertion_item(node, key);
+       item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
        if (!item) {
                mutex_unlock(&node->mutex);
                return 1;
@@ -1589,32 +1558,25 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
 {
        struct btrfs_delayed_node *node;
        struct btrfs_delayed_item *item;
-       struct btrfs_key item_key;
        int ret;
 
        node = btrfs_get_or_create_delayed_node(dir);
        if (IS_ERR(node))
                return PTR_ERR(node);
 
-       item_key.objectid = btrfs_ino(dir);
-       item_key.type = BTRFS_DIR_INDEX_KEY;
-       item_key.offset = index;
-
-       ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
-                                                 &item_key);
+       ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
        if (!ret)
                goto end;
 
-       item = btrfs_alloc_delayed_item(0);
+       item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
        if (!item) {
                ret = -ENOMEM;
                goto end;
        }
 
-       item->key = item_key;
-       item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM;
+       item->index = index;
 
-       ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
+       ret = btrfs_delayed_item_reserve_metadata(trans, item);
        /*
         * we have reserved enough space when we start a new transaction,
         * so reserving metadata failure is impossible.
@@ -1743,9 +1705,9 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
        int ret = 0;
 
        list_for_each_entry(curr, del_list, readdir_list) {
-               if (curr->key.offset > index)
+               if (curr->index > index)
                        break;
-               if (curr->key.offset == index) {
+               if (curr->index == index) {
                        ret = 1;
                        break;
                }
@@ -1779,13 +1741,13 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
        list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
                list_del(&curr->readdir_list);
 
-               if (curr->key.offset < ctx->pos) {
+               if (curr->index < ctx->pos) {
                        if (refcount_dec_and_test(&curr->refs))
                                kfree(curr);
                        continue;
                }
 
-               ctx->pos = curr->key.offset;
+               ctx->pos = curr->index;
 
                di = (struct btrfs_dir_item *)curr->data;
                name = (char *)(di + 1);
@@ -2085,3 +2047,113 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
        }
 }
 
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+                                struct list_head *ins_list,
+                                struct list_head *del_list)
+{
+       struct btrfs_delayed_node *node;
+       struct btrfs_delayed_item *item;
+
+       node = btrfs_get_delayed_node(inode);
+       if (!node)
+               return;
+
+       mutex_lock(&node->mutex);
+       item = __btrfs_first_delayed_insertion_item(node);
+       while (item) {
+               /*
+                * It's possible that the item is already in a log list. This
+                * can happen in case two tasks are trying to log the same
+                * directory. For example if we have tasks A and task B:
+                *
+                * Task A collected the delayed items into a log list while
+                * under the inode's log_mutex (at btrfs_log_inode()), but it
+                * only releases the items after logging the inodes they point
+                * to (if they are new inodes), which happens after unlocking
+                * the log mutex;
+                *
+                * Task B enters btrfs_log_inode() and acquires the log_mutex
+                * of the same directory inode, before task B releases the
+                * delayed items. This can happen for example when logging some
+                * inode we need to trigger logging of its parent directory, so
+                * logging two files that have the same parent directory can
+                * lead to this.
+                *
+                * If this happens, just ignore delayed items already in a log
+                * list. All the tasks logging the directory are under a log
+                * transaction and whichever finishes first can not sync the log
+                * before the other completes and leaves the log transaction.
+                */
+               if (!item->logged && list_empty(&item->log_list)) {
+                       refcount_inc(&item->refs);
+                       list_add_tail(&item->log_list, ins_list);
+               }
+               item = __btrfs_next_delayed_item(item);
+       }
+
+       item = __btrfs_first_delayed_deletion_item(node);
+       while (item) {
+               /* It may be non-empty, for the same reason mentioned above. */
+               if (!item->logged && list_empty(&item->log_list)) {
+                       refcount_inc(&item->refs);
+                       list_add_tail(&item->log_list, del_list);
+               }
+               item = __btrfs_next_delayed_item(item);
+       }
+       mutex_unlock(&node->mutex);
+
+       /*
+        * We are called during inode logging, which means the inode is in use
+        * and can not be evicted before we finish logging the inode. So we never
+        * have the last reference on the delayed inode.
+        * Also, we don't use btrfs_release_delayed_node() because that would
+        * requeue the delayed inode (change its order in the list of prepared
+        * nodes) and we don't want to do such change because we don't create or
+        * delete delayed items.
+        */
+       ASSERT(refcount_read(&node->refs) > 1);
+       refcount_dec(&node->refs);
+}
+
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+                                struct list_head *ins_list,
+                                struct list_head *del_list)
+{
+       struct btrfs_delayed_node *node;
+       struct btrfs_delayed_item *item;
+       struct btrfs_delayed_item *next;
+
+       node = btrfs_get_delayed_node(inode);
+       if (!node)
+               return;
+
+       mutex_lock(&node->mutex);
+
+       list_for_each_entry_safe(item, next, ins_list, log_list) {
+               item->logged = true;
+               list_del_init(&item->log_list);
+               if (refcount_dec_and_test(&item->refs))
+                       kfree(item);
+       }
+
+       list_for_each_entry_safe(item, next, del_list, log_list) {
+               item->logged = true;
+               list_del_init(&item->log_list);
+               if (refcount_dec_and_test(&item->refs))
+                       kfree(item);
+       }
+
+       mutex_unlock(&node->mutex);
+
+       /*
+        * We are called during inode logging, which means the inode is in use
+        * and can not be evicted before we finish logging the inode. So we never
+        * have the last reference on the delayed inode.
+        * Also, we don't use btrfs_release_delayed_node() because that would
+        * requeue the delayed inode (change its order in the list of prepared
+        * nodes) and we don't want to do such change because we don't create or
+        * delete delayed items.
+        */
+       ASSERT(refcount_read(&node->refs) > 1);
+       refcount_dec(&node->refs);
+}
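
    A minimal sketch of how a directory-logging caller is expected to pair these
    two helpers (hypothetical function; the real caller in tree-log.c keeps the
    lists in its logging context):

    /* Illustrative only: snapshot the delayed dir index items of an inode,
     * log them, then mark them as logged and drop the references. */
    static void example_log_dir_delayed_items(struct btrfs_inode *inode)
    {
            LIST_HEAD(ins_list);
            LIST_HEAD(del_list);

            btrfs_log_get_delayed_items(inode, &ins_list, &del_list);
            /* ... copy the collected items into the log tree ... */
            btrfs_log_put_delayed_items(inode, &ins_list, &del_list);
    }
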
index 9795dc2..0163ca6 100644 (file)
 #include <linux/refcount.h>
 #include "ctree.h"
 
-/* types of the delayed item */
-#define BTRFS_DELAYED_INSERTION_ITEM   1
-#define BTRFS_DELAYED_DELETION_ITEM    2
+enum btrfs_delayed_item_type {
+       BTRFS_DELAYED_INSERTION_ITEM,
+       BTRFS_DELAYED_DELETION_ITEM
+};
 
 struct btrfs_delayed_root {
        spinlock_t lock;
@@ -73,14 +74,27 @@ struct btrfs_delayed_node {
 
 struct btrfs_delayed_item {
        struct rb_node rb_node;
-       struct btrfs_key key;
+       /* Offset value of the corresponding dir index key. */
+       u64 index;
        struct list_head tree_list;     /* used for batch insert/delete items */
        struct list_head readdir_list;  /* used for readdir items */
+       /*
+        * Used when logging a directory.
+        * Insertions and deletions to this list are protected by the parent
+        * delayed node's mutex.
+        */
+       struct list_head log_list;
        u64 bytes_reserved;
        struct btrfs_delayed_node *delayed_node;
        refcount_t refs;
-       int ins_or_del;
-       u32 data_len;
+       enum btrfs_delayed_item_type type:8;
+       /*
+        * Track if this delayed item was already logged.
+        * Protected by the mutex of the parent delayed inode.
+        */
+       bool logged;
+       /* The maximum leaf size is 64K, so u16 is more than enough. */
+       u16 data_len;
        char data[];
 };
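
    Since the item now stores only the dir index value, the full key is
    reconstructed from the owning delayed node when needed, exactly as the insert
    and delete paths earlier in this diff do; a minimal sketch (the helper name is
    hypothetical):

    /* Illustrative only: rebuild the on-disk key for a delayed dir index item. */
    static void example_delayed_item_key(const struct btrfs_delayed_item *item,
                                         struct btrfs_key *key)
    {
            key->objectid = item->delayed_node->inode_id;
            key->type = BTRFS_DIR_INDEX_KEY;
            key->offset = item->index;
    }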
 
@@ -144,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
                                    struct list_head *ins_list);
 
+/* Used during directory logging. */
+void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
+                                struct list_head *ins_list,
+                                struct list_head *del_list);
+void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
+                                struct list_head *ins_list,
+                                struct list_head *del_list);
+
 /* for init */
 int __init btrfs_delayed_inode_init(void);
 void __cold btrfs_delayed_inode_exit(void);
index 41cddd3..61e5806 100644 (file)
@@ -545,10 +545,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
                if (!cache)
                        continue;
 
-               spin_lock(&cache->lock);
-               cache->to_copy = 1;
-               spin_unlock(&cache->lock);
-
+               set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
                btrfs_put_block_group(cache);
        }
        if (iter_ret < 0)
@@ -577,7 +574,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
                return true;
 
        spin_lock(&cache->lock);
-       if (cache->removed) {
+       if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
                spin_unlock(&cache->lock);
                return true;
        }
@@ -610,9 +607,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
        }
 
        /* Last stripe on this device */
-       spin_lock(&cache->lock);
-       cache->to_copy = 0;
-       spin_unlock(&cache->lock);
+       clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
 
        return true;
 }
@@ -1288,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
        return 1;
 }
 
-void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
-{
-       percpu_counter_inc(&fs_info->dev_replace.bio_counter);
-}
-
 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
        percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
index 3911049..6084b31 100644 (file)
@@ -7,6 +7,10 @@
 #define BTRFS_DEV_REPLACE_H
 
 struct btrfs_ioctl_dev_replace_args;
+struct btrfs_fs_info;
+struct btrfs_trans_handle;
+struct btrfs_dev_replace;
+struct btrfs_block_group;
 
 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
index 2633137..a2da931 100644 (file)
@@ -131,8 +131,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
        if (atomic)
                return -EAGAIN;
 
-       lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
-                        &cached_state);
+       lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
        if (extent_buffer_uptodate(eb) &&
            btrfs_header_generation(eb) == parent_transid) {
                ret = 0;
@@ -145,8 +144,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
        ret = 1;
        clear_extent_buffer_uptodate(eb);
 out:
-       unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
-                            &cached_state);
+       unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+                     &cached_state);
        return ret;
 }
 
@@ -647,16 +646,14 @@ static void run_one_async_start(struct btrfs_work *work)
  */
 static void run_one_async_done(struct btrfs_work *work)
 {
-       struct async_submit_bio *async;
-       struct inode *inode;
-
-       async = container_of(work, struct  async_submit_bio, work);
-       inode = async->inode;
+       struct async_submit_bio *async =
+               container_of(work, struct async_submit_bio, work);
+       struct inode *inode = async->inode;
+       struct btrfs_bio *bbio = btrfs_bio(async->bio);
 
        /* If an error occurred we just want to clean up the bio and move on */
        if (async->status) {
-               async->bio->bi_status = async->status;
-               bio_endio(async->bio);
+               btrfs_bio_end_io(bbio, async->status);
                return;
        }
 
@@ -757,6 +754,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
 void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_bio *bbio = btrfs_bio(bio);
        blk_status_t ret;
 
        bio->bi_opf |= REQ_META;
@@ -776,8 +774,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
 
        ret = btree_csum_one_bio(bio);
        if (ret) {
-               bio->bi_status = ret;
-               bio_endio(bio);
+               btrfs_bio_end_io(bbio, ret);
                return;
        }
 
@@ -1524,6 +1521,9 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
        if (objectid == BTRFS_UUID_TREE_OBJECTID)
                return btrfs_grab_root(fs_info->uuid_root) ?
                        fs_info->uuid_root : ERR_PTR(-ENOENT);
+       if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID)
+               return btrfs_grab_root(fs_info->block_group_root) ?
+                       fs_info->block_group_root : ERR_PTR(-ENOENT);
        if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
                struct btrfs_root *root = btrfs_global_root(fs_info, &key);
 
@@ -1980,14 +1980,7 @@ static void backup_super_roots(struct btrfs_fs_info *info)
        btrfs_set_backup_chunk_root_level(root_backup,
                               btrfs_header_level(info->chunk_root->node));
 
-       if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
-               btrfs_set_backup_block_group_root(root_backup,
-                                       info->block_group_root->node->start);
-               btrfs_set_backup_block_group_root_gen(root_backup,
-                       btrfs_header_generation(info->block_group_root->node));
-               btrfs_set_backup_block_group_root_level(root_backup,
-                       btrfs_header_level(info->block_group_root->node));
-       } else {
+       if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
                struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
                struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
 
@@ -2225,6 +2218,8 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
 {
        struct inode *inode = fs_info->btree_inode;
+       unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
+                                             fs_info->tree_root);
 
        inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
        set_nlink(inode, 1);
@@ -2238,8 +2233,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
 
        RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
        extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
-                           IO_TREE_BTREE_INODE_IO, inode);
-       BTRFS_I(inode)->io_tree.track_uptodate = false;
+                           IO_TREE_BTREE_INODE_IO, NULL);
        extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
 
        BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
@@ -2247,7 +2241,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
        BTRFS_I(inode)->location.type = 0;
        BTRFS_I(inode)->location.offset = 0;
        set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
-       btrfs_insert_inode_hash(inode);
+       __insert_inode_hash(inode, hash);
 }
 
 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2266,6 +2260,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
        fs_info->qgroup_seq = 1;
        fs_info->qgroup_ulist = NULL;
        fs_info->qgroup_rescan_running = false;
+       fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
        mutex_init(&fs_info->qgroup_rescan_lock);
 }
 
@@ -2529,10 +2524,24 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
        if (ret)
                return ret;
 
-       location.objectid = BTRFS_DEV_TREE_OBJECTID;
        location.type = BTRFS_ROOT_ITEM_KEY;
        location.offset = 0;
 
+       if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
+               location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
+               root = btrfs_read_tree_root(tree_root, &location);
+               if (IS_ERR(root)) {
+                       if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+                               ret = PTR_ERR(root);
+                               goto out;
+                       }
+               } else {
+                       set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+                       fs_info->block_group_root = root;
+               }
+       }
+
+       location.objectid = BTRFS_DEV_TREE_OBJECTID;
        root = btrfs_read_tree_root(tree_root, &location);
        if (IS_ERR(root)) {
                if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
@@ -2600,8 +2609,8 @@ out:
  *             1, 2    2nd and 3rd backup copy
  *            -1       skip bytenr check
  */
-static int validate_super(struct btrfs_fs_info *fs_info,
-                           struct btrfs_super_block *sb, int mirror_num)
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+                        struct btrfs_super_block *sb, int mirror_num)
 {
        u64 nodesize = btrfs_super_nodesize(sb);
        u64 sectorsize = btrfs_super_sectorsize(sb);
@@ -2703,6 +2712,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
                ret = -EINVAL;
        }
 
+       /*
+        * Artificial requirement for block-group-tree to force newer features
+        * (free-space-tree, no-holes) so the test matrix is smaller.
+        */
+       if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+           (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+            !btrfs_fs_incompat(fs_info, NO_HOLES))) {
+               btrfs_err(fs_info,
+               "block-group-tree feature requires free-space-tree and no-holes");
+               ret = -EINVAL;
+       }
+
        if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
                   BTRFS_FSID_SIZE) != 0) {
                btrfs_err(fs_info,
@@ -2785,7 +2806,7 @@ static int validate_super(struct btrfs_fs_info *fs_info,
  */
 static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
 {
-       return validate_super(fs_info, fs_info->super_copy, 0);
+       return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
 }
 
 /*
@@ -2799,7 +2820,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
 {
        int ret;
 
-       ret = validate_super(fs_info, sb, -1);
+       ret = btrfs_validate_super(fs_info, sb, -1);
        if (ret < 0)
                goto out;
        if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
@@ -2860,17 +2881,7 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
                btrfs_warn(fs_info, "couldn't read tree root");
                return ret;
        }
-
-       if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
-               return 0;
-
-       bytenr = btrfs_super_block_group_root(sb);
-       gen = btrfs_super_block_group_root_generation(sb);
-       level = btrfs_super_block_group_root_level(sb);
-       ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
-       if (ret)
-               btrfs_warn(fs_info, "couldn't read block group root");
-       return ret;
+       return 0;
 }
 
 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
@@ -2882,16 +2893,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
        int ret = 0;
        int i;
 
-       if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
-               struct btrfs_root *root;
-
-               root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
-                                       GFP_KERNEL);
-               if (!root)
-                       return -ENOMEM;
-               fs_info->block_group_root = root;
-       }
-
        for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
                if (handle_error) {
                        if (!IS_ERR(tree_root->node))
@@ -2990,6 +2991,19 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
        mutex_init(&fs_info->zoned_data_reloc_io_lock);
        seqlock_init(&fs_info->profiles_lock);
 
+       btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
+       btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
+       btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
+       btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
+       btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start,
+                                    BTRFS_LOCKDEP_TRANS_COMMIT_START);
+       btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
+                                    BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+       btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
+                                    BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+       btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
+                                    BTRFS_LOCKDEP_TRANS_COMPLETED);
+
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
        INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -3279,6 +3293,112 @@ out:
        return ret;
 }
 
+/*
+ * Do various sanity and dependency checks of different features.
+ *
+ * This is the place for less strict checks (like for subpage or artificial
+ * feature dependencies).
+ *
+ * For strict checks or possible corruption detection, see
+ * btrfs_validate_super().
+ *
+ * This should be called after btrfs_parse_options(), as some mount options
+ * (space cache related) can modify the on-disk format like the free space
+ * tree, and screw up certain feature dependencies.
+ */
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+{
+       struct btrfs_super_block *disk_super = fs_info->super_copy;
+       u64 incompat = btrfs_super_incompat_flags(disk_super);
+       const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
+       const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);
+
+       if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
+               btrfs_err(fs_info,
+               "cannot mount because of unknown incompat features (0x%llx)",
+                   incompat);
+               return -EINVAL;
+       }
+
+       /* Runtime limitation for mixed block groups. */
+       if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+           (fs_info->sectorsize != fs_info->nodesize)) {
+               btrfs_err(fs_info,
+"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
+                       fs_info->nodesize, fs_info->sectorsize);
+               return -EINVAL;
+       }
+
+       /* Mixed backref is an always-enabled feature. */
+       incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+
+       /* Set compression related flags just in case. */
+       if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
+               incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+       else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
+               incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
+
+       /*
+        * An ancient flag, which should really be marked deprecated.
+        * Such a runtime limitation doesn't really need an incompat flag.
+        */
+       if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
+               incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+
+       if (compat_ro_unsupp && !sb_rdonly(sb)) {
+               btrfs_err(fs_info,
+       "cannot mount read-write because of unknown compat_ro features (0x%llx)",
+                      compat_ro);
+               return -EINVAL;
+       }
+
+       /*
+        * If we have unsupported RO compat features, then even though we are
+        * mounted read-only we must not cause any metadata writes, including
+        * log replay, or we could screw up whatever the new feature requires.
+        */
+       if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
+           !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+               btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+                         compat_ro);
+               return -EINVAL;
+       }
+
+       /*
+        * Artificial limitations for block group tree, to force
+        * block-group-tree to rely on no-holes and free-space-tree.
+        */
+       if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
+           (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
+            !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
+               btrfs_err(fs_info,
+"block-group-tree feature requires no-holes and free-space-tree features");
+               return -EINVAL;
+       }
+
+       /*
+        * Subpage runtime limitation on v1 cache.
+        *
+        * V1 space cache still has some hard-coded PAGE_SIZE usage, and since
+        * we already default to the v2 cache there is no need to bother with
+        * v1, as it's going to be deprecated anyway.
+        */
+       if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+               btrfs_warn(fs_info,
+       "v1 space cache is not supported for page size %lu with sectorsize %u",
+                          PAGE_SIZE, fs_info->sectorsize);
+               return -EINVAL;
+       }
+
+       /* This can be called by remount, we need to protect the super block. */
+       spin_lock(&fs_info->super_lock);
+       btrfs_set_super_incompat_flags(disk_super, incompat);
+       spin_unlock(&fs_info->super_lock);
+
+       return 0;
+}
+
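
For illustration, the policy implemented above boils down to two bitmask tests: any incompat bit outside the supported mask rejects the mount outright, while unknown compat_ro bits only forbid a read-write mount. A minimal userspace sketch of that rule (the masks and values below are invented for the example, not btrfs's on-disk constants):

/* Illustrative sketch only; the masks and values below are made up. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FEATURE_INCOMPAT_SUPP   0x07ULL  /* hypothetical: incompat bits we understand */
#define FEATURE_COMPAT_RO_SUPP  0x03ULL  /* hypothetical: compat_ro bits we understand */

static int check_features(uint64_t incompat, uint64_t compat_ro, bool rdonly)
{
        uint64_t incompat_unsupp = incompat & ~FEATURE_INCOMPAT_SUPP;
        uint64_t compat_ro_unsupp = compat_ro & ~FEATURE_COMPAT_RO_SUPP;

        if (incompat_unsupp) {
                fprintf(stderr, "unknown incompat features (0x%llx)\n",
                        (unsigned long long)incompat_unsupp);
                return -1;              /* refuse the mount entirely */
        }
        if (compat_ro_unsupp && !rdonly) {
                fprintf(stderr, "unknown compat_ro features (0x%llx), read-only only\n",
                        (unsigned long long)compat_ro_unsupp);
                return -1;              /* a read-only mount would still be allowed */
        }
        return 0;
}

int main(void)
{
        /* 0x08 is outside both supported masks in this sketch. */
        printf("rw mount: %d\n", check_features(0x01, 0x08, false));
        printf("ro mount: %d\n", check_features(0x01, 0x08, true));
        return 0;
}
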
 int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
                      char *options)
 {
@@ -3428,72 +3548,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
                goto fail_alloc;
        }
 
-       features = btrfs_super_incompat_flags(disk_super) &
-               ~BTRFS_FEATURE_INCOMPAT_SUPP;
-       if (features) {
-               btrfs_err(fs_info,
-                   "cannot mount because of unsupported optional features (0x%llx)",
-                   features);
-               err = -EINVAL;
-               goto fail_alloc;
-       }
-
-       features = btrfs_super_incompat_flags(disk_super);
-       features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-       if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
-               features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
-       else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
-               features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
-
-       /*
-        * Flag our filesystem as having big metadata blocks if they are bigger
-        * than the page size.
-        */
-       if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
-               features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
-
-       /*
-        * mixed block groups end up with duplicate but slightly offset
-        * extent buffers for the same range.  It leads to corruptions
-        */
-       if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
-           (sectorsize != nodesize)) {
-               btrfs_err(fs_info,
-"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
-                       nodesize, sectorsize);
-               goto fail_alloc;
-       }
-
-       /*
-        * Needn't use the lock because there is no other task which will
-        * update the flag.
-        */
-       btrfs_set_super_incompat_flags(disk_super, features);
-
-       features = btrfs_super_compat_ro_flags(disk_super) &
-               ~BTRFS_FEATURE_COMPAT_RO_SUPP;
-       if (!sb_rdonly(sb) && features) {
-               btrfs_err(fs_info,
-       "cannot mount read-write because of unsupported optional features (0x%llx)",
-                      features);
-               err = -EINVAL;
-               goto fail_alloc;
-       }
-       /*
-        * We have unsupported RO compat features, although RO mounted, we
-        * should not cause any metadata write, including log replay.
-        * Or we could screw up whatever the new feature requires.
-        */
-       if (unlikely(features && btrfs_super_log_root(disk_super) &&
-                    !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
-               btrfs_err(fs_info,
-"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
-                         features);
-               err = -EINVAL;
+       ret = btrfs_check_features(fs_info, sb);
+       if (ret < 0) {
+               err = ret;
                goto fail_alloc;
        }
 
-
        if (sectorsize < PAGE_SIZE) {
                struct btrfs_subpage_info *subpage_info;
 
@@ -3833,7 +3893,7 @@ static void btrfs_end_super_write(struct bio *bio)
 }
 
 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
-                                                  int copy_num)
+                                                  int copy_num, bool drop_cache)
 {
        struct btrfs_super_block *super;
        struct page *page;
@@ -3851,6 +3911,19 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
        if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
                return ERR_PTR(-EINVAL);
 
+       if (drop_cache) {
+               /* This should only be called with the primary sb. */
+               ASSERT(copy_num == 0);
+
+               /*
+                * Drop the page of the primary superblock, so later read will
+                * always read from the device.
+                */
+               invalidate_inode_pages2_range(mapping,
+                               bytenr >> PAGE_SHIFT,
+                               (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+       }
+
        page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
        if (IS_ERR(page))
                return ERR_CAST(page);
@@ -3882,7 +3955,7 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        for (i = 0; i < 1; i++) {
-               super = btrfs_read_dev_one_super(bdev, i);
+               super = btrfs_read_dev_one_super(bdev, i, false);
                if (IS_ERR(super))
                        continue;
 
index 47ad8e0..c67c15d 100644 (file)
@@ -46,10 +46,13 @@ int __cold open_ctree(struct super_block *sb,
               struct btrfs_fs_devices *fs_devices,
               char *options);
 void __cold close_ctree(struct btrfs_fs_info *fs_info);
+int btrfs_validate_super(struct btrfs_fs_info *fs_info,
+                        struct btrfs_super_block *sb, int mirror_num);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
-                                                  int copy_num);
+                                                  int copy_num, bool drop_cache);
 int btrfs_commit_super(struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
                                        struct btrfs_key *key);
@@ -103,7 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
 
 static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
 {
-       if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+       if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
                return fs_info->block_group_root;
        return btrfs_extent_root(fs_info, 0);
 }
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
new file mode 100644 (file)
index 0000000..618275a
--- /dev/null
@@ -0,0 +1,1673 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <trace/events/btrfs.h>
+#include "ctree.h"
+#include "extent-io-tree.h"
+#include "btrfs_inode.h"
+#include "misc.h"
+
+static struct kmem_cache *extent_state_cache;
+
+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+       return !RB_EMPTY_NODE(&state->rb_node);
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static LIST_HEAD(states);
+static DEFINE_SPINLOCK(leak_lock);
+
+static inline void btrfs_leak_debug_add_state(struct extent_state *state)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&leak_lock, flags);
+       list_add(&state->leak_list, &states);
+       spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_leak_debug_del_state(struct extent_state *state)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&leak_lock, flags);
+       list_del(&state->leak_list);
+       spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+       struct extent_state *state;
+
+       while (!list_empty(&states)) {
+               state = list_entry(states.next, struct extent_state, leak_list);
+               pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+                      state->start, state->end, state->state,
+                      extent_state_in_tree(state),
+                      refcount_read(&state->refs));
+               list_del(&state->leak_list);
+               kmem_cache_free(extent_state_cache, state);
+       }
+}
+
+#define btrfs_debug_check_extent_io_range(tree, start, end)            \
+       __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+                                                      struct extent_io_tree *tree,
+                                                      u64 start, u64 end)
+{
+       struct inode *inode = tree->private_data;
+       u64 isize;
+
+       if (!inode)
+               return;
+
+       isize = i_size_read(inode);
+       if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+               btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+                   "%s: ino %llu isize %llu odd range [%llu,%llu]",
+                       caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
+       }
+}
+#else
+#define btrfs_leak_debug_add_state(state)              do {} while (0)
+#define btrfs_leak_debug_del_state(state)              do {} while (0)
+#define btrfs_extent_state_leak_debug_check()          do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e)     do {} while (0)
+#endif
+
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we look up and
+ * update the disk_i_size, but lockdep will complain because for the inode's
+ * io_tree we hold the tree lock and then take the inode lock when setting
+ * delalloc.  These two things are unrelated, so make a separate lockdep class
+ * for the file_extent_tree so we don't get the two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
+struct tree_entry {
+       u64 start;
+       u64 end;
+       struct rb_node rb_node;
+};
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+                        struct extent_io_tree *tree, unsigned int owner,
+                        void *private_data)
+{
+       tree->fs_info = fs_info;
+       tree->state = RB_ROOT;
+       spin_lock_init(&tree->lock);
+       tree->private_data = private_data;
+       tree->owner = owner;
+       if (owner == IO_TREE_INODE_FILE_EXTENT)
+               lockdep_set_class(&tree->lock, &file_extent_tree_class);
+}
+
+void extent_io_tree_release(struct extent_io_tree *tree)
+{
+       spin_lock(&tree->lock);
+       /*
+        * Do a single barrier for the waitqueue_active check here, the state
+        * of the waitqueue should not change once extent_io_tree_release is
+        * called.
+        */
+       smp_mb();
+       while (!RB_EMPTY_ROOT(&tree->state)) {
+               struct rb_node *node;
+               struct extent_state *state;
+
+               node = rb_first(&tree->state);
+               state = rb_entry(node, struct extent_state, rb_node);
+               rb_erase(&state->rb_node, &tree->state);
+               RB_CLEAR_NODE(&state->rb_node);
+               /*
+                * btree io trees aren't supposed to have tasks waiting for
+                * changes in the flags of extent states ever.
+                */
+               ASSERT(!waitqueue_active(&state->wq));
+               free_extent_state(state);
+
+               cond_resched_lock(&tree->lock);
+       }
+       spin_unlock(&tree->lock);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+       struct extent_state *state;
+
+       /*
+        * The given mask might not be appropriate for the slab allocator,
+        * drop the unsupported bits.
+        */
+       mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
+       state = kmem_cache_alloc(extent_state_cache, mask);
+       if (!state)
+               return state;
+       state->state = 0;
+       RB_CLEAR_NODE(&state->rb_node);
+       btrfs_leak_debug_add_state(state);
+       refcount_set(&state->refs, 1);
+       init_waitqueue_head(&state->wq);
+       trace_alloc_extent_state(state, mask, _RET_IP_);
+       return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+       if (!prealloc)
+               prealloc = alloc_extent_state(GFP_ATOMIC);
+
+       return prealloc;
+}
+
+void free_extent_state(struct extent_state *state)
+{
+       if (!state)
+               return;
+       if (refcount_dec_and_test(&state->refs)) {
+               WARN_ON(extent_state_in_tree(state));
+               btrfs_leak_debug_del_state(state);
+               trace_free_extent_state(state, _RET_IP_);
+               kmem_cache_free(extent_state_cache, state);
+       }
+}
+
+static int add_extent_changeset(struct extent_state *state, u32 bits,
+                                struct extent_changeset *changeset,
+                                int set)
+{
+       int ret;
+
+       if (!changeset)
+               return 0;
+       if (set && (state->state & bits) == bits)
+               return 0;
+       if (!set && (state->state & bits) == 0)
+               return 0;
+       changeset->bytes_changed += state->end - state->start + 1;
+       ret = ulist_add(&changeset->range_changed, state->start, state->end,
+                       GFP_ATOMIC);
+       return ret;
+}
+
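
The changeset accounting above only counts bytes when the requested operation actually changes the state: setting bits that are already set, or clearing bits that are already clear, adds nothing. A small standalone sketch of that rule (struct and names invented for the example):

/* Standalone sketch of "account only real changes"; not btrfs code. */
#include <stdint.h>
#include <stdio.h>

struct range_state { uint64_t start, end; uint32_t bits; };

static uint64_t changed_bytes(const struct range_state *s, uint32_t bits, int set)
{
        if (set && (s->bits & bits) == bits)
                return 0;       /* already fully set, nothing changes */
        if (!set && (s->bits & bits) == 0)
                return 0;       /* already clear, nothing changes */
        return s->end - s->start + 1;
}

int main(void)
{
        const struct range_state s = { 0, 4095, 0x1 };

        printf("set 0x1:   %llu\n", (unsigned long long)changed_bytes(&s, 0x1, 1));
        printf("set 0x2:   %llu\n", (unsigned long long)changed_bytes(&s, 0x2, 1));
        printf("clear 0x2: %llu\n", (unsigned long long)changed_bytes(&s, 0x2, 0));
        return 0;
}
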
+static inline struct extent_state *next_state(struct extent_state *state)
+{
+       struct rb_node *next = rb_next(&state->rb_node);
+
+       if (next)
+               return rb_entry(next, struct extent_state, rb_node);
+       else
+               return NULL;
+}
+
+static inline struct extent_state *prev_state(struct extent_state *state)
+{
+       struct rb_node *next = rb_prev(&state->rb_node);
+
+       if (next)
+               return rb_entry(next, struct extent_state, rb_node);
+       else
+               return NULL;
+}
+
+/*
+ * Search @tree for an entry that contains @offset. Such an entry would have
+ * entry->start <= offset && entry->end >= offset.
+ *
+ * @tree:       the tree to search
+ * @offset:     offset that should fall within an entry in @tree
+ * @node_ret:   pointer where new node should be anchored (used when inserting an
+ *             entry in the tree)
+ * @parent_ret: points to entry which would have been the parent of the entry,
+ *               containing @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address and don't change
+ * @node_ret and @parent_ret.
+ *
+ * If no such entry exists, fill @node_ret and @parent_ret and return a pointer
+ * to the first entry that ends at or after @offset, or NULL if @offset is past
+ * the last entry in the tree.
+ */
+static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
+                                                         u64 offset,
+                                                         struct rb_node ***node_ret,
+                                                         struct rb_node **parent_ret)
+{
+       struct rb_root *root = &tree->state;
+       struct rb_node **node = &root->rb_node;
+       struct rb_node *prev = NULL;
+       struct extent_state *entry = NULL;
+
+       while (*node) {
+               prev = *node;
+               entry = rb_entry(prev, struct extent_state, rb_node);
+
+               if (offset < entry->start)
+                       node = &(*node)->rb_left;
+               else if (offset > entry->end)
+                       node = &(*node)->rb_right;
+               else
+                       return entry;
+       }
+
+       if (node_ret)
+               *node_ret = node;
+       if (parent_ret)
+               *parent_ret = prev;
+
+       /* Search forward to find the first entry that ends at or after @offset */
+       while (entry && offset > entry->end)
+               entry = next_state(entry);
+
+       return entry;
+}
+
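
The io tree stores non-overlapping [start, end] ranges keyed by start; the lookup above either returns the range containing the offset or falls through to the first range that ends at or after it (possibly NULL). A simplified, array-based analog of the same lookup semantics, with the rbtree walk and insertion hints left out:

/* Simplified, array-based analog of the lookup semantics; not btrfs code. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };  /* inclusive, sorted, non-overlapping */

static const struct range *range_search(const struct range *r, size_t n, uint64_t offset)
{
        for (size_t i = 0; i < n; i++) {
                if (offset >= r[i].start && offset <= r[i].end)
                        return &r[i];   /* range containing the offset */
                if (r[i].end >= offset)
                        return &r[i];   /* first range ending at or after the offset */
        }
        return NULL;                    /* offset is past every range */
}

int main(void)
{
        const struct range tree[] = { { 0, 4095 }, { 8192, 12287 } };
        const struct range *hit = range_search(tree, 2, 5000);

        if (hit)
                printf("found [%llu, %llu]\n", (unsigned long long)hit->start,
                       (unsigned long long)hit->end);
        return 0;
}
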
+/*
+ * Search offset in the tree or fill the neighbor entry pointers.
+ *
+ * @tree:      the tree to search
+ * @offset:    offset that should fall within an entry in @tree
+ * @next_ret:  pointer to the first entry whose range ends after @offset
+ * @prev_ret:  pointer to the first entry whose range begins before @offset
+ *
+ * Return a pointer to the entry that contains @offset byte address. If no
+ * such entry exists, then return NULL and fill @prev_ret and @next_ret.
+ * Otherwise return the found entry and other pointers are left untouched.
+ */
+static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
+                                                 u64 offset,
+                                                 struct extent_state **prev_ret,
+                                                 struct extent_state **next_ret)
+{
+       struct rb_root *root = &tree->state;
+       struct rb_node **node = &root->rb_node;
+       struct extent_state *orig_prev;
+       struct extent_state *entry = NULL;
+
+       ASSERT(prev_ret);
+       ASSERT(next_ret);
+
+       while (*node) {
+               entry = rb_entry(*node, struct extent_state, rb_node);
+
+               if (offset < entry->start)
+                       node = &(*node)->rb_left;
+               else if (offset > entry->end)
+                       node = &(*node)->rb_right;
+               else
+                       return entry;
+       }
+
+       orig_prev = entry;
+       while (entry && offset > entry->end)
+               entry = next_state(entry);
+       *next_ret = entry;
+       entry = orig_prev;
+
+       while (entry && offset < entry->start)
+               entry = prev_state(entry);
+       *prev_ret = entry;
+
+       return NULL;
+}
+
+/*
+ * Inexact rb-tree search, return the next entry if @offset is not found
+ */
+static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
+{
+       return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+       btrfs_panic(tree->fs_info, err,
+       "locking error: extent tree was modified by another thread while locked");
+}
+
+/*
+ * Utility function to look for merge candidates inside a given range.  Any
+ * extents with matching state are merged together into a single extent in the
+ * tree.  Extents with EXTENT_IO in their state field are not merged because
+ * the end_io handlers need to be able to do operations on them without
+ * sleeping (or doing allocations/splits).
+ *
+ * This should be called with the tree lock held.
+ */
+static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
+{
+       struct extent_state *other;
+
+       if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+               return;
+
+       other = prev_state(state);
+       if (other && other->end == state->start - 1 &&
+           other->state == state->state) {
+               if (tree->private_data)
+                       btrfs_merge_delalloc_extent(tree->private_data,
+                                                   state, other);
+               state->start = other->start;
+               rb_erase(&other->rb_node, &tree->state);
+               RB_CLEAR_NODE(&other->rb_node);
+               free_extent_state(other);
+       }
+       other = next_state(state);
+       if (other && other->start == state->end + 1 &&
+           other->state == state->state) {
+               if (tree->private_data)
+                       btrfs_merge_delalloc_extent(tree->private_data, state,
+                                                   other);
+               state->end = other->end;
+               rb_erase(&other->rb_node, &tree->state);
+               RB_CLEAR_NODE(&other->rb_node);
+               free_extent_state(other);
+       }
+}
+
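
Two neighboring states are merged only when they are byte-adjacent and carry exactly the same bits, as merge_state() checks above. A toy sketch of that rule in isolation (types invented for the example):

/* Toy illustration of the adjacency + equal-bits merge rule; not btrfs code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct state { uint64_t start, end; uint32_t bits; };

static bool try_merge(struct state *left, const struct state *right)
{
        if (left->end + 1 != right->start || left->bits != right->bits)
                return false;
        left->end = right->end;         /* absorb the right neighbor */
        return true;
}

int main(void)
{
        struct state a = { 0, 4095, 0x1 };
        const struct state b = { 4096, 8191, 0x1 };

        if (try_merge(&a, &b))
                printf("merged: [%llu, %llu]\n", (unsigned long long)a.start,
                       (unsigned long long)a.end);
        return 0;
}
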
+static void set_state_bits(struct extent_io_tree *tree,
+                          struct extent_state *state,
+                          u32 bits, struct extent_changeset *changeset)
+{
+       u32 bits_to_set = bits & ~EXTENT_CTLBITS;
+       int ret;
+
+       if (tree->private_data)
+               btrfs_set_delalloc_extent(tree->private_data, state, bits);
+
+       ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+       BUG_ON(ret < 0);
+       state->state |= bits_to_set;
+}
+
+/*
+ * Insert an extent_state struct into the tree.  'bits' are set on the
+ * struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+                       struct extent_state *state,
+                       u32 bits, struct extent_changeset *changeset)
+{
+       struct rb_node **node;
+       struct rb_node *parent;
+       const u64 end = state->end;
+
+       set_state_bits(tree, state, bits, changeset);
+
+       node = &tree->state.rb_node;
+       while (*node) {
+               struct extent_state *entry;
+
+               parent = *node;
+               entry = rb_entry(parent, struct extent_state, rb_node);
+
+               if (end < entry->start) {
+                       node = &(*node)->rb_left;
+               } else if (end > entry->end) {
+                       node = &(*node)->rb_right;
+               } else {
+                       btrfs_err(tree->fs_info,
+                              "found node %llu %llu on insert of %llu %llu",
+                              entry->start, entry->end, state->start, end);
+                       return -EEXIST;
+               }
+       }
+
+       rb_link_node(&state->rb_node, parent, node);
+       rb_insert_color(&state->rb_node, &tree->state);
+
+       merge_state(tree, state);
+       return 0;
+}
+
+/*
+ * Insert state to @tree to the location given by @node and @parent.
+ */
+static void insert_state_fast(struct extent_io_tree *tree,
+                             struct extent_state *state, struct rb_node **node,
+                             struct rb_node *parent, unsigned bits,
+                             struct extent_changeset *changeset)
+{
+       set_state_bits(tree, state, bits, changeset);
+       rb_link_node(&state->rb_node, parent, node);
+       rb_insert_color(&state->rb_node, &tree->state);
+       merge_state(tree, state);
+}
+
+/*
+ * Split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created second half.  'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
+ * calling, there are two extent state structs in the tree:
+ *   prealloc: [orig->start, split - 1]
+ *   orig:     [split, orig->end]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+                      struct extent_state *prealloc, u64 split)
+{
+       struct rb_node *parent = NULL;
+       struct rb_node **node;
+
+       if (tree->private_data)
+               btrfs_split_delalloc_extent(tree->private_data, orig, split);
+
+       prealloc->start = orig->start;
+       prealloc->end = split - 1;
+       prealloc->state = orig->state;
+       orig->start = split;
+
+       parent = &orig->rb_node;
+       node = &parent;
+       while (*node) {
+               struct extent_state *entry;
+
+               parent = *node;
+               entry = rb_entry(parent, struct extent_state, rb_node);
+
+               if (prealloc->end < entry->start) {
+                       node = &(*node)->rb_left;
+               } else if (prealloc->end > entry->end) {
+                       node = &(*node)->rb_right;
+               } else {
+                       free_extent_state(prealloc);
+                       return -EEXIST;
+               }
+       }
+
+       rb_link_node(&prealloc->rb_node, parent, node);
+       rb_insert_color(&prealloc->rb_node, &tree->state);
+
+       return 0;
+}
+
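
The split leaves the original node holding [split, orig->end] and hands [orig->start, split - 1] to the preallocated node. The same arithmetic in isolation (a sketch, not the kernel code):

/* Range-split arithmetic in isolation; not btrfs code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };

static void split_range(struct range *orig, struct range *first_half, uint64_t split)
{
        assert(split > orig->start && split <= orig->end);
        first_half->start = orig->start;
        first_half->end = split - 1;
        orig->start = split;            /* the original keeps the second half */
}

int main(void)
{
        struct range orig = { 0, 8191 }, first;

        split_range(&orig, &first, 4096);
        printf("[%llu, %llu] and [%llu, %llu]\n",
               (unsigned long long)first.start, (unsigned long long)first.end,
               (unsigned long long)orig.start, (unsigned long long)orig.end);
        return 0;
}
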
+/*
+ * Utility function to clear some bits in an extent state struct.  It will
+ * optionally wake up anyone waiting on this state (wake == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+                                           struct extent_state *state,
+                                           u32 bits, int wake,
+                                           struct extent_changeset *changeset)
+{
+       struct extent_state *next;
+       u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
+       int ret;
+
+       if (tree->private_data)
+               btrfs_clear_delalloc_extent(tree->private_data, state, bits);
+
+       ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+       BUG_ON(ret < 0);
+       state->state &= ~bits_to_clear;
+       if (wake)
+               wake_up(&state->wq);
+       if (state->state == 0) {
+               next = next_state(state);
+               if (extent_state_in_tree(state)) {
+                       rb_erase(&state->rb_node, &tree->state);
+                       RB_CLEAR_NODE(&state->rb_node);
+                       free_extent_state(state);
+               } else {
+                       WARN_ON(1);
+               }
+       } else {
+               merge_state(tree, state);
+               next = next_state(state);
+       }
+       return next;
+}
+
+/*
+ * Clear some bits on a range in the tree.  This may require splitting or
+ * inserting elements in the tree, so the gfp mask is used to indicate which
+ * allocations or sleeping are allowed.
+ *
+ * Waiters on a cleared EXTENT_LOCKED bit are woken up, and passing
+ * EXTENT_CLEAR_ALL_BITS in @bits removes the given range from the tree
+ * regardless of state (ie for truncate).
+ *
+ * The range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
+ */
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                      u32 bits, struct extent_state **cached_state,
+                      gfp_t mask, struct extent_changeset *changeset)
+{
+       struct extent_state *state;
+       struct extent_state *cached;
+       struct extent_state *prealloc = NULL;
+       u64 last_end;
+       int err;
+       int clear = 0;
+       int wake;
+       int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+
+       btrfs_debug_check_extent_io_range(tree, start, end);
+       trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
+
+       if (delete)
+               bits |= ~EXTENT_CTLBITS;
+
+       if (bits & EXTENT_DELALLOC)
+               bits |= EXTENT_NORESERVE;
+
+       wake = (bits & EXTENT_LOCKED) ? 1 : 0;
+       if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
+               clear = 1;
+again:
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
+               /*
+                * Don't care for allocation failure here because we might end
+                * up not needing the pre-allocated extent state at all, which
+                * is the case if we only have in the tree extent states that
+                * cover our input range and don't cover any other range.
+                * If we end up needing a new extent state we allocate it later.
+                */
+               prealloc = alloc_extent_state(mask);
+       }
+
+       spin_lock(&tree->lock);
+       if (cached_state) {
+               cached = *cached_state;
+
+               if (clear) {
+                       *cached_state = NULL;
+                       cached_state = NULL;
+               }
+
+               if (cached && extent_state_in_tree(cached) &&
+                   cached->start <= start && cached->end > start) {
+                       if (clear)
+                               refcount_dec(&cached->refs);
+                       state = cached;
+                       goto hit_next;
+               }
+               if (clear)
+                       free_extent_state(cached);
+       }
+
+       /* This search will find the extents that end after our range starts. */
+       state = tree_search(tree, start);
+       if (!state)
+               goto out;
+hit_next:
+       if (state->start > end)
+               goto out;
+       WARN_ON(state->end < start);
+       last_end = state->end;
+
+       /* The state doesn't have the wanted bits, go ahead. */
+       if (!(state->state & bits)) {
+               state = next_state(state);
+               goto next;
+       }
+
+       /*
+        *     | ---- desired range ---- |
+        *  | state | or
+        *  | ------------- state -------------- |
+        *
+        * We need to split the extent we found, and may flip bits on second
+        * half.
+        *
+        * If the extent we found extends past our range, we just split and
+        * search again.  It'll get split again the next time though.
+        *
+        * If the extent we found is inside our range, we clear the desired bit
+        * on it.
+        */
+
+       if (state->start < start) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               BUG_ON(!prealloc);
+               err = split_state(tree, state, prealloc, start);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
+               prealloc = NULL;
+               if (err)
+                       goto out;
+               if (state->end <= end) {
+                       state = clear_state_bit(tree, state, bits, wake, changeset);
+                       goto next;
+               }
+               goto search_again;
+       }
+       /*
+        * | ---- desired range ---- |
+        *                        | state |
+        * We need to split the extent, and clear the bit on the first half.
+        */
+       if (state->start <= end && state->end > end) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               BUG_ON(!prealloc);
+               err = split_state(tree, state, prealloc, end + 1);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
+               if (wake)
+                       wake_up(&state->wq);
+
+               clear_state_bit(tree, prealloc, bits, wake, changeset);
+
+               prealloc = NULL;
+               goto out;
+       }
+
+       state = clear_state_bit(tree, state, bits, wake, changeset);
+next:
+       if (last_end == (u64)-1)
+               goto out;
+       start = last_end + 1;
+       if (start <= end && state && !need_resched())
+               goto hit_next;
+
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       if (gfpflags_allow_blocking(mask))
+               cond_resched();
+       goto again;
+
+out:
+       spin_unlock(&tree->lock);
+       if (prealloc)
+               free_extent_state(prealloc);
+
+       return 0;
+
+}
+
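
Conceptually, clearing bits on [start, end] must split any state that straddles a boundary so that only the covered portion loses its bits, which is what the two split_state() calls above do. The boundary arithmetic reduced to a single straddling state (illustrative sketch only; it assumes the state and the clear request overlap):

/* Boundary arithmetic for clearing bits on a sub-range; illustrative only. */
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };

/* Assumes 'state' and the clear request 'req' overlap. */
static void clear_sub_range(struct range state, struct range req)
{
        uint64_t lo = state.start > req.start ? state.start : req.start;
        uint64_t hi = state.end < req.end ? state.end : req.end;

        if (state.start < req.start)    /* left piece keeps its bits */
                printf("keep  [%llu, %llu]\n", (unsigned long long)state.start,
                       (unsigned long long)(req.start - 1));
        printf("clear [%llu, %llu]\n", (unsigned long long)lo, (unsigned long long)hi);
        if (state.end > req.end)        /* right piece keeps its bits */
                printf("keep  [%llu, %llu]\n", (unsigned long long)(req.end + 1),
                       (unsigned long long)state.end);
}

int main(void)
{
        const struct range state = { 0, 16383 }, req = { 4096, 8191 };

        clear_sub_range(state, req);
        return 0;
}
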
+static void wait_on_state(struct extent_io_tree *tree,
+                         struct extent_state *state)
+               __releases(tree->lock)
+               __acquires(tree->lock)
+{
+       DEFINE_WAIT(wait);
+       prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+       spin_unlock(&tree->lock);
+       schedule();
+       spin_lock(&tree->lock);
+       finish_wait(&state->wq, &wait);
+}
+
+/*
+ * Wait for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.
+ * The tree lock is taken by this function.
+ */
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
+{
+       struct extent_state *state;
+
+       btrfs_debug_check_extent_io_range(tree, start, end);
+
+       spin_lock(&tree->lock);
+again:
+       while (1) {
+               /*
+                * This search will find all the extents that end after our
+                * range starts.
+                */
+               state = tree_search(tree, start);
+process_node:
+               if (!state)
+                       break;
+               if (state->start > end)
+                       goto out;
+
+               if (state->state & bits) {
+                       start = state->start;
+                       refcount_inc(&state->refs);
+                       wait_on_state(tree, state);
+                       free_extent_state(state);
+                       goto again;
+               }
+               start = state->end + 1;
+
+               if (start > end)
+                       break;
+
+               if (!cond_resched_lock(&tree->lock)) {
+                       state = next_state(state);
+                       goto process_node;
+               }
+       }
+out:
+       spin_unlock(&tree->lock);
+}
+
+static void cache_state_if_flags(struct extent_state *state,
+                                struct extent_state **cached_ptr,
+                                unsigned flags)
+{
+       if (cached_ptr && !(*cached_ptr)) {
+               if (!flags || (state->state & flags)) {
+                       *cached_ptr = state;
+                       refcount_inc(&state->refs);
+               }
+       }
+}
+
+static void cache_state(struct extent_state *state,
+                       struct extent_state **cached_ptr)
+{
+       return cache_state_if_flags(state, cached_ptr,
+                                   EXTENT_LOCKED | EXTENT_BOUNDARY);
+}
+
+/*
+ * Find the first state struct with 'bits' set after 'start', and return it.
+ * tree->lock must be held.  NULL will be returned if nothing was found after
+ * 'start'.
+ */
+static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+                                                       u64 start, u32 bits)
+{
+       struct extent_state *state;
+
+       /*
+        * This search will find all the extents that end after our range
+        * starts.
+        */
+       state = tree_search(tree, start);
+       while (state) {
+               if (state->end >= start && (state->state & bits))
+                       return state;
+               state = next_state(state);
+       }
+       return NULL;
+}
+
+/*
+ * Find the first offset in the io tree with one or more @bits set.
+ *
+ * Note: If there are multiple bits set in @bits, any of them will match.
+ *
+ * Return 0 if we find something, and update @start_ret and @end_ret.
+ * Return 1 if we found nothing.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+                         u64 *start_ret, u64 *end_ret, u32 bits,
+                         struct extent_state **cached_state)
+{
+       struct extent_state *state;
+       int ret = 1;
+
+       spin_lock(&tree->lock);
+       if (cached_state && *cached_state) {
+               state = *cached_state;
+               if (state->end == start - 1 && extent_state_in_tree(state)) {
+                       while ((state = next_state(state)) != NULL) {
+                               if (state->state & bits)
+                                       goto got_it;
+                       }
+                       free_extent_state(*cached_state);
+                       *cached_state = NULL;
+                       goto out;
+               }
+               free_extent_state(*cached_state);
+               *cached_state = NULL;
+       }
+
+       state = find_first_extent_bit_state(tree, start, bits);
+got_it:
+       if (state) {
+               cache_state_if_flags(state, cached_state, 0);
+               *start_ret = state->start;
+               *end_ret = state->end;
+               ret = 0;
+       }
+out:
+       spin_unlock(&tree->lock);
+       return ret;
+}
+
+/*
+ * Find a contiguous area of bits
+ *
+ * @tree:      io tree to check
+ * @start:     offset to start the search from
+ * @start_ret: the first offset we found with the bits set
+ * @end_ret:   the final contiguous range of the bits that were set
+ * @bits:      bits to look for
+ *
+ * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
+ * to set bits appropriately, and then merge them again.  During this time it
+ * will drop the tree->lock, so use this helper if you want to find the actual
+ * contiguous area for given bits.  We will search to the first bit we find, and
+ * then walk down the tree until we find a non-contiguous area.  The area
+ * returned will be the full contiguous area with the bits set.
+ */
+int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+                              u64 *start_ret, u64 *end_ret, u32 bits)
+{
+       struct extent_state *state;
+       int ret = 1;
+
+       spin_lock(&tree->lock);
+       state = find_first_extent_bit_state(tree, start, bits);
+       if (state) {
+               *start_ret = state->start;
+               *end_ret = state->end;
+               while ((state = next_state(state)) != NULL) {
+                       if (state->start > (*end_ret + 1))
+                               break;
+                       *end_ret = state->end;
+               }
+               ret = 0;
+       }
+       spin_unlock(&tree->lock);
+       return ret;
+}
+
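
As the comment above describes, the helper finds the first state with the requested bits and then keeps extending the end as long as the following states start immediately after the current end. A simplified array-based sketch of that walk (types and values invented for the example):

/* Sketch of extending a hit into the full contiguous run; not btrfs code. */
#include <stdint.h>
#include <stdio.h>

struct state { uint64_t start, end; uint32_t bits; };

static int find_contiguous(const struct state *s, int n, uint32_t bits,
                           uint64_t *start_ret, uint64_t *end_ret)
{
        int i;

        for (i = 0; i < n; i++)
                if (s[i].bits & bits)
                        break;
        if (i == n)
                return 1;               /* nothing found */
        *start_ret = s[i].start;
        *end_ret = s[i].end;
        /* Extend while the next state starts right after the current end. */
        for (i++; i < n && s[i].start == *end_ret + 1; i++)
                *end_ret = s[i].end;
        return 0;
}

int main(void)
{
        const struct state tree[] = {
                { 0, 4095, 0x1 }, { 4096, 8191, 0x1 }, { 16384, 20479, 0x1 },
        };
        uint64_t start, end;

        if (!find_contiguous(tree, 3, 0x1, &start, &end))
                printf("[%llu, %llu]\n", (unsigned long long)start,
                       (unsigned long long)end);
        return 0;
}
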
+/*
+ * Find a contiguous range of bytes in the file marked as delalloc, not more
+ * than 'max_bytes'.  'start' and 'end' are used to return the range.
+ *
+ * True is returned if we find something, false if nothing was in the tree.
+ */
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+                              u64 *end, u64 max_bytes,
+                              struct extent_state **cached_state)
+{
+       struct extent_state *state;
+       u64 cur_start = *start;
+       bool found = false;
+       u64 total_bytes = 0;
+
+       spin_lock(&tree->lock);
+
+       /*
+        * This search will find all the extents that end after our range
+        * starts.
+        */
+       state = tree_search(tree, cur_start);
+       if (!state) {
+               *end = (u64)-1;
+               goto out;
+       }
+
+       while (state) {
+               if (found && (state->start != cur_start ||
+                             (state->state & EXTENT_BOUNDARY))) {
+                       goto out;
+               }
+               if (!(state->state & EXTENT_DELALLOC)) {
+                       if (!found)
+                               *end = state->end;
+                       goto out;
+               }
+               if (!found) {
+                       *start = state->start;
+                       *cached_state = state;
+                       refcount_inc(&state->refs);
+               }
+               found = true;
+               *end = state->end;
+               cur_start = state->end + 1;
+               total_bytes += state->end - state->start + 1;
+               if (total_bytes >= max_bytes)
+                       break;
+               state = next_state(state);
+       }
+out:
+       spin_unlock(&tree->lock);
+       return found;
+}
+
+/*
+ * Set some bits on a range in the tree.  This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set.  The start of the
+ * existing range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive.  This takes the tree lock.
+ */
+static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                           u32 bits, u64 *failed_start,
+                           struct extent_state **cached_state,
+                           struct extent_changeset *changeset, gfp_t mask)
+{
+       struct extent_state *state;
+       struct extent_state *prealloc = NULL;
+       struct rb_node **p;
+       struct rb_node *parent;
+       int err = 0;
+       u64 last_start;
+       u64 last_end;
+       u32 exclusive_bits = (bits & EXTENT_LOCKED);
+
+       btrfs_debug_check_extent_io_range(tree, start, end);
+       trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
+
+       if (exclusive_bits)
+               ASSERT(failed_start);
+       else
+               ASSERT(failed_start == NULL);
+again:
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
+               /*
+                * Don't care for allocation failure here because we might end
+                * up not needing the pre-allocated extent state at all, which
+                * is the case if we only have in the tree extent states that
+                * cover our input range and don't cover any other range.
+                * If we end up needing a new extent state we allocate it later.
+                */
+               prealloc = alloc_extent_state(mask);
+       }
+
+       spin_lock(&tree->lock);
+       if (cached_state && *cached_state) {
+               state = *cached_state;
+               if (state->start <= start && state->end > start &&
+                   extent_state_in_tree(state))
+                       goto hit_next;
+       }
+       /*
+        * This search will find all the extents that end after our range
+        * starts.
+        */
+       state = tree_search_for_insert(tree, start, &p, &parent);
+       if (!state) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               BUG_ON(!prealloc);
+               prealloc->start = start;
+               prealloc->end = end;
+               insert_state_fast(tree, prealloc, p, parent, bits, changeset);
+               cache_state(prealloc, cached_state);
+               prealloc = NULL;
+               goto out;
+       }
+hit_next:
+       last_start = state->start;
+       last_end = state->end;
+
+       /*
+        * | ---- desired range ---- |
+        * | state |
+        *
+        * Just lock what we found and keep going
+        */
+       if (state->start == start && state->end <= end) {
+               if (state->state & exclusive_bits) {
+                       *failed_start = state->start;
+                       err = -EEXIST;
+                       goto out;
+               }
+
+               set_state_bits(tree, state, bits, changeset);
+               cache_state(state, cached_state);
+               merge_state(tree, state);
+               if (last_end == (u64)-1)
+                       goto out;
+               start = last_end + 1;
+               state = next_state(state);
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
+               goto search_again;
+       }
+
+       /*
+        *     | ---- desired range ---- |
+        * | state |
+        *   or
+        * | ------------- state -------------- |
+        *
+        * We need to split the extent we found, and may flip bits on second
+        * half.
+        *
+        * If the extent we found extends past our range, we just split and
+        * search again.  It'll get split again the next time though.
+        *
+        * If the extent we found is inside our range, we set the desired bit
+        * on it.
+        */
+       if (state->start < start) {
+               if (state->state & exclusive_bits) {
+                       *failed_start = start;
+                       err = -EEXIST;
+                       goto out;
+               }
+
+               /*
+                * If this extent already has all the bits we want set, then
+                * skip it; there's no need to split it or do anything with it.
+                */
+               if ((state->state & bits) == bits) {
+                       start = state->end + 1;
+                       cache_state(state, cached_state);
+                       goto search_again;
+               }
+
+               prealloc = alloc_extent_state_atomic(prealloc);
+               BUG_ON(!prealloc);
+               err = split_state(tree, state, prealloc, start);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
+               prealloc = NULL;
+               if (err)
+                       goto out;
+               if (state->end <= end) {
+                       set_state_bits(tree, state, bits, changeset);
+                       cache_state(state, cached_state);
+                       merge_state(tree, state);
+                       if (last_end == (u64)-1)
+                               goto out;
+                       start = last_end + 1;
+                       state = next_state(state);
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
+               }
+               goto search_again;
+       }
+       /*
+        * | ---- desired range ---- |
+        *     | state | or               | state |
+        *
+        * There's a hole, we need to insert something in it and ignore the
+        * extent we found.
+        */
+       if (state->start > start) {
+               u64 this_end;
+               if (end < last_start)
+                       this_end = end;
+               else
+                       this_end = last_start - 1;
+
+               prealloc = alloc_extent_state_atomic(prealloc);
+               BUG_ON(!prealloc);
+
+               /*
+                * Avoid freeing 'prealloc' if it can be merged with the later
+                * extent.
+                */
+               prealloc->start = start;
+               prealloc->end = this_end;
+               err = insert_state(tree, prealloc, bits, changeset);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
+               cache_state(prealloc, cached_state);
+               prealloc = NULL;
+               start = this_end + 1;
+               goto search_again;
+       }
+       /*
+        * | ---- desired range ---- |
+        *                        | state |
+        *
+        * We need to split the extent, and set the bit on the first half
+        */
+       if (state->start <= end && state->end > end) {
+               if (state->state & exclusive_bits) {
+                       *failed_start = start;
+                       err = -EEXIST;
+                       goto out;
+               }
+
+               prealloc = alloc_extent_state_atomic(prealloc);
+               BUG_ON(!prealloc);
+               err = split_state(tree, state, prealloc, end + 1);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
+               set_state_bits(tree, prealloc, bits, changeset);
+               cache_state(prealloc, cached_state);
+               merge_state(tree, prealloc);
+               prealloc = NULL;
+               goto out;
+       }
+
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       if (gfpflags_allow_blocking(mask))
+               cond_resched();
+       goto again;
+
+out:
+       spin_unlock(&tree->lock);
+       if (prealloc)
+               free_extent_state(prealloc);
+
+       return err;
+
+}
+
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                  u32 bits, struct extent_state **cached_state, gfp_t mask)
+{
+       return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
+                               NULL, mask);
+}
+
+/*
+ * Convert all bits in a given range from one bit to another
+ *
+ * @tree:      the io tree to search
+ * @start:     the start offset in bytes
+ * @end:       the end offset in bytes (inclusive)
+ * @bits:      the bits to set in this range
+ * @clear_bits:        the bits to clear in this range
+ * @cached_state:      state that we're going to cache
+ *
+ * This will go through and set bits for the given range.  If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits.  This is only meant to be used by things that are mergeable, ie.
+ * converting from say DELALLOC to DIRTY.  This is not meant to be used with
+ * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                      u32 bits, u32 clear_bits,
+                      struct extent_state **cached_state)
+{
+       struct extent_state *state;
+       struct extent_state *prealloc = NULL;
+       struct rb_node **p;
+       struct rb_node *parent;
+       int err = 0;
+       u64 last_start;
+       u64 last_end;
+       bool first_iteration = true;
+
+       btrfs_debug_check_extent_io_range(tree, start, end);
+       trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
+                                      clear_bits);
+
+again:
+       if (!prealloc) {
+               /*
+                * Best effort, don't worry if extent state allocation fails
+                * here for the first iteration. We might have a cached state
+                * that matches exactly the target range, in which case no
+                * extent state allocations are needed. We'll only know this
+                * after locking the tree.
+                */
+               prealloc = alloc_extent_state(GFP_NOFS);
+               if (!prealloc && !first_iteration)
+                       return -ENOMEM;
+       }
+
+       spin_lock(&tree->lock);
+       if (cached_state && *cached_state) {
+               state = *cached_state;
+               if (state->start <= start && state->end > start &&
+                   extent_state_in_tree(state))
+                       goto hit_next;
+       }
+
+       /*
+        * This search will find all the extents that end after our range
+        * starts.
+        */
+       state = tree_search_for_insert(tree, start, &p, &parent);
+       if (!state) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               prealloc->start = start;
+               prealloc->end = end;
+               insert_state_fast(tree, prealloc, p, parent, bits, NULL);
+               cache_state(prealloc, cached_state);
+               prealloc = NULL;
+               goto out;
+       }
+hit_next:
+       last_start = state->start;
+       last_end = state->end;
+
+       /*
+        * | ---- desired range ---- |
+        * | state |
+        *
+        * Just lock what we found and keep going.
+        */
+       if (state->start == start && state->end <= end) {
+               set_state_bits(tree, state, bits, NULL);
+               cache_state(state, cached_state);
+               state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+               if (last_end == (u64)-1)
+                       goto out;
+               start = last_end + 1;
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
+               goto search_again;
+       }
+
+       /*
+        *     | ---- desired range ---- |
+        * | state |
+        *   or
+        * | ------------- state -------------- |
+        *
+        * We need to split the extent we found, and may flip bits on second
+        * half.
+        *
+        * If the extent we found extends past our range, we just split and
+        * search again.  It'll get split again the next time though.
+        *
+        * If the extent we found is inside our range, we set the desired bit
+        * on it.
+        */
+       if (state->start < start) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               err = split_state(tree, state, prealloc, start);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+               prealloc = NULL;
+               if (err)
+                       goto out;
+               if (state->end <= end) {
+                       set_state_bits(tree, state, bits, NULL);
+                       cache_state(state, cached_state);
+                       state = clear_state_bit(tree, state, clear_bits, 0, NULL);
+                       if (last_end == (u64)-1)
+                               goto out;
+                       start = last_end + 1;
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
+               }
+               goto search_again;
+       }
+       /*
+        * | ---- desired range ---- |
+        *     | state | or               | state |
+        *
+        * There's a hole, we need to insert something in it and ignore the
+        * extent we found.
+        */
+       if (state->start > start) {
+               u64 this_end;
+               if (end < last_start)
+                       this_end = end;
+               else
+                       this_end = last_start - 1;
+
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               /*
+                * Avoid freeing 'prealloc' if it can be merged with the later
+                * extent.
+                */
+               prealloc->start = start;
+               prealloc->end = this_end;
+               err = insert_state(tree, prealloc, bits, NULL);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+               cache_state(prealloc, cached_state);
+               prealloc = NULL;
+               start = this_end + 1;
+               goto search_again;
+       }
+       /*
+        * | ---- desired range ---- |
+        *                        | state |
+        *
+        * We need to split the extent, and set the bit on the first half.
+        */
+       if (state->start <= end && state->end > end) {
+               prealloc = alloc_extent_state_atomic(prealloc);
+               if (!prealloc) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = split_state(tree, state, prealloc, end + 1);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
+               set_state_bits(tree, prealloc, bits, NULL);
+               cache_state(prealloc, cached_state);
+               clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+               prealloc = NULL;
+               goto out;
+       }
+
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       cond_resched();
+       first_iteration = false;
+       goto again;
+
+out:
+       spin_unlock(&tree->lock);
+       if (prealloc)
+               free_extent_state(prealloc);
+
+       return err;
+}
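
As a rough, standalone illustration of the conversion described above (set @bits and clear @clear_bits on whatever already covers the range), the bit manipulation alone looks like the sketch below; the EX_* values are made up for the example, and the real function additionally splits, merges and caches extent states.

#include <stdint.h>
#include <stdio.h>

#define EX_DIRTY        (1U << 0)       /* invented bit values, example only */
#define EX_DELALLOC     (1U << 1)

static uint32_t convert_bits(uint32_t state, uint32_t set, uint32_t clear)
{
        state |= set;           /* what set_state_bits() does to state->state */
        state &= ~clear;        /* what clear_state_bit() does to state->state */
        return state;
}

int main(void)
{
        uint32_t st = EX_DELALLOC;

        st = convert_bits(st, EX_DIRTY, EX_DELALLOC);
        printf("0x%x\n", st);   /* 0x1: DIRTY set, DELALLOC cleared */
        return 0;
}
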
+
+/*
+ * Find the first range that has @bits not set. This range could start before
+ * @start.
+ *
+ * @tree:      the tree to search
+ * @start:     offset at/after which the found extent should start
+ * @start_ret: records the beginning of the range
+ * @end_ret:   records the end of the range (inclusive)
+ * @bits:      the set of bits which must be unset
+ *
+ * Since an unallocated range is also considered one which doesn't have the bits
+ * set, it's possible that @end_ret contains -1; this happens in case the range
+ * spans (last_range_end, end of device]. In this case it's up to the caller to
+ * trim @end_ret to the appropriate size.
+ */
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+                                u64 *start_ret, u64 *end_ret, u32 bits)
+{
+       struct extent_state *state;
+       struct extent_state *prev = NULL, *next;
+
+       spin_lock(&tree->lock);
+
+       /* Find first extent with bits cleared */
+       while (1) {
+               state = tree_search_prev_next(tree, start, &prev, &next);
+               if (!state && !next && !prev) {
+                       /*
+                        * Tree is completely empty, send full range and let
+                        * caller deal with it
+                        */
+                       *start_ret = 0;
+                       *end_ret = -1;
+                       goto out;
+               } else if (!state && !next) {
+                       /*
+                        * We are past the last allocated chunk, set start just
+                        * past the end of the last extent.
+                        */
+                       *start_ret = prev->end + 1;
+                       *end_ret = -1;
+                       goto out;
+               } else if (!state) {
+                       state = next;
+               }
+
+               /*
+                * At this point 'state' either contains 'start' or start is
+                * before 'state'
+                */
+               if (in_range(start, state->start, state->end - state->start + 1)) {
+                       if (state->state & bits) {
+                               /*
+                                * |--range with bits sets--|
+                                *    |
+                                *    start
+                                */
+                               start = state->end + 1;
+                       } else {
+                               /*
+                                * 'start' falls within a range that doesn't
+                                * have the bits set, so take its start as the
+                                * beginning of the desired range
+                                *
+                                * |--range with bits cleared----|
+                                *      |
+                                *      start
+                                */
+                               *start_ret = state->start;
+                               break;
+                       }
+               } else {
+                       /*
+                        * |---prev range---|---hole/unset---|---node range---|
+                        *                          |
+                        *                        start
+                        *
+                        *                        or
+                        *
+                        * |---hole/unset--||--first node--|
+                        * 0   |
+                        *    start
+                        */
+                       if (prev)
+                               *start_ret = prev->end + 1;
+                       else
+                               *start_ret = 0;
+                       break;
+               }
+       }
+
+       /*
+        * Find the longest stretch from start until an entry which has the
+        * bits set
+        */
+       while (state) {
+               if (state->end >= start && !(state->state & bits)) {
+                       *end_ret = state->end;
+               } else {
+                       *end_ret = state->start - 1;
+                       break;
+               }
+               state = next_state(state);
+       }
+out:
+       spin_unlock(&tree->lock);
+}
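
The behaviour documented above (the reported range may begin before @start, and @end_ret can be -1 once we are past the last entry) can be modelled with a small userspace sketch over a sorted array of ranges that have the bit set; this is only an approximation of the kernel helper, and all names here are invented.

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; };  /* inclusive ranges that have the bit set */

static void first_clear(const struct range *set, int n, uint64_t from,
                        uint64_t *gap_start, uint64_t *gap_end)
{
        uint64_t pos = 0;       /* start of the current candidate gap */
        int i;

        for (i = 0; i < n; i++) {
                /* The gap [pos, set[i].start - 1] sits before this set range. */
                if (set[i].start > pos && from <= set[i].start - 1) {
                        *gap_start = pos;
                        *gap_end = set[i].start - 1;
                        return;
                }
                pos = set[i].end + 1;
        }
        /* Past the last set range: open ended, like *end_ret == -1 above. */
        *gap_start = pos;
        *gap_end = UINT64_MAX;
}

int main(void)
{
        const struct range set[] = { { 100, 199 }, { 400, 499 } };
        uint64_t s, e;

        first_clear(set, 2, 150, &s, &e);       /* 150 lies inside [100, 199] */
        printf("[%llu, %llu]\n", (unsigned long long)s,
               (unsigned long long)e);          /* -> the gap [200, 399] */
        return 0;
}
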
+
+/*
+ * Count the number of bytes in the tree that have the given bits set.  This
+ * can be fairly slow, except for EXTENT_DIRTY which is cached.  The total
+ * number of bytes found is returned.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+                    u64 *start, u64 search_end, u64 max_bytes,
+                    u32 bits, int contig)
+{
+       struct extent_state *state;
+       u64 cur_start = *start;
+       u64 total_bytes = 0;
+       u64 last = 0;
+       int found = 0;
+
+       if (WARN_ON(search_end <= cur_start))
+               return 0;
+
+       spin_lock(&tree->lock);
+
+       /*
+        * This search will find all the extents that end after our range
+        * starts.
+        */
+       state = tree_search(tree, cur_start);
+       while (state) {
+               if (state->start > search_end)
+                       break;
+               if (contig && found && state->start > last + 1)
+                       break;
+               if (state->end >= cur_start && (state->state & bits) == bits) {
+                       total_bytes += min(search_end, state->end) + 1 -
+                                      max(cur_start, state->start);
+                       if (total_bytes >= max_bytes)
+                               break;
+                       if (!found) {
+                               *start = max(cur_start, state->start);
+                               found = 1;
+                       }
+                       last = state->end;
+               } else if (contig && found) {
+                       break;
+               }
+               state = next_state(state);
+       }
+       spin_unlock(&tree->lock);
+       return total_bytes;
+}
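
The accounting above clips each matching state to the search window using min/max arithmetic on inclusive offsets; below is a minimal standalone example of that calculation, with made-up offsets.

#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }
static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

/* Bytes of the state [s_start, s_end] that fall inside [cur_start, search_end]. */
static uint64_t overlap_bytes(uint64_t cur_start, uint64_t search_end,
                              uint64_t s_start, uint64_t s_end)
{
        return min_u64(search_end, s_end) + 1 - max_u64(cur_start, s_start);
}

int main(void)
{
        /* State [4096, 12287] clipped to the window [8192, 16383]: 4096 bytes. */
        printf("%llu\n", (unsigned long long)overlap_bytes(8192, 16383, 4096, 12287));
        return 0;
}
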
+
+/*
+ * Search a range in the state tree for a given mask.  If 'filled' == 1, this
+ * returns 1 only if every extent in the range has the bits set.  Otherwise, 1
+ * is returned if any bit in the range is found set.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                  u32 bits, int filled, struct extent_state *cached)
+{
+       struct extent_state *state = NULL;
+       int bitset = 0;
+
+       spin_lock(&tree->lock);
+       if (cached && extent_state_in_tree(cached) && cached->start <= start &&
+           cached->end > start)
+               state = cached;
+       else
+               state = tree_search(tree, start);
+       while (state && start <= end) {
+               if (filled && state->start > start) {
+                       bitset = 0;
+                       break;
+               }
+
+               if (state->start > end)
+                       break;
+
+               if (state->state & bits) {
+                       bitset = 1;
+                       if (!filled)
+                               break;
+               } else if (filled) {
+                       bitset = 0;
+                       break;
+               }
+
+               if (state->end == (u64)-1)
+                       break;
+
+               start = state->end + 1;
+               if (start > end)
+                       break;
+               state = next_state(state);
+       }
+
+       /* We ran out of states and were still inside of our range. */
+       if (filled && !state)
+               bitset = 0;
+       spin_unlock(&tree->lock);
+       return bitset;
+}
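
A short userspace sketch of the 'filled' versus 'any' semantics described above, modelled on a toy per-byte flag array rather than on extent states; the names and sizes are invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RANGE_LEN 16

/* filled == 1: every byte in [start, end] must have 'bits'; otherwise any hit wins. */
static int test_range(const uint8_t *flags, int start, int end,
                      uint8_t bits, int filled)
{
        int i, bitset = 0;

        for (i = start; i <= end; i++) {
                if (flags[i] & bits) {
                        bitset = 1;
                        if (!filled)
                                break;          /* one hit is enough */
                } else if (filled) {
                        return 0;               /* a gap fails the 'filled' test */
                }
        }
        return bitset;
}

int main(void)
{
        uint8_t flags[RANGE_LEN];

        memset(flags, 0, sizeof(flags));
        flags[3] = 0x1;

        printf("%d %d\n",
               test_range(flags, 0, 7, 0x1, 0),         /* any:    1 */
               test_range(flags, 0, 7, 0x1, 1));        /* filled: 0 */
        return 0;
}
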
+
+/* Wrappers around set/clear extent bit */
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                          u32 bits, struct extent_changeset *changeset)
+{
+       /*
+        * We don't support EXTENT_LOCKED yet, as current changeset will
+        * record any bits changed, so for EXTENT_LOCKED case, it will
+        * either fail with -EEXIST or changeset will record the whole
+        * range.
+        */
+       ASSERT(!(bits & EXTENT_LOCKED));
+
+       return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
+                               GFP_NOFS);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                            u32 bits, struct extent_changeset *changeset)
+{
+       /*
+        * Don't support EXTENT_LOCKED case, same reason as
+        * set_record_extent_bits().
+        */
+       ASSERT(!(bits & EXTENT_LOCKED));
+
+       return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS,
+                                 changeset);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+       int err;
+       u64 failed_start;
+
+       err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
+                              NULL, NULL, GFP_NOFS);
+       if (err == -EEXIST) {
+               if (failed_start > start)
+                       clear_extent_bit(tree, start, failed_start - 1,
+                                        EXTENT_LOCKED, NULL);
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * Either insert or lock the state struct between start and end, waiting until
+ * any conflicting EXTENT_LOCKED range has been cleared.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+               struct extent_state **cached_state)
+{
+       int err;
+       u64 failed_start;
+
+       while (1) {
+               err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
+                                      &failed_start, cached_state, NULL,
+                                      GFP_NOFS);
+               if (err == -EEXIST) {
+                       wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+                       start = failed_start;
+               } else
+                       break;
+               WARN_ON(start > end);
+       }
+       return err;
+}
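
try_lock_extent() above undoes a partially taken lock by clearing [start, failed_start - 1] before reporting failure, while lock_extent() keeps waiting and retrying from failed_start. Below is a minimal single-threaded sketch of the try-lock pattern over a toy array; it ignores the tree lock and the atomicity of the real code, and all names are invented.

#include <stdio.h>
#include <string.h>

#define NUNITS 16

static unsigned char locked[NUNITS];

/* Lock [start, end]; on a conflict, roll back what was taken and return 0. */
static int try_lock_range(int start, int end)
{
        int i;

        for (i = start; i <= end; i++) {
                if (locked[i]) {
                        /* 'i' plays the role of failed_start. */
                        memset(&locked[start], 0, i - start);
                        return 0;
                }
                locked[i] = 1;
        }
        return 1;
}

int main(void)
{
        locked[5] = 1;                          /* someone else holds unit 5 */
        printf("%d\n", try_lock_range(2, 8));   /* 0: conflict, units 2-4 rolled back */
        printf("%d\n", try_lock_range(2, 4));   /* 1: lock acquired */
        return 0;
}
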
+
+void __cold extent_state_free_cachep(void)
+{
+       btrfs_extent_state_leak_debug_check();
+       kmem_cache_destroy(extent_state_cache);
+}
+
+int __init extent_state_init_cachep(void)
+{
+       extent_state_cache = kmem_cache_create("btrfs_extent_state",
+                       sizeof(struct extent_state), 0,
+                       SLAB_MEM_SPREAD, NULL);
+       if (!extent_state_cache)
+               return -ENOMEM;
+
+       return 0;
+}
index c3eb52d..a855f40 100644 (file)
@@ -17,7 +17,6 @@ struct io_failure_record;
 #define EXTENT_NODATASUM       (1U << 7)
 #define EXTENT_CLEAR_META_RESV (1U << 8)
 #define EXTENT_NEED_WAIT       (1U << 9)
-#define EXTENT_DAMAGED         (1U << 10)
 #define EXTENT_NORESERVE       (1U << 11)
 #define EXTENT_QGROUP_RESERVED (1U << 12)
 #define EXTENT_CLEAR_DATA_RESV (1U << 13)
@@ -35,10 +34,18 @@ struct io_failure_record;
  * delalloc bytes decremented, in an atomic way to prevent races with stat(2).
  */
 #define EXTENT_ADD_INODE_BYTES  (1U << 15)
+
+/*
+ * Set during truncate when we're clearing an entire range and we just want the
+ * extent states to go away.
+ */
+#define EXTENT_CLEAR_ALL_BITS  (1U << 16)
+
 #define EXTENT_DO_ACCOUNTING    (EXTENT_CLEAR_META_RESV | \
                                 EXTENT_CLEAR_DATA_RESV)
 #define EXTENT_CTLBITS         (EXTENT_DO_ACCOUNTING | \
-                                EXTENT_ADD_INODE_BYTES)
+                                EXTENT_ADD_INODE_BYTES | \
+                                EXTENT_CLEAR_ALL_BITS)
 
 /*
  * Redefined bits above which are used only in the device allocation tree,
@@ -56,7 +63,6 @@ enum {
        IO_TREE_FS_EXCLUDED_EXTENTS,
        IO_TREE_BTREE_INODE_IO,
        IO_TREE_INODE_IO,
-       IO_TREE_INODE_IO_FAILURE,
        IO_TREE_RELOC_BLOCKS,
        IO_TREE_TRANS_DIRTY_PAGES,
        IO_TREE_ROOT_DIRTY_LOG_PAGES,
@@ -70,8 +76,6 @@ struct extent_io_tree {
        struct rb_root state;
        struct btrfs_fs_info *fs_info;
        void *private_data;
-       u64 dirty_bytes;
-       bool track_uptodate;
 
        /* Who owns this io tree, should be one of IO_TREE_* */
        u8 owner;
@@ -89,33 +93,23 @@ struct extent_state {
        refcount_t refs;
        u32 state;
 
-       struct io_failure_record *failrec;
-
 #ifdef CONFIG_BTRFS_DEBUG
        struct list_head leak_list;
 #endif
 };
 
-int __init extent_state_cache_init(void);
-void __cold extent_state_cache_exit(void);
-
 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
                         struct extent_io_tree *tree, unsigned int owner,
                         void *private_data);
 void extent_io_tree_release(struct extent_io_tree *tree);
 
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    struct extent_state **cached);
-
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
-       return lock_extent_bits(tree, start, end, NULL);
-}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+               struct extent_state **cached);
 
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
+int __init extent_state_init_cachep(void);
+void __cold extent_state_free_cachep(void);
 
 u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end,
@@ -126,72 +120,66 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   u32 bits, int filled, struct extent_state *cached_state);
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                             u32 bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    u32 bits, int wake, int delete,
-                    struct extent_state **cached);
 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    u32 bits, int wake, int delete,
-                    struct extent_state **cached, gfp_t mask,
-                    struct extent_changeset *changeset);
+                      u32 bits, struct extent_state **cached, gfp_t mask,
+                      struct extent_changeset *changeset);
 
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
+                                  u64 end, u32 bits,
+                                  struct extent_state **cached)
 {
-       return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+       return __clear_extent_bit(tree, start, end, bits, cached,
+                                 GFP_NOFS, NULL);
 }
 
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
-               u64 end, struct extent_state **cached)
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+                               struct extent_state **cached)
 {
-       return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
-                               GFP_NOFS, NULL);
+       return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+                                 GFP_NOFS, NULL);
 }
 
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
-               u64 start, u64 end, struct extent_state **cached)
+static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
+                                      u64 end, struct extent_state **cached)
 {
-       return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
-                               GFP_ATOMIC, NULL);
+       return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
+                                 GFP_ATOMIC, NULL);
 }
 
 static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
                                    u64 end, u32 bits)
 {
-       int wake = 0;
-
-       if (bits & EXTENT_LOCKED)
-               wake = 1;
-
-       return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+       return clear_extent_bit(tree, start, end, bits, NULL);
 }
 
 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                           u32 bits, struct extent_changeset *changeset);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  u32 bits, unsigned exclusive_bits, u64 *failed_start,
-                  struct extent_state **cached_state, gfp_t mask,
-                  struct extent_changeset *changeset);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
-                          u32 bits);
+                  u32 bits, struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
+                                        u64 end, u32 bits)
+{
+       return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
+}
 
 static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
                u64 end, u32 bits)
 {
-       return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
-                             NULL);
+       return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
 }
 
 static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
                u64 end, struct extent_state **cached_state)
 {
-       return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-                               cached_state, GFP_NOFS, NULL);
+       return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+                                 cached_state, GFP_NOFS, NULL);
 }
 
 static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
                u64 end, gfp_t mask)
 {
-       return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
-                             mask, NULL);
+       return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
 }
 
 static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
@@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
 {
        return clear_extent_bit(tree, start, end,
                                EXTENT_DIRTY | EXTENT_DELALLOC |
-                               EXTENT_DO_ACCOUNTING, 0, 0, cached);
+                               EXTENT_DO_ACCOUNTING, cached);
 }
 
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -211,30 +199,29 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
                                      struct extent_state **cached_state)
 {
        return set_extent_bit(tree, start, end,
-                             EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
-                             0, NULL, cached_state, GFP_NOFS, NULL);
+                             EXTENT_DELALLOC | extra_bits,
+                             cached_state, GFP_NOFS);
 }
 
 static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
                u64 end, struct extent_state **cached_state)
 {
        return set_extent_bit(tree, start, end,
-                             EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
-                             0, NULL, cached_state, GFP_NOFS, NULL);
+                             EXTENT_DELALLOC | EXTENT_DEFRAG,
+                             cached_state, GFP_NOFS);
 }
 
 static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
                u64 end)
 {
-       return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
-                             GFP_NOFS, NULL);
+       return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
 }
 
 static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
                u64 end, struct extent_state **cached_state, gfp_t mask)
 {
-       return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-                             cached_state, mask, NULL);
+       return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
+                             cached_state, mask);
 }
 
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -244,24 +231,9 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
                                 u64 *start_ret, u64 *end_ret, u32 bits);
 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
                               u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidate_folio(struct extent_io_tree *tree,
-                         struct folio *folio, size_t offset);
 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
                               u64 *end, u64 max_bytes,
                               struct extent_state **cached_state);
-
-/* This should be reworked in the future and put elsewhere. */
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
-                     struct io_failure_record *failrec);
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
-               u64 end);
-int free_io_failure(struct extent_io_tree *failure_tree,
-                   struct extent_io_tree *io_tree,
-                   struct io_failure_record *rec);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
-                    struct extent_io_tree *failure_tree,
-                    struct extent_io_tree *io_tree, u64 start,
-                    struct page *page, u64 ino, unsigned int pg_offset);
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
 
 #endif /* BTRFS_EXTENT_IO_TREE_H */
index 6914cd8..cd2d365 100644 (file)
@@ -2220,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        }
 
        if (!mutex_trylock(&head->mutex)) {
+               if (path->nowait) {
+                       spin_unlock(&delayed_refs->lock);
+                       btrfs_put_transaction(cur_trans);
+                       return -EAGAIN;
+               }
+
                refcount_inc(&head->refs);
                spin_unlock(&delayed_refs->lock);
 
@@ -2686,13 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
                len = cache->start + cache->length - start;
                len = min(len, end + 1 - start);
 
-               down_read(&fs_info->commit_root_sem);
-               if (start < cache->last_byte_to_unpin && return_free_space) {
-                       u64 add_len = min(len, cache->last_byte_to_unpin - start);
-
-                       btrfs_add_free_space(cache, start, add_len);
-               }
-               up_read(&fs_info->commit_root_sem);
+               if (return_free_space)
+                       btrfs_add_free_space(cache, start, len);
 
                start += len;
                total_unpinned += len;
@@ -3804,7 +3805,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
               block_group->start == fs_info->data_reloc_bg ||
               fs_info->data_reloc_bg == 0);
 
-       if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
+       if (block_group->ro ||
+           test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
                ret = 1;
                goto out;
        }
@@ -3881,7 +3883,7 @@ out:
                 * regular extents) at the same time to the same zone, which
                 * easily break the write pointer.
                 */
-               block_group->zoned_data_reloc_ongoing = 1;
+               set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
                fs_info->data_reloc_bg = 0;
        }
        spin_unlock(&fs_info->relocation_bg_lock);
@@ -4888,6 +4890,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
            !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
                lockdep_owner = BTRFS_FS_TREE_OBJECTID;
 
+       /* btrfs_clean_tree_block() accesses generation field. */
+       btrfs_set_header_generation(buf, trans->transid);
+
        /*
         * This needs to stay, because we could allocate a freed block from an
         * old tree into a new tree, so we need to make sure this new block is
@@ -5639,6 +5644,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  */
 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
 {
+       const bool is_reloc_root = (root->root_key.objectid ==
+                                   BTRFS_TREE_RELOC_OBJECTID);
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
@@ -5798,6 +5805,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
                                goto out_end_trans;
                        }
 
+                       if (!is_reloc_root)
+                               btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
                        btrfs_end_transaction_throttle(trans);
                        if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
                                btrfs_debug(fs_info,
@@ -5832,7 +5842,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
                goto out_end_trans;
        }
 
-       if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+       if (!is_reloc_root) {
                ret = btrfs_find_root(tree_root, &root->root_key, path,
                                      NULL, NULL);
                if (ret < 0) {
@@ -5864,6 +5874,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
                btrfs_put_root(root);
        root_dropped = true;
 out_end_trans:
+       if (!is_reloc_root)
+               btrfs_set_last_root_drop_gen(fs_info, trans->transid);
+
        btrfs_end_transaction_throttle(trans);
 out_free:
        kfree(wc);
index cf4f19e..1eae68f 100644 (file)
 #include "block-group.h"
 #include "compression.h"
 
-static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
-static struct bio_set btrfs_bioset;
-
-static inline bool extent_state_in_tree(const struct extent_state *state)
-{
-       return !RB_EMPTY_NODE(&state->rb_node);
-}
 
 #ifdef CONFIG_BTRFS_DEBUG
-static LIST_HEAD(states);
-static DEFINE_SPINLOCK(leak_lock);
-
-static inline void btrfs_leak_debug_add(spinlock_t *lock,
-                                       struct list_head *new,
-                                       struct list_head *head)
+static inline void btrfs_leak_debug_add_eb(struct extent_buffer *eb)
 {
+       struct btrfs_fs_info *fs_info = eb->fs_info;
        unsigned long flags;
 
-       spin_lock_irqsave(lock, flags);
-       list_add(new, head);
-       spin_unlock_irqrestore(lock, flags);
+       spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+       list_add(&eb->leak_list, &fs_info->allocated_ebs);
+       spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
 }
 
-static inline void btrfs_leak_debug_del(spinlock_t *lock,
-                                       struct list_head *entry)
+static inline void btrfs_leak_debug_del_eb(struct extent_buffer *eb)
 {
+       struct btrfs_fs_info *fs_info = eb->fs_info;
        unsigned long flags;
 
-       spin_lock_irqsave(lock, flags);
-       list_del(entry);
-       spin_unlock_irqrestore(lock, flags);
+       spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
+       list_del(&eb->leak_list);
+       spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
 }
 
 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
@@ -91,53 +80,11 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
        }
        spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
 }
-
-static inline void btrfs_extent_state_leak_debug_check(void)
-{
-       struct extent_state *state;
-
-       while (!list_empty(&states)) {
-               state = list_entry(states.next, struct extent_state, leak_list);
-               pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
-                      state->start, state->end, state->state,
-                      extent_state_in_tree(state),
-                      refcount_read(&state->refs));
-               list_del(&state->leak_list);
-               kmem_cache_free(extent_state_cache, state);
-       }
-}
-
-#define btrfs_debug_check_extent_io_range(tree, start, end)            \
-       __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
-static inline void __btrfs_debug_check_extent_io_range(const char *caller,
-               struct extent_io_tree *tree, u64 start, u64 end)
-{
-       struct inode *inode = tree->private_data;
-       u64 isize;
-
-       if (!inode || !is_data_inode(inode))
-               return;
-
-       isize = i_size_read(inode);
-       if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-               btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
-                   "%s: ino %llu isize %llu odd range [%llu,%llu]",
-                       caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
-       }
-}
 #else
-#define btrfs_leak_debug_add(lock, new, head)  do {} while (0)
-#define btrfs_leak_debug_del(lock, entry)      do {} while (0)
-#define btrfs_extent_state_leak_debug_check()  do {} while (0)
-#define btrfs_debug_check_extent_io_range(c, s, e)     do {} while (0)
+#define btrfs_leak_debug_add_eb(eb)                    do {} while (0)
+#define btrfs_leak_debug_del_eb(eb)                    do {} while (0)
 #endif
 
-struct tree_entry {
-       u64 start;
-       u64 end;
-       struct rb_node rb_node;
-};
-
 /*
  * Structure to record info about the bio being assembled, and other info like
  * how many bytes are there before stripe/ordered extent boundary.
@@ -148,6 +95,7 @@ struct btrfs_bio_ctrl {
        enum btrfs_compression_type compress_type;
        u32 len_to_stripe_boundary;
        u32 len_to_oe_boundary;
+       btrfs_bio_end_io_t end_io_func;
 };
 
 struct extent_page_data {
@@ -161,24 +109,6 @@ struct extent_page_data {
        unsigned int sync_io:1;
 };
 
-static int add_extent_changeset(struct extent_state *state, u32 bits,
-                                struct extent_changeset *changeset,
-                                int set)
-{
-       int ret;
-
-       if (!changeset)
-               return 0;
-       if (set && (state->state & bits) == bits)
-               return 0;
-       if (!set && (state->state & bits) == 0)
-               return 0;
-       changeset->bytes_changed += state->end - state->start + 1;
-       ret = ulist_add(&changeset->range_changed, state->start, state->end,
-                       GFP_ATOMIC);
-       return ret;
-}
-
 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 {
        struct bio *bio;
@@ -207,7 +137,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
                btrfs_submit_data_read_bio(inode, bio, mirror_num,
                                           bio_ctrl->compress_type);
 
-       /* The bio is owned by the bi_end_io handler now */
+       /* The bio is owned by the end_io handler now */
        bio_ctrl->bio = NULL;
 }
 
@@ -223,1304 +153,33 @@ static void submit_write_bio(struct extent_page_data *epd, int ret)
 
        if (ret) {
                ASSERT(ret < 0);
-               bio->bi_status = errno_to_blk_status(ret);
-               bio_endio(bio);
-               /* The bio is owned by the bi_end_io handler now */
+               btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
+               /* The bio is owned by the end_io handler now */
                epd->bio_ctrl.bio = NULL;
        } else {
                submit_one_bio(&epd->bio_ctrl);
        }
 }
 
-int __init extent_state_cache_init(void)
-{
-       extent_state_cache = kmem_cache_create("btrfs_extent_state",
-                       sizeof(struct extent_state), 0,
-                       SLAB_MEM_SPREAD, NULL);
-       if (!extent_state_cache)
-               return -ENOMEM;
-       return 0;
-}
-
-int __init extent_io_init(void)
+int __init extent_buffer_init_cachep(void)
 {
        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
                        sizeof(struct extent_buffer), 0,
                        SLAB_MEM_SPREAD, NULL);
        if (!extent_buffer_cache)
-               return -ENOMEM;
-
-       if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
-                       offsetof(struct btrfs_bio, bio),
-                       BIOSET_NEED_BVECS))
-               goto free_buffer_cache;
-
-       if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
-               goto free_bioset;
-
-       return 0;
-
-free_bioset:
-       bioset_exit(&btrfs_bioset);
-
-free_buffer_cache:
-       kmem_cache_destroy(extent_buffer_cache);
-       extent_buffer_cache = NULL;
-       return -ENOMEM;
-}
-
-void __cold extent_state_cache_exit(void)
-{
-       btrfs_extent_state_leak_debug_check();
-       kmem_cache_destroy(extent_state_cache);
-}
-
-void __cold extent_io_exit(void)
-{
-       /*
-        * Make sure all delayed rcu free are flushed before we
-        * destroy caches.
-        */
-       rcu_barrier();
-       kmem_cache_destroy(extent_buffer_cache);
-       bioset_exit(&btrfs_bioset);
-}
-
-/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc.  These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
- */
-static struct lock_class_key file_extent_tree_class;
-
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
-                        struct extent_io_tree *tree, unsigned int owner,
-                        void *private_data)
-{
-       tree->fs_info = fs_info;
-       tree->state = RB_ROOT;
-       tree->dirty_bytes = 0;
-       spin_lock_init(&tree->lock);
-       tree->private_data = private_data;
-       tree->owner = owner;
-       if (owner == IO_TREE_INODE_FILE_EXTENT)
-               lockdep_set_class(&tree->lock, &file_extent_tree_class);
-}
-
-void extent_io_tree_release(struct extent_io_tree *tree)
-{
-       spin_lock(&tree->lock);
-       /*
-        * Do a single barrier for the waitqueue_active check here, the state
-        * of the waitqueue should not change once extent_io_tree_release is
-        * called.
-        */
-       smp_mb();
-       while (!RB_EMPTY_ROOT(&tree->state)) {
-               struct rb_node *node;
-               struct extent_state *state;
-
-               node = rb_first(&tree->state);
-               state = rb_entry(node, struct extent_state, rb_node);
-               rb_erase(&state->rb_node, &tree->state);
-               RB_CLEAR_NODE(&state->rb_node);
-               /*
-                * btree io trees aren't supposed to have tasks waiting for
-                * changes in the flags of extent states ever.
-                */
-               ASSERT(!waitqueue_active(&state->wq));
-               free_extent_state(state);
-
-               cond_resched_lock(&tree->lock);
-       }
-       spin_unlock(&tree->lock);
-}
-
-static struct extent_state *alloc_extent_state(gfp_t mask)
-{
-       struct extent_state *state;
-
-       /*
-        * The given mask might be not appropriate for the slab allocator,
-        * drop the unsupported bits
-        */
-       mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
-       state = kmem_cache_alloc(extent_state_cache, mask);
-       if (!state)
-               return state;
-       state->state = 0;
-       state->failrec = NULL;
-       RB_CLEAR_NODE(&state->rb_node);
-       btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
-       refcount_set(&state->refs, 1);
-       init_waitqueue_head(&state->wq);
-       trace_alloc_extent_state(state, mask, _RET_IP_);
-       return state;
-}
-
-void free_extent_state(struct extent_state *state)
-{
-       if (!state)
-               return;
-       if (refcount_dec_and_test(&state->refs)) {
-               WARN_ON(extent_state_in_tree(state));
-               btrfs_leak_debug_del(&leak_lock, &state->leak_list);
-               trace_free_extent_state(state, _RET_IP_);
-               kmem_cache_free(extent_state_cache, state);
-       }
-}
-
-/**
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
- *
- * @tree:       the tree to search
- * @offset:     offset that should fall within an entry in @tree
- * @node_ret:   pointer where new node should be anchored (used when inserting an
- *             entry in the tree)
- * @parent_ret: points to entry which would have been the parent of the entry,
- *               containing @offset
- *
- * Return a pointer to the entry that contains @offset byte address and don't change
- * @node_ret and @parent_ret.
- *
- * If no such entry exists, return pointer to entry that ends before @offset
- * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
- */
-static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
-                                                    u64 offset,
-                                                    struct rb_node ***node_ret,
-                                                    struct rb_node **parent_ret)
-{
-       struct rb_root *root = &tree->state;
-       struct rb_node **node = &root->rb_node;
-       struct rb_node *prev = NULL;
-       struct tree_entry *entry;
-
-       while (*node) {
-               prev = *node;
-               entry = rb_entry(prev, struct tree_entry, rb_node);
-
-               if (offset < entry->start)
-                       node = &(*node)->rb_left;
-               else if (offset > entry->end)
-                       node = &(*node)->rb_right;
-               else
-                       return *node;
-       }
-
-       if (node_ret)
-               *node_ret = node;
-       if (parent_ret)
-               *parent_ret = prev;
-
-       /* Search neighbors until we find the first one past the end */
-       while (prev && offset > entry->end) {
-               prev = rb_next(prev);
-               entry = rb_entry(prev, struct tree_entry, rb_node);
-       }
-
-       return prev;
-}
-
-/*
- * Inexact rb-tree search, return the next entry if @offset is not found
- */
-static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
-{
-       return tree_search_for_insert(tree, offset, NULL, NULL);
-}
-
-/**
- * Search offset in the tree or fill neighbor rbtree node pointers.
- *
- * @tree:      the tree to search
- * @offset:    offset that should fall within an entry in @tree
- * @next_ret:  pointer to the first entry whose range ends after @offset
- * @prev_ret:  pointer to the first entry whose range begins before @offset
- *
- * Return a pointer to the entry that contains @offset byte address. If no
- * such entry exists, then return NULL and fill @prev_ret and @next_ret.
- * Otherwise return the found entry and other pointers are left untouched.
- */
-static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
-                                            u64 offset,
-                                            struct rb_node **prev_ret,
-                                            struct rb_node **next_ret)
-{
-       struct rb_root *root = &tree->state;
-       struct rb_node **node = &root->rb_node;
-       struct rb_node *prev = NULL;
-       struct rb_node *orig_prev = NULL;
-       struct tree_entry *entry;
-
-       ASSERT(prev_ret);
-       ASSERT(next_ret);
-
-       while (*node) {
-               prev = *node;
-               entry = rb_entry(prev, struct tree_entry, rb_node);
-
-               if (offset < entry->start)
-                       node = &(*node)->rb_left;
-               else if (offset > entry->end)
-                       node = &(*node)->rb_right;
-               else
-                       return *node;
-       }
-
-       orig_prev = prev;
-       while (prev && offset > entry->end) {
-               prev = rb_next(prev);
-               entry = rb_entry(prev, struct tree_entry, rb_node);
-       }
-       *next_ret = prev;
-       prev = orig_prev;
-
-       entry = rb_entry(prev, struct tree_entry, rb_node);
-       while (prev && offset < entry->start) {
-               prev = rb_prev(prev);
-               entry = rb_entry(prev, struct tree_entry, rb_node);
-       }
-       *prev_ret = prev;
-
-       return NULL;
-}
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree.  Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static void merge_state(struct extent_io_tree *tree,
-                       struct extent_state *state)
-{
-       struct extent_state *other;
-       struct rb_node *other_node;
-
-       if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
-               return;
-
-       other_node = rb_prev(&state->rb_node);
-       if (other_node) {
-               other = rb_entry(other_node, struct extent_state, rb_node);
-               if (other->end == state->start - 1 &&
-                   other->state == state->state) {
-                       if (tree->private_data &&
-                           is_data_inode(tree->private_data))
-                               btrfs_merge_delalloc_extent(tree->private_data,
-                                                           state, other);
-                       state->start = other->start;
-                       rb_erase(&other->rb_node, &tree->state);
-                       RB_CLEAR_NODE(&other->rb_node);
-                       free_extent_state(other);
-               }
-       }
-       other_node = rb_next(&state->rb_node);
-       if (other_node) {
-               other = rb_entry(other_node, struct extent_state, rb_node);
-               if (other->start == state->end + 1 &&
-                   other->state == state->state) {
-                       if (tree->private_data &&
-                           is_data_inode(tree->private_data))
-                               btrfs_merge_delalloc_extent(tree->private_data,
-                                                           state, other);
-                       state->end = other->end;
-                       rb_erase(&other->rb_node, &tree->state);
-                       RB_CLEAR_NODE(&other->rb_node);
-                       free_extent_state(other);
-               }
-       }
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
-                          struct extent_state *state, u32 bits,
-                          struct extent_changeset *changeset);
-
-/*
- * insert an extent_state struct into the tree.  'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally.  This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_io_tree *tree,
-                       struct extent_state *state,
-                       u32 bits, struct extent_changeset *changeset)
-{
-       struct rb_node **node;
-       struct rb_node *parent;
-       const u64 end = state->end;
-
-       set_state_bits(tree, state, bits, changeset);
-
-       node = &tree->state.rb_node;
-       while (*node) {
-               struct tree_entry *entry;
-
-               parent = *node;
-               entry = rb_entry(parent, struct tree_entry, rb_node);
-
-               if (end < entry->start) {
-                       node = &(*node)->rb_left;
-               } else if (end > entry->end) {
-                       node = &(*node)->rb_right;
-               } else {
-                       btrfs_err(tree->fs_info,
-                              "found node %llu %llu on insert of %llu %llu",
-                              entry->start, entry->end, state->start, end);
-                       return -EEXIST;
-               }
-       }
-
-       rb_link_node(&state->rb_node, parent, node);
-       rb_insert_color(&state->rb_node, &tree->state);
-
-       merge_state(tree, state);
-       return 0;
-}
-
-/*
- * Insert state to @tree to the location given by @node and @parent.
- */
-static void insert_state_fast(struct extent_io_tree *tree,
-                             struct extent_state *state, struct rb_node **node,
-                             struct rb_node *parent, unsigned bits,
-                             struct extent_changeset *changeset)
-{
-       set_state_bits(tree, state, bits, changeset);
-       rb_link_node(&state->rb_node, parent, node);
-       rb_insert_color(&state->rb_node, &tree->state);
-       merge_state(tree, state);
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half.  'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling,
- * the tree has 'orig' at [orig->start, orig->end].  After calling, there
- * are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
-                      struct extent_state *prealloc, u64 split)
-{
-       struct rb_node *parent = NULL;
-       struct rb_node **node;
-
-       if (tree->private_data && is_data_inode(tree->private_data))
-               btrfs_split_delalloc_extent(tree->private_data, orig, split);
-
-       prealloc->start = orig->start;
-       prealloc->end = split - 1;
-       prealloc->state = orig->state;
-       orig->start = split;
-
-       parent = &orig->rb_node;
-       node = &parent;
-       while (*node) {
-               struct tree_entry *entry;
-
-               parent = *node;
-               entry = rb_entry(parent, struct tree_entry, rb_node);
-
-               if (prealloc->end < entry->start) {
-                       node = &(*node)->rb_left;
-               } else if (prealloc->end > entry->end) {
-                       node = &(*node)->rb_right;
-               } else {
-                       free_extent_state(prealloc);
-                       return -EEXIST;
-               }
-       }
-
-       rb_link_node(&prealloc->rb_node, parent, node);
-       rb_insert_color(&prealloc->rb_node, &tree->state);
-
-       return 0;
-}
-
-static struct extent_state *next_state(struct extent_state *state)
-{
-       struct rb_node *next = rb_next(&state->rb_node);
-       if (next)
-               return rb_entry(next, struct extent_state, rb_node);
-       else
-               return NULL;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up anyone waiting on this state (wake == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
-                                           struct extent_state *state,
-                                           u32 bits, int wake,
-                                           struct extent_changeset *changeset)
-{
-       struct extent_state *next;
-       u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
-       int ret;
-
-       if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
-               u64 range = state->end - state->start + 1;
-               WARN_ON(range > tree->dirty_bytes);
-               tree->dirty_bytes -= range;
-       }
-
-       if (tree->private_data && is_data_inode(tree->private_data))
-               btrfs_clear_delalloc_extent(tree->private_data, state, bits);
-
-       ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
-       BUG_ON(ret < 0);
-       state->state &= ~bits_to_clear;
-       if (wake)
-               wake_up(&state->wq);
-       if (state->state == 0) {
-               next = next_state(state);
-               if (extent_state_in_tree(state)) {
-                       rb_erase(&state->rb_node, &tree->state);
-                       RB_CLEAR_NODE(&state->rb_node);
-                       free_extent_state(state);
-               } else {
-                       WARN_ON(1);
-               }
-       } else {
-               merge_state(tree, state);
-               next = next_state(state);
-       }
-       return next;
-}
-
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
-{
-       if (!prealloc)
-               prealloc = alloc_extent_state(GFP_ATOMIC);
-
-       return prealloc;
-}
-
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
-{
-       btrfs_panic(tree->fs_info, err,
-       "locking error: extent tree was modified by another thread while locked");
-}
-
-/*
- * clear some bits on a range in the tree.  This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (ie for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns 0 on success and < 0 on error.
- */
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      u32 bits, int wake, int delete,
-                      struct extent_state **cached_state,
-                      gfp_t mask, struct extent_changeset *changeset)
-{
-       struct extent_state *state;
-       struct extent_state *cached;
-       struct extent_state *prealloc = NULL;
-       struct rb_node *node;
-       u64 last_end;
-       int err;
-       int clear = 0;
-
-       btrfs_debug_check_extent_io_range(tree, start, end);
-       trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
-
-       if (bits & EXTENT_DELALLOC)
-               bits |= EXTENT_NORESERVE;
-
-       if (delete)
-               bits |= ~EXTENT_CTLBITS;
-
-       if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
-               clear = 1;
-again:
-       if (!prealloc && gfpflags_allow_blocking(mask)) {
-               /*
-                * Don't care for allocation failure here because we might end
-                * up not needing the pre-allocated extent state at all, which
-                * is the case if the tree only has extent states that cover
-                * our input range and don't cover any other range.
-                * If we end up needing a new extent state we allocate it later.
-                */
-               prealloc = alloc_extent_state(mask);
-       }
-
-       spin_lock(&tree->lock);
-       if (cached_state) {
-               cached = *cached_state;
-
-               if (clear) {
-                       *cached_state = NULL;
-                       cached_state = NULL;
-               }
-
-               if (cached && extent_state_in_tree(cached) &&
-                   cached->start <= start && cached->end > start) {
-                       if (clear)
-                               refcount_dec(&cached->refs);
-                       state = cached;
-                       goto hit_next;
-               }
-               if (clear)
-                       free_extent_state(cached);
-       }
-       /*
-        * this search will find the extents that end after
-        * our range starts
-        */
-       node = tree_search(tree, start);
-       if (!node)
-               goto out;
-       state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
-       if (state->start > end)
-               goto out;
-       WARN_ON(state->end < start);
-       last_end = state->end;
-
-       /* the state doesn't have the wanted bits, go ahead */
-       if (!(state->state & bits)) {
-               state = next_state(state);
-               goto next;
-       }
-
-       /*
-        *     | ---- desired range ---- |
-        *  | state | or
-        *  | ------------- state -------------- |
-        *
-        * We need to split the extent we found, and may flip
-        * bits on second half.
-        *
-        * If the extent we found extends past our range, we
-        * just split and search again.  It'll get split again
-        * the next time though.
-        *
-        * If the extent we found is inside our range, we clear
-        * the desired bit on it.
-        */
-
-       if (state->start < start) {
-               prealloc = alloc_extent_state_atomic(prealloc);
-               BUG_ON(!prealloc);
-               err = split_state(tree, state, prealloc, start);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-
-               prealloc = NULL;
-               if (err)
-                       goto out;
-               if (state->end <= end) {
-                       state = clear_state_bit(tree, state, bits, wake, changeset);
-                       goto next;
-               }
-               goto search_again;
-       }
-       /*
-        * | ---- desired range ---- |
-        *                        | state |
-        * We need to split the extent, and clear the bit
-        * on the first half
-        */
-       if (state->start <= end && state->end > end) {
-               prealloc = alloc_extent_state_atomic(prealloc);
-               BUG_ON(!prealloc);
-               err = split_state(tree, state, prealloc, end + 1);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-
-               if (wake)
-                       wake_up(&state->wq);
-
-               clear_state_bit(tree, prealloc, bits, wake, changeset);
-
-               prealloc = NULL;
-               goto out;
-       }
-
-       state = clear_state_bit(tree, state, bits, wake, changeset);
-next:
-       if (last_end == (u64)-1)
-               goto out;
-       start = last_end + 1;
-       if (start <= end && state && !need_resched())
-               goto hit_next;
-
-search_again:
-       if (start > end)
-               goto out;
-       spin_unlock(&tree->lock);
-       if (gfpflags_allow_blocking(mask))
-               cond_resched();
-       goto again;
-
-out:
-       spin_unlock(&tree->lock);
-       if (prealloc)
-               free_extent_state(prealloc);
-
-       return 0;
-
-}
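For reference, a minimal caller-side sketch of the clear path removed above, using the clear_extent_bit() GFP_NOFS wrapper defined further down in these removed lines (the example_* helper name is invented for illustration):

/* Sketch only: clear DELALLOC on an inclusive byte range, waking any waiters. */
static void example_clear_delalloc(struct extent_io_tree *tree, u64 start, u64 end)
{
        /* wake == 1 kicks sleepers, delete == 0 leaves unrelated bits alone */
        clear_extent_bit(tree, start, end, EXTENT_DELALLOC, 1, 0, NULL);
}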
-
-static void wait_on_state(struct extent_io_tree *tree,
-                         struct extent_state *state)
-               __releases(tree->lock)
-               __acquires(tree->lock)
-{
-       DEFINE_WAIT(wait);
-       prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
-       spin_unlock(&tree->lock);
-       schedule();
-       spin_lock(&tree->lock);
-       finish_wait(&state->wq, &wait);
-}
-
-/*
- * waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function
- */
-static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                           u32 bits)
-{
-       struct extent_state *state;
-       struct rb_node *node;
-
-       btrfs_debug_check_extent_io_range(tree, start, end);
-
-       spin_lock(&tree->lock);
-again:
-       while (1) {
-               /*
-                * this search will find all the extents that end after
-                * our range starts
-                */
-               node = tree_search(tree, start);
-process_node:
-               if (!node)
-                       break;
-
-               state = rb_entry(node, struct extent_state, rb_node);
-
-               if (state->start > end)
-                       goto out;
-
-               if (state->state & bits) {
-                       start = state->start;
-                       refcount_inc(&state->refs);
-                       wait_on_state(tree, state);
-                       free_extent_state(state);
-                       goto again;
-               }
-               start = state->end + 1;
-
-               if (start > end)
-                       break;
-
-               if (!cond_resched_lock(&tree->lock)) {
-                       node = rb_next(node);
-                       goto process_node;
-               }
-       }
-out:
-       spin_unlock(&tree->lock);
-}
-
-static void set_state_bits(struct extent_io_tree *tree,
-                          struct extent_state *state,
-                          u32 bits, struct extent_changeset *changeset)
-{
-       u32 bits_to_set = bits & ~EXTENT_CTLBITS;
-       int ret;
-
-       if (tree->private_data && is_data_inode(tree->private_data))
-               btrfs_set_delalloc_extent(tree->private_data, state, bits);
-
-       if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
-               u64 range = state->end - state->start + 1;
-               tree->dirty_bytes += range;
-       }
-       ret = add_extent_changeset(state, bits_to_set, changeset, 1);
-       BUG_ON(ret < 0);
-       state->state |= bits_to_set;
-}
-
-static void cache_state_if_flags(struct extent_state *state,
-                                struct extent_state **cached_ptr,
-                                unsigned flags)
-{
-       if (cached_ptr && !(*cached_ptr)) {
-               if (!flags || (state->state & flags)) {
-                       *cached_ptr = state;
-                       refcount_inc(&state->refs);
-               }
-       }
-}
-
-static void cache_state(struct extent_state *state,
-                       struct extent_state **cached_ptr)
-{
-       return cache_state_if_flags(state, cached_ptr,
-                                   EXTENT_LOCKED | EXTENT_BOUNDARY);
-}
-
-/*
- * set some bits on a range in the tree.  This may require allocations or
- * sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If any of the exclusive bits are set, this will fail with -EEXIST if some
- * part of the range already has the desired bits set.  The start of the
- * existing range is returned in failed_start in this case.
- *
- * [start, end] is inclusive. This takes the tree lock.
- */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
-                  u32 exclusive_bits, u64 *failed_start,
-                  struct extent_state **cached_state, gfp_t mask,
-                  struct extent_changeset *changeset)
-{
-       struct extent_state *state;
-       struct extent_state *prealloc = NULL;
-       struct rb_node *node;
-       struct rb_node **p;
-       struct rb_node *parent;
-       int err = 0;
-       u64 last_start;
-       u64 last_end;
-
-       btrfs_debug_check_extent_io_range(tree, start, end);
-       trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
-
-       if (exclusive_bits)
-               ASSERT(failed_start);
-       else
-               ASSERT(failed_start == NULL);
-again:
-       if (!prealloc && gfpflags_allow_blocking(mask)) {
-               /*
-                * Don't care for allocation failure here because we might end
-                * up not needing the pre-allocated extent state at all, which
-                * is the case if the tree only has extent states that cover
-                * our input range and don't cover any other range.
-                * If we end up needing a new extent state we allocate it later.
-                */
-               prealloc = alloc_extent_state(mask);
-       }
-
-       spin_lock(&tree->lock);
-       if (cached_state && *cached_state) {
-               state = *cached_state;
-               if (state->start <= start && state->end > start &&
-                   extent_state_in_tree(state)) {
-                       node = &state->rb_node;
-                       goto hit_next;
-               }
-       }
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search_for_insert(tree, start, &p, &parent);
-       if (!node) {
-               prealloc = alloc_extent_state_atomic(prealloc);
-               BUG_ON(!prealloc);
-               prealloc->start = start;
-               prealloc->end = end;
-               insert_state_fast(tree, prealloc, p, parent, bits, changeset);
-               cache_state(prealloc, cached_state);
-               prealloc = NULL;
-               goto out;
-       }
-       state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
-       last_start = state->start;
-       last_end = state->end;
-
-       /*
-        * | ---- desired range ---- |
-        * | state |
-        *
-        * Just lock what we found and keep going
-        */
-       if (state->start == start && state->end <= end) {
-               if (state->state & exclusive_bits) {
-                       *failed_start = state->start;
-                       err = -EEXIST;
-                       goto out;
-               }
-
-               set_state_bits(tree, state, bits, changeset);
-               cache_state(state, cached_state);
-               merge_state(tree, state);
-               if (last_end == (u64)-1)
-                       goto out;
-               start = last_end + 1;
-               state = next_state(state);
-               if (start < end && state && state->start == start &&
-                   !need_resched())
-                       goto hit_next;
-               goto search_again;
-       }
-
-       /*
-        *     | ---- desired range ---- |
-        * | state |
-        *   or
-        * | ------------- state -------------- |
-        *
-        * We need to split the extent we found, and may flip bits on
-        * second half.
-        *
-        * If the extent we found extends past our
-        * range, we just split and search again.  It'll get split
-        * again the next time though.
-        *
-        * If the extent we found is inside our range, we set the
-        * desired bit on it.
-        */
-       if (state->start < start) {
-               if (state->state & exclusive_bits) {
-                       *failed_start = start;
-                       err = -EEXIST;
-                       goto out;
-               }
-
-               /*
-                * If this extent already has all the bits we want set, then
-                * skip it, not necessary to split it or do anything with it.
-                */
-               if ((state->state & bits) == bits) {
-                       start = state->end + 1;
-                       cache_state(state, cached_state);
-                       goto search_again;
-               }
-
-               prealloc = alloc_extent_state_atomic(prealloc);
-               BUG_ON(!prealloc);
-               err = split_state(tree, state, prealloc, start);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-
-               prealloc = NULL;
-               if (err)
-                       goto out;
-               if (state->end <= end) {
-                       set_state_bits(tree, state, bits, changeset);
-                       cache_state(state, cached_state);
-                       merge_state(tree, state);
-                       if (last_end == (u64)-1)
-                               goto out;
-                       start = last_end + 1;
-                       state = next_state(state);
-                       if (start < end && state && state->start == start &&
-                           !need_resched())
-                               goto hit_next;
-               }
-               goto search_again;
-       }
-       /*
-        * | ---- desired range ---- |
-        *     | state | or               | state |
-        *
-        * There's a hole, we need to insert something in it and
-        * ignore the extent we found.
-        */
-       if (state->start > start) {
-               u64 this_end;
-               if (end < last_start)
-                       this_end = end;
-               else
-                       this_end = last_start - 1;
-
-               prealloc = alloc_extent_state_atomic(prealloc);
-               BUG_ON(!prealloc);
-
-               /*
-                * Avoid freeing 'prealloc' if it can be merged with
-                * the later extent.
-                */
-               prealloc->start = start;
-               prealloc->end = this_end;
-               err = insert_state(tree, prealloc, bits, changeset);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-
-               cache_state(prealloc, cached_state);
-               prealloc = NULL;
-               start = this_end + 1;
-               goto search_again;
-       }
-       /*
-        * | ---- desired range ---- |
-        *                        | state |
-        * We need to split the extent, and set the bit
-        * on the first half
-        */
-       if (state->start <= end && state->end > end) {
-               if (state->state & exclusive_bits) {
-                       *failed_start = start;
-                       err = -EEXIST;
-                       goto out;
-               }
-
-               prealloc = alloc_extent_state_atomic(prealloc);
-               BUG_ON(!prealloc);
-               err = split_state(tree, state, prealloc, end + 1);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-
-               set_state_bits(tree, prealloc, bits, changeset);
-               cache_state(prealloc, cached_state);
-               merge_state(tree, prealloc);
-               prealloc = NULL;
-               goto out;
-       }
-
-search_again:
-       if (start > end)
-               goto out;
-       spin_unlock(&tree->lock);
-       if (gfpflags_allow_blocking(mask))
-               cond_resched();
-       goto again;
-
-out:
-       spin_unlock(&tree->lock);
-       if (prealloc)
-               free_extent_state(prealloc);
-
-       return err;
-
-}
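A minimal non-exclusive caller sketch for the set path removed above, mirroring the set_record_extent_bits()/set_extent_bits_nowait() call sites further down (the example_* wrapper is invented):

/* Sketch only: mark an inclusive range delalloc and cache the state for reuse. */
static int example_set_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                                struct extent_state **cached)
{
        /* exclusive_bits == 0, so failed_start must be NULL */
        return set_extent_bit(tree, start, end, EXTENT_DELALLOC,
                              0, NULL, cached, GFP_NOFS, NULL);
}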
-
-/**
- * convert_extent_bit - convert all bits in a given range from one bit to
- *                     another
- * @tree:      the io tree to search
- * @start:     the start offset in bytes
- * @end:       the end offset in bytes (inclusive)
- * @bits:      the bits to set in this range
- * @clear_bits:        the bits to clear in this range
- * @cached_state:      state that we're going to cache
- *
- * This will go through and set bits for the given range.  If any states exist
- * already in this range they are set with the given bit and cleared of the
- * clear_bits.  This is only meant to be used by things that are mergeable, ie
- * converting from say DELALLOC to DIRTY.  This is not meant to be used with
- * boundary bits like LOCK.
- *
- * All allocations are done with GFP_NOFS.
- */
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                      u32 bits, u32 clear_bits,
-                      struct extent_state **cached_state)
-{
-       struct extent_state *state;
-       struct extent_state *prealloc = NULL;
-       struct rb_node *node;
-       struct rb_node **p;
-       struct rb_node *parent;
-       int err = 0;
-       u64 last_start;
-       u64 last_end;
-       bool first_iteration = true;
-
-       btrfs_debug_check_extent_io_range(tree, start, end);
-       trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
-                                      clear_bits);
-
-again:
-       if (!prealloc) {
-               /*
-                * Best effort, don't worry if extent state allocation fails
-                * here for the first iteration. We might have a cached state
-                * that matches exactly the target range, in which case no
-                * extent state allocations are needed. We'll only know this
-                * after locking the tree.
-                */
-               prealloc = alloc_extent_state(GFP_NOFS);
-               if (!prealloc && !first_iteration)
-                       return -ENOMEM;
-       }
-
-       spin_lock(&tree->lock);
-       if (cached_state && *cached_state) {
-               state = *cached_state;
-               if (state->start <= start && state->end > start &&
-                   extent_state_in_tree(state)) {
-                       node = &state->rb_node;
-                       goto hit_next;
-               }
-       }
-
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search_for_insert(tree, start, &p, &parent);
-       if (!node) {
-               prealloc = alloc_extent_state_atomic(prealloc);
-               if (!prealloc) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-               prealloc->start = start;
-               prealloc->end = end;
-               insert_state_fast(tree, prealloc, p, parent, bits, NULL);
-               cache_state(prealloc, cached_state);
-               prealloc = NULL;
-               goto out;
-       }
-       state = rb_entry(node, struct extent_state, rb_node);
-hit_next:
-       last_start = state->start;
-       last_end = state->end;
-
-       /*
-        * | ---- desired range ---- |
-        * | state |
-        *
-        * Just lock what we found and keep going
-        */
-       if (state->start == start && state->end <= end) {
-               set_state_bits(tree, state, bits, NULL);
-               cache_state(state, cached_state);
-               state = clear_state_bit(tree, state, clear_bits, 0, NULL);
-               if (last_end == (u64)-1)
-                       goto out;
-               start = last_end + 1;
-               if (start < end && state && state->start == start &&
-                   !need_resched())
-                       goto hit_next;
-               goto search_again;
-       }
-
-       /*
-        *     | ---- desired range ---- |
-        * | state |
-        *   or
-        * | ------------- state -------------- |
-        *
-        * We need to split the extent we found, and may flip bits on
-        * second half.
-        *
-        * If the extent we found extends past our
-        * range, we just split and search again.  It'll get split
-        * again the next time though.
-        *
-        * If the extent we found is inside our range, we set the
-        * desired bit on it.
-        */
-       if (state->start < start) {
-               prealloc = alloc_extent_state_atomic(prealloc);
-               if (!prealloc) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-               err = split_state(tree, state, prealloc, start);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-               prealloc = NULL;
-               if (err)
-                       goto out;
-               if (state->end <= end) {
-                       set_state_bits(tree, state, bits, NULL);
-                       cache_state(state, cached_state);
-                       state = clear_state_bit(tree, state, clear_bits, 0, NULL);
-                       if (last_end == (u64)-1)
-                               goto out;
-                       start = last_end + 1;
-                       if (start < end && state && state->start == start &&
-                           !need_resched())
-                               goto hit_next;
-               }
-               goto search_again;
-       }
-       /*
-        * | ---- desired range ---- |
-        *     | state | or               | state |
-        *
-        * There's a hole, we need to insert something in it and
-        * ignore the extent we found.
-        */
-       if (state->start > start) {
-               u64 this_end;
-               if (end < last_start)
-                       this_end = end;
-               else
-                       this_end = last_start - 1;
-
-               prealloc = alloc_extent_state_atomic(prealloc);
-               if (!prealloc) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-
-               /*
-                * Avoid freeing 'prealloc' if it can be merged with
-                * the later extent.
-                */
-               prealloc->start = start;
-               prealloc->end = this_end;
-               err = insert_state(tree, prealloc, bits, NULL);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-               cache_state(prealloc, cached_state);
-               prealloc = NULL;
-               start = this_end + 1;
-               goto search_again;
-       }
-       /*
-        * | ---- desired range ---- |
-        *                        | state |
-        * We need to split the extent, and set the bit
-        * on the first half
-        */
-       if (state->start <= end && state->end > end) {
-               prealloc = alloc_extent_state_atomic(prealloc);
-               if (!prealloc) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-
-               err = split_state(tree, state, prealloc, end + 1);
-               if (err)
-                       extent_io_tree_panic(tree, err);
-
-               set_state_bits(tree, prealloc, bits, NULL);
-               cache_state(prealloc, cached_state);
-               clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
-               prealloc = NULL;
-               goto out;
-       }
-
-search_again:
-       if (start > end)
-               goto out;
-       spin_unlock(&tree->lock);
-       cond_resched();
-       first_iteration = false;
-       goto again;
-
-out:
-       spin_unlock(&tree->lock);
-       if (prealloc)
-               free_extent_state(prealloc);
-
-       return err;
-}
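Following the docblock above, a minimal conversion sketch (DELALLOC to DIRTY is the docblock's own example; the wrapper name is invented):

/* Sketch only: convert an inclusive range from DELALLOC to DIRTY. */
static int example_convert_range(struct extent_io_tree *tree, u64 start, u64 end,
                                 struct extent_state **cached)
{
        return convert_extent_bit(tree, start, end,
                                  EXTENT_DIRTY,         /* bits to set */
                                  EXTENT_DELALLOC,      /* bits to clear */
                                  cached);
}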
-
-/* wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                          u32 bits, struct extent_changeset *changeset)
-{
-       /*
-        * We don't support EXTENT_LOCKED yet, as the current changeset will
-        * record any bits changed, so for the EXTENT_LOCKED case it will
-        * either fail with -EEXIST or the changeset will record the whole
-        * range.
-        */
-       BUG_ON(bits & EXTENT_LOCKED);
-
-       return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
-                             changeset);
-}
-
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
-                          u32 bits)
-{
-       return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
-                             GFP_NOWAIT, NULL);
-}
-
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    u32 bits, int wake, int delete,
-                    struct extent_state **cached)
-{
-       return __clear_extent_bit(tree, start, end, bits, wake, delete,
-                                 cached, GFP_NOFS, NULL);
-}
-
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-               u32 bits, struct extent_changeset *changeset)
-{
-       /*
-        * Don't support EXTENT_LOCKED case, same reason as
-        * set_record_extent_bits().
-        */
-       BUG_ON(bits & EXTENT_LOCKED);
-
-       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
-                                 changeset);
-}
-
-/*
- * Either insert or lock the state struct between start and end; if the range
- * is already locked, wait for it to be unlocked before retrying.
- */
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    struct extent_state **cached_state)
-{
-       int err;
-       u64 failed_start;
-
-       while (1) {
-               err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
-                                    EXTENT_LOCKED, &failed_start,
-                                    cached_state, GFP_NOFS, NULL);
-               if (err == -EEXIST) {
-                       wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
-                       start = failed_start;
-               } else
-                       break;
-               WARN_ON(start > end);
-       }
-       return err;
+               return -ENOMEM;
+
+       return 0;
 }
 
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+void __cold extent_buffer_free_cachep(void)
 {
-       int err;
-       u64 failed_start;
-
-       err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-                            &failed_start, NULL, GFP_NOFS, NULL);
-       if (err == -EEXIST) {
-               if (failed_start > start)
-                       clear_extent_bit(tree, start, failed_start - 1,
-                                        EXTENT_LOCKED, 1, 0, NULL);
-               return 0;
-       }
-       return 1;
+       /*
+        * Make sure all delayed rcu free are flushed before we
+        * destroy caches.
+        */
+       rcu_barrier();
+       kmem_cache_destroy(extent_buffer_cache);
 }
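As the '+' call sites later in this patch show (lock_extent()/unlock_extent() taking the tree, the inclusive range and a cached-state pointer), the locking helpers drop the _bits/_cached suffixes; a minimal usage sketch under that assumption:

/* Sketch only: lock an inclusive file range, operate on it, then unlock. */
static void example_locked_range(struct extent_io_tree *tree, u64 start, u64 end)
{
        struct extent_state *cached = NULL;

        lock_extent(tree, start, end, &cached);
        /* ... work on [start, end] while it holds EXTENT_LOCKED ... */
        unlock_extent(tree, start, end, &cached);
}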
 
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
@@ -1554,295 +213,6 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
        }
 }
 
-/* find the first state struct with 'bits' set after 'start', and
- * return it.  tree->lock must be held.  NULL will be returned if
- * nothing was found after 'start'.
- */
-static struct extent_state *
-find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
-{
-       struct rb_node *node;
-       struct extent_state *state;
-
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, start);
-       if (!node)
-               goto out;
-
-       while (1) {
-               state = rb_entry(node, struct extent_state, rb_node);
-               if (state->end >= start && (state->state & bits))
-                       return state;
-
-               node = rb_next(node);
-               if (!node)
-                       break;
-       }
-out:
-       return NULL;
-}
-
-/*
- * Find the first offset in the io tree with one or more @bits set.
- *
- * Note: If there are multiple bits set in @bits, any of them will match.
- *
- * Return 0 if we find something, and update @start_ret and @end_ret.
- * Return 1 if we found nothing.
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-                         u64 *start_ret, u64 *end_ret, u32 bits,
-                         struct extent_state **cached_state)
-{
-       struct extent_state *state;
-       int ret = 1;
-
-       spin_lock(&tree->lock);
-       if (cached_state && *cached_state) {
-               state = *cached_state;
-               if (state->end == start - 1 && extent_state_in_tree(state)) {
-                       while ((state = next_state(state)) != NULL) {
-                               if (state->state & bits)
-                                       goto got_it;
-                       }
-                       free_extent_state(*cached_state);
-                       *cached_state = NULL;
-                       goto out;
-               }
-               free_extent_state(*cached_state);
-               *cached_state = NULL;
-       }
-
-       state = find_first_extent_bit_state(tree, start, bits);
-got_it:
-       if (state) {
-               cache_state_if_flags(state, cached_state, 0);
-               *start_ret = state->start;
-               *end_ret = state->end;
-               ret = 0;
-       }
-out:
-       spin_unlock(&tree->lock);
-       return ret;
-}
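A minimal iteration sketch over find_first_extent_bit() as documented above (EXTENT_DIRTY is chosen purely as an example bit; the helper name is invented):

/* Sketch only: visit every range at/after 'cur' that has EXTENT_DIRTY set. */
static void example_walk_dirty(struct extent_io_tree *tree)
{
        u64 cur = 0, found_start, found_end;

        while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
                                      EXTENT_DIRTY, NULL)) {
                /* [found_start, found_end] has the bit set */
                if (found_end == (u64)-1)
                        break;
                cur = found_end + 1;
        }
}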
-
-/**
- * Find a contiguous area of bits
- *
- * @tree:      io tree to check
- * @start:     offset to start the search from
- * @start_ret: the first offset we found with the bits set
- * @end_ret:   the final contiguous range of the bits that were set
- * @bits:      bits to look for
- *
- * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
- * to set bits appropriately, and then merge them again.  During this time it
- * will drop the tree->lock, so use this helper if you want to find the actual
- * contiguous area for given bits.  We will search to the first bit we find, and
- * then walk down the tree until we find a non-contiguous area.  The area
- * returned will be the full contiguous area with the bits set.
- */
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
-                              u64 *start_ret, u64 *end_ret, u32 bits)
-{
-       struct extent_state *state;
-       int ret = 1;
-
-       spin_lock(&tree->lock);
-       state = find_first_extent_bit_state(tree, start, bits);
-       if (state) {
-               *start_ret = state->start;
-               *end_ret = state->end;
-               while ((state = next_state(state)) != NULL) {
-                       if (state->start > (*end_ret + 1))
-                               break;
-                       *end_ret = state->end;
-               }
-               ret = 0;
-       }
-       spin_unlock(&tree->lock);
-       return ret;
-}
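And the contiguous variant from the docblock above, sketched the same way (again with EXTENT_DIRTY as an arbitrary example bit and an invented wrapper name):

/* Sketch only: report the full contiguous EXTENT_DIRTY area at/after 'start'. */
static bool example_contiguous_dirty(struct extent_io_tree *tree, u64 start,
                                     u64 *area_start, u64 *area_end)
{
        return find_contiguous_extent_bit(tree, start, area_start, area_end,
                                          EXTENT_DIRTY) == 0;
}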
-
-/**
- * Find the first range that has @bits not set. This range could start before
- * @start.
- *
- * @tree:      the tree to search
- * @start:     offset at/after which the found extent should start
- * @start_ret: records the beginning of the range
- * @end_ret:   records the end of the range (inclusive)
- * @bits:      the set of bits which must be unset
- *
- * Since unallocated range is also considered one which doesn't have the bits
- * set it's possible that @end_ret contains -1, this happens in case the range
- * spans (last_range_end, end of device]. In this case it's up to the caller to
- * trim @end_ret to the appropriate size.
- */
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
-                                u64 *start_ret, u64 *end_ret, u32 bits)
-{
-       struct extent_state *state;
-       struct rb_node *node, *prev = NULL, *next;
-
-       spin_lock(&tree->lock);
-
-       /* Find first extent with bits cleared */
-       while (1) {
-               node = tree_search_prev_next(tree, start, &prev, &next);
-               if (!node && !next && !prev) {
-                       /*
-                        * Tree is completely empty, send full range and let
-                        * caller deal with it
-                        */
-                       *start_ret = 0;
-                       *end_ret = -1;
-                       goto out;
-               } else if (!node && !next) {
-                       /*
-                        * We are past the last allocated chunk, set start at
-                        * the end of the last extent.
-                        */
-                       state = rb_entry(prev, struct extent_state, rb_node);
-                       *start_ret = state->end + 1;
-                       *end_ret = -1;
-                       goto out;
-               } else if (!node) {
-                       node = next;
-               }
-               /*
-                * At this point 'node' either contains 'start' or start is
-                * before 'node'
-                */
-               state = rb_entry(node, struct extent_state, rb_node);
-
-               if (in_range(start, state->start, state->end - state->start + 1)) {
-                       if (state->state & bits) {
-                               /*
-                                * |--range with bits sets--|
-                                *    |
-                                *    start
-                                */
-                               start = state->end + 1;
-                       } else {
-                               /*
-                                * 'start' falls within a range that doesn't
-                                * have the bits set, so take its start as
-                                * the beginning of the desired range
-                                *
-                                * |--range with bits cleared----|
-                                *      |
-                                *      start
-                                */
-                               *start_ret = state->start;
-                               break;
-                       }
-               } else {
-                       /*
-                        * |---prev range---|---hole/unset---|---node range---|
-                        *                          |
-                        *                        start
-                        *
-                        *                        or
-                        *
-                        * |---hole/unset--||--first node--|
-                        * 0   |
-                        *    start
-                        */
-                       if (prev) {
-                               state = rb_entry(prev, struct extent_state,
-                                                rb_node);
-                               *start_ret = state->end + 1;
-                       } else {
-                               *start_ret = 0;
-                       }
-                       break;
-               }
-       }
-
-       /*
-        * Find the longest stretch from start until an entry which has the
-        * bits set
-        */
-       while (1) {
-               state = rb_entry(node, struct extent_state, rb_node);
-               if (state->end >= start && !(state->state & bits)) {
-                       *end_ret = state->end;
-               } else {
-                       *end_ret = state->start - 1;
-                       break;
-               }
-
-               node = rb_next(node);
-               if (!node)
-                       break;
-       }
-out:
-       spin_unlock(&tree->lock);
-}
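A sketch of the caller-side trimming the docblock above asks for when @end_ret comes back as -1 (EXTENT_DIRTY is only a stand-in bit; the 'limit' clamp and helper name are assumptions):

/* Sketch only: find a range with the bits unset and clamp the open-ended case. */
static void example_find_clear(struct extent_io_tree *tree, u64 start, u64 limit)
{
        u64 range_start, range_end;

        find_first_clear_extent_bit(tree, start, &range_start, &range_end,
                                    EXTENT_DIRTY);
        if (range_end == (u64)-1)
                range_end = limit;      /* caller trims to the appropriate size */
        /* [range_start, range_end] has none of the bits set */
}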
-
-/*
- * find a contiguous range of bytes in the file marked as delalloc, not
- * more than 'max_bytes'.  'start' and 'end' are used to return the range.
- *
- * true is returned if we find something, false if nothing was in the tree
- */
-bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
-                              u64 *end, u64 max_bytes,
-                              struct extent_state **cached_state)
-{
-       struct rb_node *node;
-       struct extent_state *state;
-       u64 cur_start = *start;
-       bool found = false;
-       u64 total_bytes = 0;
-
-       spin_lock(&tree->lock);
-
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, cur_start);
-       if (!node) {
-               *end = (u64)-1;
-               goto out;
-       }
-
-       while (1) {
-               state = rb_entry(node, struct extent_state, rb_node);
-               if (found && (state->start != cur_start ||
-                             (state->state & EXTENT_BOUNDARY))) {
-                       goto out;
-               }
-               if (!(state->state & EXTENT_DELALLOC)) {
-                       if (!found)
-                               *end = state->end;
-                       goto out;
-               }
-               if (!found) {
-                       *start = state->start;
-                       *cached_state = state;
-                       refcount_inc(&state->refs);
-               }
-               found = true;
-               *end = state->end;
-               cur_start = state->end + 1;
-               node = rb_next(node);
-               total_bytes += state->end - state->start + 1;
-               if (total_bytes >= max_bytes)
-                       break;
-               if (!node)
-                       break;
-       }
-out:
-       spin_unlock(&tree->lock);
-       return found;
-}
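A minimal sketch of using btrfs_find_delalloc_range() as described above (dropping the cached reference at the end assumes free_extent_state() tolerates NULL, which matches how it is used elsewhere in this code; the helper name is invented):

/* Sketch only: look for up to 'max_bytes' of delalloc starting at *start. */
static bool example_find_delalloc(struct extent_io_tree *tree, u64 *start,
                                  u64 *end, u64 max_bytes)
{
        struct extent_state *cached = NULL;
        bool found;

        found = btrfs_find_delalloc_range(tree, start, end, max_bytes, &cached);
        /* on success a reference to the first delalloc state was taken for us */
        free_extent_state(cached);
        return found;
}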
-
 /*
  * Process one page for __process_pages_contig().
  *
@@ -2094,14 +464,14 @@ again:
        }
 
        /* step three, lock the state bits for the whole range */
-       lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
+       lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
 
        /* then test to make sure it is all still delalloc */
        ret = test_range_bit(tree, delalloc_start, delalloc_end,
                             EXTENT_DELALLOC, 1, cached_state);
        if (!ret) {
-               unlock_extent_cached(tree, delalloc_start, delalloc_end,
-                                    &cached_state);
+               unlock_extent(tree, delalloc_start, delalloc_end,
+                             &cached_state);
                __unlock_for_delalloc(inode, locked_page,
                              delalloc_start, delalloc_end);
                cond_resched();
@@ -2118,210 +488,46 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                                  struct page *locked_page,
                                  u32 clear_bits, unsigned long page_ops)
 {
-       clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
+       clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL);
 
        __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
                               start, end, page_ops, NULL);
 }
 
-/*
- * count the number of bytes in the tree that have a given bit(s)
- * set.  This can be fairly slow, except for EXTENT_DIRTY which is
- * cached.  The total number found is returned.
- */
-u64 count_range_bits(struct extent_io_tree *tree,
-                    u64 *start, u64 search_end, u64 max_bytes,
-                    u32 bits, int contig)
+static int insert_failrec(struct btrfs_inode *inode,
+                         struct io_failure_record *failrec)
 {
-       struct rb_node *node;
-       struct extent_state *state;
-       u64 cur_start = *start;
-       u64 total_bytes = 0;
-       u64 last = 0;
-       int found = 0;
-
-       if (WARN_ON(search_end <= cur_start))
-               return 0;
-
-       spin_lock(&tree->lock);
-       if (cur_start == 0 && bits == EXTENT_DIRTY) {
-               total_bytes = tree->dirty_bytes;
-               goto out;
-       }
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, cur_start);
-       if (!node)
-               goto out;
-
-       while (1) {
-               state = rb_entry(node, struct extent_state, rb_node);
-               if (state->start > search_end)
-                       break;
-               if (contig && found && state->start > last + 1)
-                       break;
-               if (state->end >= cur_start && (state->state & bits) == bits) {
-                       total_bytes += min(search_end, state->end) + 1 -
-                                      max(cur_start, state->start);
-                       if (total_bytes >= max_bytes)
-                               break;
-                       if (!found) {
-                               *start = max(cur_start, state->start);
-                               found = 1;
-                       }
-                       last = state->end;
-               } else if (contig && found) {
-                       break;
-               }
-               node = rb_next(node);
-               if (!node)
-                       break;
-       }
-out:
-       spin_unlock(&tree->lock);
-       return total_bytes;
-}
+       struct rb_node *exist;
 
-/*
- * set the private field for a given byte offset in the tree.  If there isn't
- * an extent_state there already, this does nothing.
- */
-int set_state_failrec(struct extent_io_tree *tree, u64 start,
-                     struct io_failure_record *failrec)
-{
-       struct rb_node *node;
-       struct extent_state *state;
-       int ret = 0;
+       spin_lock(&inode->io_failure_lock);
+       exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
+                                &failrec->rb_node);
+       spin_unlock(&inode->io_failure_lock);
 
-       spin_lock(&tree->lock);
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, start);
-       if (!node) {
-               ret = -ENOENT;
-               goto out;
-       }
-       state = rb_entry(node, struct extent_state, rb_node);
-       if (state->start != start) {
-               ret = -ENOENT;
-               goto out;
-       }
-       state->failrec = failrec;
-out:
-       spin_unlock(&tree->lock);
-       return ret;
+       return (exist == NULL) ? 0 : -EEXIST;
 }
 
-struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
+static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
 {
        struct rb_node *node;
-       struct extent_state *state;
-       struct io_failure_record *failrec;
-
-       spin_lock(&tree->lock);
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, start);
-       if (!node) {
-               failrec = ERR_PTR(-ENOENT);
-               goto out;
-       }
-       state = rb_entry(node, struct extent_state, rb_node);
-       if (state->start != start) {
-               failrec = ERR_PTR(-ENOENT);
-               goto out;
-       }
+       struct io_failure_record *failrec = ERR_PTR(-ENOENT);
 
-       failrec = state->failrec;
-out:
-       spin_unlock(&tree->lock);
+       spin_lock(&inode->io_failure_lock);
+       node = rb_simple_search(&inode->io_failure_tree, start);
+       if (node)
+               failrec = rb_entry(node, struct io_failure_record, rb_node);
+       spin_unlock(&inode->io_failure_lock);
        return failrec;
 }
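Going by the insert_failrec()/get_failrec() helpers added above and the failrec fields set later in this patch (rb_node, bytenr, len), a minimal sketch of the new per-inode failure tracking; the kzalloc()/GFP_NOFS allocation and the helper name are assumptions:

/* Sketch only: record a failed sector for 'inode', then look it up again. */
static int example_track_failure(struct btrfs_inode *inode, u64 file_offset, u32 len)
{
        struct io_failure_record *rec;

        rec = kzalloc(sizeof(*rec), GFP_NOFS);
        if (!rec)
                return -ENOMEM;
        RB_CLEAR_NODE(&rec->rb_node);
        rec->bytenr = file_offset;
        rec->len = len;

        if (insert_failrec(inode, rec)) {       /* -EEXIST if already tracked */
                kfree(rec);
                return -EEXIST;
        }

        rec = get_failrec(inode, file_offset);  /* ERR_PTR(-ENOENT) when absent */
        return IS_ERR(rec) ? PTR_ERR(rec) : 0;
}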
 
-/*
- * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if every extent in the range
- * has the bits set.  Otherwise, 1 is returned if any bit in the
- * range is found set.
- */
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  u32 bits, int filled, struct extent_state *cached)
-{
-       struct extent_state *state = NULL;
-       struct rb_node *node;
-       int bitset = 0;
-
-       spin_lock(&tree->lock);
-       if (cached && extent_state_in_tree(cached) && cached->start <= start &&
-           cached->end > start)
-               node = &cached->rb_node;
-       else
-               node = tree_search(tree, start);
-       while (node && start <= end) {
-               state = rb_entry(node, struct extent_state, rb_node);
-
-               if (filled && state->start > start) {
-                       bitset = 0;
-                       break;
-               }
-
-               if (state->start > end)
-                       break;
-
-               if (state->state & bits) {
-                       bitset = 1;
-                       if (!filled)
-                               break;
-               } else if (filled) {
-                       bitset = 0;
-                       break;
-               }
-
-               if (state->end == (u64)-1)
-                       break;
-
-               start = state->end + 1;
-               if (start > end)
-                       break;
-               node = rb_next(node);
-               if (!node) {
-                       if (filled)
-                               bitset = 0;
-                       break;
-               }
-       }
-       spin_unlock(&tree->lock);
-       return bitset;
-}
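A minimal sketch of the 'filled' check described above, matching the call made from the delalloc path earlier in this hunk series:

/* Sketch only: true only if every byte of [start, end] still carries DELALLOC. */
static bool example_still_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                                   struct extent_state *cached)
{
        return test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, cached) == 1;
}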
-
-int free_io_failure(struct extent_io_tree *failure_tree,
-                   struct extent_io_tree *io_tree,
-                   struct io_failure_record *rec)
+static void free_io_failure(struct btrfs_inode *inode,
+                           struct io_failure_record *rec)
 {
-       int ret;
-       int err = 0;
-
-       set_state_failrec(failure_tree, rec->start, NULL);
-       ret = clear_extent_bits(failure_tree, rec->start,
-                               rec->start + rec->len - 1,
-                               EXTENT_LOCKED | EXTENT_DIRTY);
-       if (ret)
-               err = ret;
-
-       ret = clear_extent_bits(io_tree, rec->start,
-                               rec->start + rec->len - 1,
-                               EXTENT_DAMAGED);
-       if (ret && !err)
-               err = ret;
+       spin_lock(&inode->io_failure_lock);
+       rb_erase(&rec->rb_node, &inode->io_failure_tree);
+       spin_unlock(&inode->io_failure_lock);
 
        kfree(rec);
-       return err;
 }
 
 /*
@@ -2456,24 +662,18 @@ static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-int clean_io_failure(struct btrfs_fs_info *fs_info,
-                    struct extent_io_tree *failure_tree,
-                    struct extent_io_tree *io_tree, u64 start,
-                    struct page *page, u64 ino, unsigned int pg_offset)
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+                          struct page *page, unsigned int pg_offset)
 {
-       u64 private;
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct extent_io_tree *io_tree = &inode->io_tree;
+       u64 ino = btrfs_ino(inode);
+       u64 locked_start, locked_end;
        struct io_failure_record *failrec;
-       struct extent_state *state;
        int mirror;
        int ret;
 
-       private = 0;
-       ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
-                              EXTENT_DIRTY, 0);
-       if (!ret)
-               return 0;
-
-       failrec = get_state_failrec(failure_tree, start);
+       failrec = get_failrec(inode, start);
        if (IS_ERR(failrec))
                return 0;
 
@@ -2482,14 +682,10 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
        if (sb_rdonly(fs_info->sb))
                goto out;
 
-       spin_lock(&io_tree->lock);
-       state = find_first_extent_bit_state(io_tree,
-                                           failrec->start,
-                                           EXTENT_LOCKED);
-       spin_unlock(&io_tree->lock);
-
-       if (!state || state->start > failrec->start ||
-           state->end < failrec->start + failrec->len - 1)
+       ret = find_first_extent_bit(io_tree, failrec->bytenr, &locked_start,
+                                   &locked_end, EXTENT_LOCKED, NULL);
+       if (ret || locked_start > failrec->bytenr ||
+           locked_end < failrec->bytenr + failrec->len - 1)
                goto out;
 
        mirror = failrec->this_mirror;
@@ -2500,7 +696,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
        } while (mirror != failrec->failed_mirror);
 
 out:
-       free_io_failure(failure_tree, io_tree, failrec);
+       free_io_failure(inode, failrec);
        return 0;
 }
 
@@ -2512,30 +708,26 @@ out:
  */
 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
 {
-       struct extent_io_tree *failure_tree = &inode->io_failure_tree;
        struct io_failure_record *failrec;
-       struct extent_state *state, *next;
+       struct rb_node *node, *next;
 
-       if (RB_EMPTY_ROOT(&failure_tree->state))
+       if (RB_EMPTY_ROOT(&inode->io_failure_tree))
                return;
 
-       spin_lock(&failure_tree->lock);
-       state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
-       while (state) {
-               if (state->start > end)
+       spin_lock(&inode->io_failure_lock);
+       node = rb_simple_search_first(&inode->io_failure_tree, start);
+       while (node) {
+               failrec = rb_entry(node, struct io_failure_record, rb_node);
+               if (failrec->bytenr > end)
                        break;
 
-               ASSERT(state->end <= end);
-
-               next = next_state(state);
-
-               failrec = state->failrec;
-               free_extent_state(state);
+               next = rb_next(node);
+               rb_erase(&failrec->rb_node, &inode->io_failure_tree);
                kfree(failrec);
 
-               state = next;
+               node = next;
        }
-       spin_unlock(&failure_tree->lock);
+       spin_unlock(&inode->io_failure_lock);
 }
 
 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
@@ -2545,16 +737,14 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 start = bbio->file_offset + bio_offset;
        struct io_failure_record *failrec;
-       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
        const u32 sectorsize = fs_info->sectorsize;
        int ret;
 
-       failrec = get_state_failrec(failure_tree, start);
+       failrec = get_failrec(BTRFS_I(inode), start);
        if (!IS_ERR(failrec)) {
                btrfs_debug(fs_info,
        "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
-                       failrec->logical, failrec->start, failrec->len);
+                       failrec->logical, failrec->bytenr, failrec->len);
                /*
                 * when data can be on disk more than twice, add to failrec here
                 * (e.g. with a list for failed_mirror) to make
@@ -2569,7 +759,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
        if (!failrec)
                return ERR_PTR(-ENOMEM);
 
-       failrec->start = start;
+       RB_CLEAR_NODE(&failrec->rb_node);
+       failrec->bytenr = start;
        failrec->len = sectorsize;
        failrec->failed_mirror = bbio->mirror_num;
        failrec->this_mirror = bbio->mirror_num;
@@ -2594,14 +785,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
        }
 
        /* Set the bits in the private failure tree */
-       ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
-                             EXTENT_LOCKED | EXTENT_DIRTY);
-       if (ret >= 0) {
-               ret = set_state_failrec(failure_tree, start, failrec);
-               /* Set the bits in the inode's tree */
-               ret = set_extent_bits(tree, start, start + sectorsize - 1,
-                                     EXTENT_DAMAGED);
-       } else if (ret < 0) {
+       ret = insert_failrec(BTRFS_I(inode), failrec);
+       if (ret) {
                kfree(failrec);
                return ERR_PTR(ret);
        }
@@ -2616,8 +801,6 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
        u64 start = failed_bbio->file_offset + bio_offset;
        struct io_failure_record *failrec;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
-       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
        struct bio *failed_bio = &failed_bbio->bio;
        const int icsum = bio_offset >> fs_info->sectorsize_bits;
        struct bio *repair_bio;
@@ -2646,17 +829,15 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
                btrfs_debug(fs_info,
                        "failed to repair num_copies %d this_mirror %d failed_mirror %d",
                        failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
-               free_io_failure(failure_tree, tree, failrec);
+               free_io_failure(BTRFS_I(inode), failrec);
                return -EIO;
        }
 
-       repair_bio = btrfs_bio_alloc(1);
+       repair_bio = btrfs_bio_alloc(1, REQ_OP_READ, failed_bbio->end_io,
+                                    failed_bbio->private);
        repair_bbio = btrfs_bio(repair_bio);
        repair_bbio->file_offset = start;
-       repair_bio->bi_opf = REQ_OP_READ;
-       repair_bio->bi_end_io = failed_bio->bi_end_io;
        repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
-       repair_bio->bi_private = failed_bio->bi_private;
 
        if (failed_bbio->csum) {
                const u32 csum_size = fs_info->csum_size;
@@ -2720,8 +901,8 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate)
        if (uptodate)
                set_extent_uptodate(&inode->io_tree, offset,
                                    offset + sectorsize - 1, &cached, GFP_ATOMIC);
-       unlock_extent_cached_atomic(&inode->io_tree, offset,
-                                   offset + sectorsize - 1, &cached);
+       unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1,
+                            &cached);
 }
 
 static void submit_data_read_repair(struct inode *inode,
@@ -2823,8 +1004,9 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_writepage(struct bio *bio)
+static void end_bio_extent_writepage(struct btrfs_bio *bbio)
 {
+       struct bio *bio = &bbio->bio;
        int error = blk_status_to_errno(bio->bi_status);
        struct bio_vec *bvec;
        u64 start;
@@ -2924,11 +1106,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed,
         * Now we don't have range contiguous to the processed range, release
         * the processed range now.
         */
-       if (processed->uptodate && tree->track_uptodate)
-               set_extent_uptodate(tree, processed->start, processed->end,
-                                   &cached, GFP_ATOMIC);
-       unlock_extent_cached_atomic(tree, processed->start, processed->end,
-                                   &cached);
+       unlock_extent_atomic(tree, processed->start, processed->end, &cached);
 
 update:
        /* Update processed to current range */
@@ -2988,11 +1166,10 @@ static struct extent_buffer *find_extent_buffer_readpage(
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_readpage(struct bio *bio)
+static void end_bio_extent_readpage(struct btrfs_bio *bbio)
 {
+       struct bio *bio = &bbio->bio;
        struct bio_vec *bvec;
-       struct btrfs_bio *bbio = btrfs_bio(bio);
-       struct extent_io_tree *tree, *failure_tree;
        struct processed_extent processed = { 0 };
        /*
         * The offset to the beginning of a bio, since one bio can never be
@@ -3019,8 +1196,6 @@ static void end_bio_extent_readpage(struct bio *bio)
                        "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
                        bio->bi_iter.bi_sector, bio->bi_status,
                        bbio->mirror_num);
-               tree = &BTRFS_I(inode)->io_tree;
-               failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
                /*
                 * We always issue full-sector reads, but if some block in a
@@ -3061,9 +1236,7 @@ static void end_bio_extent_readpage(struct bio *bio)
                        loff_t i_size = i_size_read(inode);
                        pgoff_t end_index = i_size >> PAGE_SHIFT;
 
-                       clean_io_failure(BTRFS_I(inode)->root->fs_info,
-                                        failure_tree, tree, start, page,
-                                        btrfs_ino(BTRFS_I(inode)), 0);
+                       btrfs_clean_io_failure(BTRFS_I(inode), start, page, 0);
 
                        /*
                         * Zero out the remaining part if this range straddles
@@ -3141,69 +1314,25 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
 {
        unsigned int allocated;
 
-       for (allocated = 0; allocated < nr_pages;) {
-               unsigned int last = allocated;
-
-               allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
-
-               if (allocated == nr_pages)
-                       return 0;
-
-               /*
-                * During this iteration, no page could be allocated, even
-                * though alloc_pages_bulk_array() falls back to alloc_page()
-                * if  it could not bulk-allocate. So we must be out of memory.
-                */
-               if (allocated == last)
-                       return -ENOMEM;
-
-               memalloc_retry_wait(GFP_NOFS);
-       }
-       return 0;
-}
-
-/*
- * Initialize the members up to but not including 'bio'. Use after allocating a
- * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
- * 'bio' because use of __GFP_ZERO is not supported.
- */
-static inline void btrfs_bio_init(struct btrfs_bio *bbio)
-{
-       memset(bbio, 0, offsetof(struct btrfs_bio, bio));
-}
-
-/*
- * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
- *
- * The bio allocation is backed by bioset and does not fail.
- */
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
-{
-       struct bio *bio;
-
-       ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
-       bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
-       btrfs_bio_init(btrfs_bio(bio));
-       return bio;
-}
-
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
-{
-       struct bio *bio;
-       struct btrfs_bio *bbio;
+       for (allocated = 0; allocated < nr_pages;) {
+               unsigned int last = allocated;
 
-       ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+               allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
 
-       /* this will never fail when it's backed by a bioset */
-       bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
-       ASSERT(bio);
+               if (allocated == nr_pages)
+                       return 0;
 
-       bbio = btrfs_bio(bio);
-       btrfs_bio_init(bbio);
+               /*
+                * During this iteration, no page could be allocated, even
+                * though alloc_pages_bulk_array() falls back to alloc_page()
+                * if  it could not bulk-allocate. So we must be out of memory.
+                */
+               if (allocated == last)
+                       return -ENOMEM;
 
-       bio_trim(bio, offset >> 9, size >> 9);
-       bbio->iter = bio->bi_iter;
-       return bio;
+               memalloc_retry_wait(GFP_NOFS);
+       }
+       return 0;
 }
 
 /**
@@ -3351,7 +1480,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
                         struct btrfs_bio_ctrl *bio_ctrl,
                         struct writeback_control *wbc,
                         blk_opf_t opf,
-                        bio_end_io_t end_io_func,
                         u64 disk_bytenr, u32 offset, u64 file_offset,
                         enum btrfs_compression_type compress_type)
 {
@@ -3359,7 +1487,9 @@ static int alloc_new_bio(struct btrfs_inode *inode,
        struct bio *bio;
        int ret;
 
-       bio = btrfs_bio_alloc(BIO_MAX_VECS);
+       ASSERT(bio_ctrl->end_io_func);
+
+       bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, bio_ctrl->end_io_func, NULL);
        /*
         * For compressed page range, its disk_bytenr is always @disk_bytenr
         * passed in, no matter if we have added any range into previous bio.
@@ -3370,8 +1500,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
                bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
        bio_ctrl->bio = bio;
        bio_ctrl->compress_type = compress_type;
-       bio->bi_end_io = end_io_func;
-       bio->bi_opf = opf;
        ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
        if (ret < 0)
                goto error;
@@ -3410,31 +1538,30 @@ static int alloc_new_bio(struct btrfs_inode *inode,
        return 0;
 error:
        bio_ctrl->bio = NULL;
-       bio->bi_status = errno_to_blk_status(ret);
-       bio_endio(bio);
+       btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
        return ret;
 }
 
 /*
  * @opf:       bio REQ_OP_* and REQ_* flags as one value
  * @wbc:       optional writeback control for io accounting
- * @page:      page to add to the bio
  * @disk_bytenr: logical bytenr where the write will be
+ * @page:      page to add to the bio
  * @size:      portion of page that we want to write to
  * @pg_offset: offset of the new bio or to check whether we are adding
  *              a contiguous page to the previous one
- * @bio_ret:   must be valid pointer, newly allocated bio will be stored there
- * @end_io_func:     end_io callback for new bio
- * @mirror_num:             desired mirror to read/write
- * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
  * @compress_type:   compress type for current bio
+ *
+ * This will either add the page into the existing @bio_ctrl->bio, or allocate
+ * a new one in @bio_ctrl->bio.
+ * The mirror number for this IO should already be initialized in
+ * @bio_ctrl->mirror_num.
  */
 static int submit_extent_page(blk_opf_t opf,
                              struct writeback_control *wbc,
                              struct btrfs_bio_ctrl *bio_ctrl,
-                             struct page *page, u64 disk_bytenr,
+                             u64 disk_bytenr, struct page *page,
                              size_t size, unsigned long pg_offset,
-                             bio_end_io_t end_io_func,
                              enum btrfs_compression_type compress_type,
                              bool force_bio_submit)
 {
@@ -3446,6 +1573,9 @@ static int submit_extent_page(blk_opf_t opf,
 
        ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
               pg_offset + size <= PAGE_SIZE);
+
+       ASSERT(bio_ctrl->end_io_func);
+
        if (force_bio_submit)
                submit_one_bio(bio_ctrl);
 
@@ -3456,7 +1586,7 @@ static int submit_extent_page(blk_opf_t opf,
                /* Allocate new bio if needed */
                if (!bio_ctrl->bio) {
                        ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
-                                           end_io_func, disk_bytenr, offset,
+                                           disk_bytenr, offset,
                                            page_offset(page) + cur,
                                            compress_type);
                        if (ret < 0)
@@ -3613,7 +1743,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
        u64 extent_offset;
        u64 last_byte = i_size_read(inode);
        u64 block_start;
-       u64 cur_end;
        struct extent_map *em;
        int ret = 0;
        size_t pg_offset = 0;
@@ -3623,7 +1752,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 
        ret = set_page_extent_mapped(page);
        if (ret < 0) {
-               unlock_extent(tree, start, end);
+               unlock_extent(tree, start, end, NULL);
                btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
                unlock_page(page);
                goto out;
@@ -3637,6 +1766,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                        memzero_page(page, zero_offset, iosize);
                }
        }
+       bio_ctrl->end_io_func = end_bio_extent_readpage;
        begin_page_read(fs_info, page);
        while (cur <= end) {
                unsigned long this_bio_flag = 0;
@@ -3651,15 +1781,14 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                        memzero_page(page, pg_offset, iosize);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                       unlock_extent_cached(tree, cur,
-                                            cur + iosize - 1, &cached);
+                       unlock_extent(tree, cur, cur + iosize - 1, &cached);
                        end_page_read(page, true, cur, iosize);
                        break;
                }
                em = __get_extent_map(inode, page, pg_offset, cur,
                                      end - cur + 1, em_cached);
                if (IS_ERR(em)) {
-                       unlock_extent(tree, cur, end);
+                       unlock_extent(tree, cur, end, NULL);
                        end_page_read(page, false, cur, end + 1 - cur);
                        ret = PTR_ERR(em);
                        break;
@@ -3672,7 +1801,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
                        this_bio_flag = em->compress_type;
 
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
-               cur_end = min(extent_map_end(em) - 1, end);
                iosize = ALIGN(iosize, blocksize);
                if (this_bio_flag != BTRFS_COMPRESS_NONE)
                        disk_bytenr = em->block_start;
@@ -3735,43 +1863,31 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                       unlock_extent_cached(tree, cur,
-                                            cur + iosize - 1, &cached);
+                       unlock_extent(tree, cur, cur + iosize - 1, &cached);
                        end_page_read(page, true, cur, iosize);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }
                /* the get_extent function already copied into the page */
-               if (test_range_bit(tree, cur, cur_end,
-                                  EXTENT_UPTODATE, 1, NULL)) {
-                       unlock_extent(tree, cur, cur + iosize - 1);
-                       end_page_read(page, true, cur, iosize);
-                       cur = cur + iosize;
-                       pg_offset += iosize;
-                       continue;
-               }
-               /* we have an inline extent but it didn't get marked up
-                * to date.  Error out
-                */
                if (block_start == EXTENT_MAP_INLINE) {
-                       unlock_extent(tree, cur, cur + iosize - 1);
-                       end_page_read(page, false, cur, iosize);
+                       unlock_extent(tree, cur, cur + iosize - 1, NULL);
+                       end_page_read(page, true, cur, iosize);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                }
 
                ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
-                                        bio_ctrl, page, disk_bytenr, iosize,
-                                        pg_offset, end_bio_extent_readpage,
-                                        this_bio_flag, force_bio_submit);
+                                        bio_ctrl, disk_bytenr, page, iosize,
+                                        pg_offset, this_bio_flag,
+                                        force_bio_submit);
                if (ret) {
                        /*
                         * We have to unlock the remaining range, or the page
                         * will never be unlocked.
                         */
-                       unlock_extent(tree, cur, end);
+                       unlock_extent(tree, cur, end, NULL);
                        end_page_read(page, false, cur, end + 1 - cur);
                        goto out;
                }
@@ -3984,6 +2100,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
         */
        wbc->nr_to_write--;
 
+       epd->bio_ctrl.end_io_func = end_bio_extent_writepage;
        while (cur <= end) {
                u64 disk_bytenr;
                u64 em_end;
@@ -4077,10 +2194,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
                btrfs_page_clear_dirty(fs_info, page, cur, iosize);
 
                ret = submit_extent_page(op | write_flags, wbc,
-                                        &epd->bio_ctrl, page,
-                                        disk_bytenr, iosize,
+                                        &epd->bio_ctrl, disk_bytenr,
+                                        page, iosize,
                                         cur - page_offset(page),
-                                        end_bio_extent_writepage,
                                         0, false);
                if (ret) {
                        has_error = true;
@@ -4431,8 +2547,9 @@ static struct extent_buffer *find_extent_buffer_nolock(
  * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
  * after all extent buffers in the page has finished their writeback.
  */
-static void end_bio_subpage_eb_writepage(struct bio *bio)
+static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio)
 {
+       struct bio *bio = &bbio->bio;
        struct btrfs_fs_info *fs_info;
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;
@@ -4488,8 +2605,9 @@ static void end_bio_subpage_eb_writepage(struct bio *bio)
        bio_put(bio);
 }
 
-static void end_bio_extent_buffer_writepage(struct bio *bio)
+static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio)
 {
+       struct bio *bio = &bbio->bio;
        struct bio_vec *bvec;
        struct extent_buffer *eb;
        int done;
@@ -4571,10 +2689,11 @@ static int write_one_subpage_eb(struct extent_buffer *eb,
        if (no_dirty_ebs)
                clear_page_dirty_for_io(page);
 
+       epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage;
+
        ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
-                       &epd->bio_ctrl, page, eb->start, eb->len,
-                       eb->start - page_offset(page),
-                       end_bio_subpage_eb_writepage, 0, false);
+                       &epd->bio_ctrl, eb->start, page, eb->len,
+                       eb->start - page_offset(page), 0, false);
        if (ret) {
                btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
                set_btree_ioerr(page, eb);
@@ -4605,6 +2724,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 
        prepare_eb_write(eb);
 
+       epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage;
+
        num_pages = num_extent_pages(eb);
        for (i = 0; i < num_pages; i++) {
                struct page *p = eb->pages[i];
@@ -4612,10 +2733,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
                clear_page_dirty_for_io(p);
                set_page_writeback(p);
                ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
-                                        &epd->bio_ctrl, p, disk_bytenr,
-                                        PAGE_SIZE, 0,
-                                        end_bio_extent_buffer_writepage,
-                                        0, false);
+                                        &epd->bio_ctrl, disk_bytenr, p,
+                                        PAGE_SIZE, 0, 0, false);
                if (ret) {
                        set_btree_ioerr(p, eb);
                        if (PageWriteback(p))
@@ -5236,7 +3355,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
        if (start > end)
                return 0;
 
-       lock_extent_bits(tree, start, end, &cached_state);
+       lock_extent(tree, start, end, &cached_state);
        folio_wait_writeback(folio);
 
        /*
@@ -5244,7 +3363,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
         * so here we only need to unlock the extent range to free any
         * existing extent state.
         */
-       unlock_extent_cached(tree, start, end, &cached_state);
+       unlock_extent(tree, start, end, &cached_state);
        return 0;
 }
 
@@ -5263,15 +3382,17 @@ static int try_release_extent_state(struct extent_io_tree *tree,
        if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
                ret = 0;
        } else {
+               u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
+                                  EXTENT_DELALLOC_NEW | EXTENT_CTLBITS);
+
                /*
                 * At this point we can safely clear everything except the
                 * locked bit, the nodatasum bit and the delalloc new bit.
                 * The delalloc new bit will be cleared by ordered extent
                 * completion.
                 */
-               ret = __clear_extent_bit(tree, start, end,
-                        ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
-                        0, 0, NULL, mask, NULL);
+               ret = __clear_extent_bit(tree, start, end, clear_bits, NULL,
+                                        mask, NULL);
 
                /* if clear_extent_bit failed for enomem reasons,
                 * we can't allow the release to continue.
@@ -5370,42 +3491,6 @@ next:
 }
 
 /*
- * helper function for fiemap, which doesn't want to see any holes.
- * This maps until we find something past 'last'
- */
-static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
-                                               u64 offset, u64 last)
-{
-       u64 sectorsize = btrfs_inode_sectorsize(inode);
-       struct extent_map *em;
-       u64 len;
-
-       if (offset >= last)
-               return NULL;
-
-       while (1) {
-               len = last - offset;
-               if (len == 0)
-                       break;
-               len = ALIGN(len, sectorsize);
-               em = btrfs_get_extent_fiemap(inode, offset, len);
-               if (IS_ERR(em))
-                       return em;
-
-               /* if this isn't a hole return it */
-               if (em->block_start != EXTENT_MAP_HOLE)
-                       return em;
-
-               /* this is a hole, advance to the next extent */
-               offset = extent_map_end(em);
-               free_extent_map(em);
-               if (offset >= last)
-                       break;
-       }
-       return NULL;
-}
-
-/*
  * To cache previous fiemap extent
  *
  * Will be used for merging fiemap extent
@@ -5434,6 +3519,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
 {
        int ret = 0;
 
+       /* Set at the end of extent_fiemap(). */
+       ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
        if (!cache->cached)
                goto assign;
 
@@ -5457,16 +3545,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
         *    So truly compressed (physical size smaller than logical size)
         *    extents won't get merged with each other
         *
-        * 3) Share same flags except FIEMAP_EXTENT_LAST
-        *    So regular extent won't get merged with prealloc extent
+        * 3) Share same flags
         */
        if (cache->offset + cache->len  == offset &&
            cache->phys + cache->len == phys  &&
-           (cache->flags & ~FIEMAP_EXTENT_LAST) ==
-                       (flags & ~FIEMAP_EXTENT_LAST)) {
+           cache->flags == flags) {
                cache->len += len;
-               cache->flags |= flags;
-               goto try_submit_last;
+               return 0;
        }
 
        /* Not mergeable, need to submit cached one */
@@ -5481,13 +3566,8 @@ assign:
        cache->phys = phys;
        cache->len = len;
        cache->flags = flags;
-try_submit_last:
-       if (cache->flags & FIEMAP_EXTENT_LAST) {
-               ret = fiemap_fill_next_extent(fieinfo, cache->offset,
-                               cache->phys, cache->len, cache->flags);
-               cache->cached = false;
-       }
-       return ret;
+
+       return 0;
 }
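
The merge condition above reduces to contiguity in both file and disk offsets plus identical flags. A minimal standalone sketch of that rule, for illustration only (the struct and names below are simplified stand-ins, not the kernel's struct fiemap_cache):

#include <stdbool.h>
#include <stdint.h>

struct cached_fiemap_entry {
	uint64_t offset;	/* file offset of the cached extent */
	uint64_t phys;		/* disk offset of the cached extent */
	uint64_t len;
	uint32_t flags;
	bool cached;
};

/* True if a new extent starting at the given offsets can extend the cached one. */
bool fiemap_entry_mergeable(const struct cached_fiemap_entry *cache,
			    uint64_t offset, uint64_t phys, uint32_t flags)
{
	return cache->cached &&
	       cache->offset + cache->len == offset &&
	       cache->phys + cache->len == phys &&
	       cache->flags == flags;
}
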
 
 /*
@@ -5517,215 +3597,534 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
        return ret;
 }
 
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
-                 u64 start, u64 len)
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
 {
-       int ret = 0;
-       u64 off;
-       u64 max = start + len;
-       u32 flags = 0;
-       u32 found_type;
-       u64 last;
-       u64 last_for_get_extent = 0;
-       u64 disko = 0;
-       u64 isize = i_size_read(&inode->vfs_inode);
-       struct btrfs_key found_key;
-       struct extent_map *em = NULL;
-       struct extent_state *cached_state = NULL;
-       struct btrfs_path *path;
-       struct btrfs_root *root = inode->root;
-       struct fiemap_cache cache = { 0 };
-       struct ulist *roots;
-       struct ulist *tmp_ulist;
-       int end = 0;
-       u64 em_start = 0;
-       u64 em_len = 0;
-       u64 em_end = 0;
+       struct extent_buffer *clone;
+       struct btrfs_key key;
+       int slot;
+       int ret;
 
-       if (len == 0)
-               return -EINVAL;
+       path->slots[0]++;
+       if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+               return 0;
 
-       path = btrfs_alloc_path();
-       if (!path)
+       ret = btrfs_next_leaf(inode->root, path);
+       if (ret != 0)
+               return ret;
+
+       /*
+        * Don't bother with cloning if there are no more file extent items for
+        * our inode.
+        */
+       btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+       if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
+               return 1;
+
+       /* See the comment at fiemap_search_slot() about why we clone. */
+       clone = btrfs_clone_extent_buffer(path->nodes[0]);
+       if (!clone)
                return -ENOMEM;
 
-       roots = ulist_alloc(GFP_KERNEL);
-       tmp_ulist = ulist_alloc(GFP_KERNEL);
-       if (!roots || !tmp_ulist) {
-               ret = -ENOMEM;
-               goto out_free_ulist;
+       slot = path->slots[0];
+       btrfs_release_path(path);
+       path->nodes[0] = clone;
+       path->slots[0] = slot;
+
+       return 0;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+                             u64 file_offset)
+{
+       const u64 ino = btrfs_ino(inode);
+       struct btrfs_root *root = inode->root;
+       struct extent_buffer *clone;
+       struct btrfs_key key;
+       int slot;
+       int ret;
+
+       key.objectid = ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = file_offset;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               return ret;
+
+       if (ret > 0 && path->slots[0] > 0) {
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+               if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+                       path->slots[0]--;
+       }
+
+       if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+               ret = btrfs_next_leaf(root, path);
+               if (ret != 0)
+                       return ret;
+
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+               if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+                       return 1;
        }
 
        /*
-        * We can't initialize that to 'start' as this could miss extents due
-        * to extent item merging
+        * We clone the leaf and use it during fiemap. This is because while
+        * using the leaf we do expensive things like checking if an extent is
+        * shared, which can take a long time. In order to prevent blocking
+        * other tasks for too long, we use a clone of the leaf. We have locked
+        * the file range in the inode's io tree, so we know none of our file
+        * extent items can change. This way we avoid blocking other tasks that
+        * want to insert items for other inodes in the same leaf or b+tree
+        * rebalance operations (triggered for example when someone is trying
+        * to push items into this leaf when trying to insert an item in a
+        * neighbour leaf).
+        * We also need the private clone because holding a read lock on an
+        * extent buffer of the subvolume's b+tree will make lockdep unhappy
+        * when we call fiemap_fill_next_extent(), because that may cause a page
+        * fault when filling the user space buffer with fiemap data.
         */
-       off = 0;
-       start = round_down(start, btrfs_inode_sectorsize(inode));
-       len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
+       clone = btrfs_clone_extent_buffer(path->nodes[0]);
+       if (!clone)
+               return -ENOMEM;
+
+       slot = path->slots[0];
+       btrfs_release_path(path);
+       path->nodes[0] = clone;
+       path->slots[0] = slot;
+
+       return 0;
+}
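
fiemap_search_slot() relies on the usual btrfs_search_slot() idiom: a non-exact match leaves the path one slot past the last item that starts before the target offset, so the code steps back one slot. A rough analogue on a plain sorted array, for illustration only (names are made up for this sketch):

#include <stddef.h>

/* Return the index of the last start[] entry <= offset, or -1 if none. */
int find_slot(const unsigned long long *start, size_t nr, unsigned long long offset)
{
	size_t lo = 0, hi = nr;

	while (lo < hi) {
		size_t mid = lo + (hi - lo) / 2;

		if (start[mid] <= offset)
			lo = mid + 1;
		else
			hi = mid;
	}
	/* 'lo' is the insertion point; step back one, like path->slots[0]--. */
	return (int)lo - 1;
}
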
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+                              struct fiemap_extent_info *fieinfo,
+                              struct fiemap_cache *cache,
+                              struct btrfs_backref_shared_cache *backref_cache,
+                              u64 disk_bytenr, u64 extent_offset,
+                              u64 extent_gen,
+                              struct ulist *roots, struct ulist *tmp_ulist,
+                              u64 start, u64 end)
+{
+       const u64 i_size = i_size_read(&inode->vfs_inode);
+       const u64 ino = btrfs_ino(inode);
+       u64 cur_offset = start;
+       u64 last_delalloc_end = 0;
+       u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+       bool checked_extent_shared = false;
+       int ret;
 
        /*
-        * lookup the last file extent.  We're not using i_size here
-        * because there might be preallocation past i_size
+        * There can be no delalloc past i_size, so don't waste time looking for
+        * it beyond i_size.
         */
-       ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
-                                      0);
-       if (ret < 0) {
-               goto out_free_ulist;
-       } else {
-               WARN_ON(!ret);
-               if (ret == 1)
-                       ret = 0;
-       }
+       while (cur_offset < end && cur_offset < i_size) {
+               u64 delalloc_start;
+               u64 delalloc_end;
+               u64 prealloc_start;
+               u64 prealloc_len = 0;
+               bool delalloc;
+
+               delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+                                                       &delalloc_start,
+                                                       &delalloc_end);
+               if (!delalloc)
+                       break;
 
-       path->slots[0]--;
-       btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
-       found_type = found_key.type;
-
-       /* No extents, but there might be delalloc bits */
-       if (found_key.objectid != btrfs_ino(inode) ||
-           found_type != BTRFS_EXTENT_DATA_KEY) {
-               /* have to trust i_size as the end */
-               last = (u64)-1;
-               last_for_get_extent = isize;
-       } else {
                /*
-                * remember the start of the last extent.  There are a
-                * bunch of different factors that go into the length of the
-                * extent, so its much less complex to remember where it started
+                * If this is a prealloc extent we have to report every section
+                * of it that has no delalloc.
                 */
-               last = found_key.offset;
-               last_for_get_extent = last + 1;
+               if (disk_bytenr != 0) {
+                       if (last_delalloc_end == 0) {
+                               prealloc_start = start;
+                               prealloc_len = delalloc_start - start;
+                       } else {
+                               prealloc_start = last_delalloc_end + 1;
+                               prealloc_len = delalloc_start - prealloc_start;
+                       }
+               }
+
+               if (prealloc_len > 0) {
+                       if (!checked_extent_shared && fieinfo->fi_extents_max) {
+                               ret = btrfs_is_data_extent_shared(inode->root,
+                                                         ino, disk_bytenr,
+                                                         extent_gen, roots,
+                                                         tmp_ulist,
+                                                         backref_cache);
+                               if (ret < 0)
+                                       return ret;
+                               else if (ret > 0)
+                                       prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+                               checked_extent_shared = true;
+                       }
+                       ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+                                                disk_bytenr + extent_offset,
+                                                prealloc_len, prealloc_flags);
+                       if (ret)
+                               return ret;
+                       extent_offset += prealloc_len;
+               }
+
+               ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+                                        delalloc_end + 1 - delalloc_start,
+                                        FIEMAP_EXTENT_DELALLOC |
+                                        FIEMAP_EXTENT_UNKNOWN);
+               if (ret)
+                       return ret;
+
+               last_delalloc_end = delalloc_end;
+               cur_offset = delalloc_end + 1;
+               extent_offset += cur_offset - delalloc_start;
+               cond_resched();
        }
-       btrfs_release_path(path);
 
        /*
-        * we might have some extents allocated but more delalloc past those
-        * extents.  so, we trust isize unless the start of the last extent is
-        * beyond isize
+        * Either we found no delalloc for the whole prealloc extent or we have
+        * a prealloc extent that spans i_size or starts at or after i_size.
         */
-       if (last < isize) {
-               last = (u64)-1;
-               last_for_get_extent = isize;
+       if (disk_bytenr != 0 && last_delalloc_end < end) {
+               u64 prealloc_start;
+               u64 prealloc_len;
+
+               if (last_delalloc_end == 0) {
+                       prealloc_start = start;
+                       prealloc_len = end + 1 - start;
+               } else {
+                       prealloc_start = last_delalloc_end + 1;
+                       prealloc_len = end + 1 - prealloc_start;
+               }
+
+               if (!checked_extent_shared && fieinfo->fi_extents_max) {
+                       ret = btrfs_is_data_extent_shared(inode->root,
+                                                         ino, disk_bytenr,
+                                                         extent_gen, roots,
+                                                         tmp_ulist,
+                                                         backref_cache);
+                       if (ret < 0)
+                               return ret;
+                       else if (ret > 0)
+                               prealloc_flags |= FIEMAP_EXTENT_SHARED;
+               }
+               ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+                                        disk_bytenr + extent_offset,
+                                        prealloc_len, prealloc_flags);
+               if (ret)
+                       return ret;
        }
 
-       lock_extent_bits(&inode->io_tree, start, start + len - 1,
-                        &cached_state);
+       return 0;
+}
+
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+                                         struct btrfs_path *path,
+                                         u64 *last_extent_end_ret)
+{
+       const u64 ino = btrfs_ino(inode);
+       struct btrfs_root *root = inode->root;
+       struct extent_buffer *leaf;
+       struct btrfs_file_extent_item *ei;
+       struct btrfs_key key;
+       u64 disk_bytenr;
+       int ret;
 
-       em = get_extent_skip_holes(inode, start, last_for_get_extent);
-       if (!em)
-               goto out;
-       if (IS_ERR(em)) {
-               ret = PTR_ERR(em);
+       /*
+        * Lookup the last file extent. We're not using i_size here because
+        * there might be preallocation past i_size.
+        */
+       ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+       /* There can't be a file extent item at offset (u64)-1 */
+       ASSERT(ret != 0);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * For a non-existing key, btrfs_search_slot() always leaves us at a
+        * slot > 0, except if the btree is empty, which is impossible because
+        * at least it has the inode item for this inode and all the items for
+        * the root inode 256.
+        */
+       ASSERT(path->slots[0] > 0);
+       path->slots[0]--;
+       leaf = path->nodes[0];
+       btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+       if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+               /* No file extent items in the subvolume tree. */
+               *last_extent_end_ret = 0;
+               return 0;
+       }
+
+       /*
+        * For an inline extent, the disk_bytenr is where inline data starts at,
+        * so first check if we have an inline extent item before checking if we
+        * have an implicit hole (disk_bytenr == 0).
+        */
+       ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+       if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+               *last_extent_end_ret = btrfs_file_extent_end(path);
+               return 0;
+       }
+
+       /*
+        * Find the last file extent item that is not a hole (when NO_HOLES is
+        * not enabled). This should take at most 2 iterations in the worst
+        * case: we have one hole file extent item at slot 0 of a leaf and
+        * another hole file extent item as the last item in the previous leaf.
+        * This is because we merge file extent items that represent holes.
+        */
+       disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+       while (disk_bytenr == 0) {
+               ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+               if (ret < 0) {
+                       return ret;
+               } else if (ret > 0) {
+                       /* No file extent items that are not holes. */
+                       *last_extent_end_ret = 0;
+                       return 0;
+               }
+               leaf = path->nodes[0];
+               ei = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+       }
+
+       *last_extent_end_ret = btrfs_file_extent_end(path);
+       return 0;
+}
+
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+                 u64 start, u64 len)
+{
+       const u64 ino = btrfs_ino(inode);
+       struct extent_state *cached_state = NULL;
+       struct btrfs_path *path;
+       struct btrfs_root *root = inode->root;
+       struct fiemap_cache cache = { 0 };
+       struct btrfs_backref_shared_cache *backref_cache;
+       struct ulist *roots;
+       struct ulist *tmp_ulist;
+       u64 last_extent_end;
+       u64 prev_extent_end;
+       u64 lockstart;
+       u64 lockend;
+       bool stopped = false;
+       int ret;
+
+       backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
+       path = btrfs_alloc_path();
+       roots = ulist_alloc(GFP_KERNEL);
+       tmp_ulist = ulist_alloc(GFP_KERNEL);
+       if (!backref_cache || !path || !roots || !tmp_ulist) {
+               ret = -ENOMEM;
                goto out;
        }
 
-       while (!end) {
-               u64 offset_in_extent = 0;
+       lockstart = round_down(start, root->fs_info->sectorsize);
+       lockend = round_up(start + len, root->fs_info->sectorsize);
+       prev_extent_end = lockstart;
 
-               /* break if the extent we found is outside the range */
-               if (em->start >= max || extent_map_end(em) < off)
-                       break;
+       lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 
-               /*
-                * get_extent may return an extent that starts before our
-                * requested range.  We have to make sure the ranges
-                * we return to fiemap always move forward and don't
-                * overlap, so adjust the offsets here
-                */
-               em_start = max(em->start, off);
+       ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+       if (ret < 0)
+               goto out_unlock;
+       btrfs_release_path(path);
 
+       path->reada = READA_FORWARD;
+       ret = fiemap_search_slot(inode, path, lockstart);
+       if (ret < 0) {
+               goto out_unlock;
+       } else if (ret > 0) {
                /*
-                * record the offset from the start of the extent
-                * for adjusting the disk offset below.  Only do this if the
-                * extent isn't compressed since our in ram offset may be past
-                * what we have actually allocated on disk.
+                * No file extent item found, but we may have delalloc between
+                * the current offset and i_size. So check for that.
                 */
-               if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
-                       offset_in_extent = em_start - em->start;
-               em_end = extent_map_end(em);
-               em_len = em_end - em_start;
-               flags = 0;
-               if (em->block_start < EXTENT_MAP_LAST_BYTE)
-                       disko = em->block_start + offset_in_extent;
-               else
-                       disko = 0;
+               ret = 0;
+               goto check_eof_delalloc;
+       }
+
+       while (prev_extent_end < lockend) {
+               struct extent_buffer *leaf = path->nodes[0];
+               struct btrfs_file_extent_item *ei;
+               struct btrfs_key key;
+               u64 extent_end;
+               u64 extent_len;
+               u64 extent_offset = 0;
+               u64 extent_gen;
+               u64 disk_bytenr = 0;
+               u64 flags = 0;
+               int extent_type;
+               u8 compression;
+
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+                       break;
+
+               extent_end = btrfs_file_extent_end(path);
 
                /*
-                * bump off for our next call to get_extent
+                * The first iteration can leave us at an extent item that ends
+                * before our range's start. Move to the next item.
                 */
-               off = extent_map_end(em);
-               if (off >= max)
-                       end = 1;
-
-               if (em->block_start == EXTENT_MAP_LAST_BYTE) {
-                       end = 1;
-                       flags |= FIEMAP_EXTENT_LAST;
-               } else if (em->block_start == EXTENT_MAP_INLINE) {
-                       flags |= (FIEMAP_EXTENT_DATA_INLINE |
-                                 FIEMAP_EXTENT_NOT_ALIGNED);
-               } else if (em->block_start == EXTENT_MAP_DELALLOC) {
-                       flags |= (FIEMAP_EXTENT_DELALLOC |
-                                 FIEMAP_EXTENT_UNKNOWN);
-               } else if (fieinfo->fi_extents_max) {
-                       u64 bytenr = em->block_start -
-                               (em->start - em->orig_start);
+               if (extent_end <= lockstart)
+                       goto next_item;
 
-                       /*
-                        * As btrfs supports shared space, this information
-                        * can be exported to userspace tools via
-                        * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
-                        * then we're just getting a count and we can skip the
-                        * lookup stuff.
-                        */
-                       ret = btrfs_check_shared(root, btrfs_ino(inode),
-                                                bytenr, roots, tmp_ulist);
-                       if (ret < 0)
-                               goto out_free;
-                       if (ret)
-                               flags |= FIEMAP_EXTENT_SHARED;
-                       ret = 0;
+               /* We have an implicit hole (NO_HOLES feature enabled). */
+               if (prev_extent_end < key.offset) {
+                       const u64 range_end = min(key.offset, lockend) - 1;
+
+                       ret = fiemap_process_hole(inode, fieinfo, &cache,
+                                                 backref_cache, 0, 0, 0,
+                                                 roots, tmp_ulist,
+                                                 prev_extent_end, range_end);
+                       if (ret < 0) {
+                               goto out_unlock;
+                       } else if (ret > 0) {
+                               /* fiemap_fill_next_extent() told us to stop. */
+                               stopped = true;
+                               break;
+                       }
+
+                       /* We've reached the end of the fiemap range, stop. */
+                       if (key.offset >= lockend) {
+                               stopped = true;
+                               break;
+                       }
                }
-               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+
+               extent_len = extent_end - key.offset;
+               ei = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               compression = btrfs_file_extent_compression(leaf, ei);
+               extent_type = btrfs_file_extent_type(leaf, ei);
+               extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+               if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+                       disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+                       if (compression == BTRFS_COMPRESS_NONE)
+                               extent_offset = btrfs_file_extent_offset(leaf, ei);
+               }
+
+               if (compression != BTRFS_COMPRESS_NONE)
                        flags |= FIEMAP_EXTENT_ENCODED;
-               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-                       flags |= FIEMAP_EXTENT_UNWRITTEN;
 
-               free_extent_map(em);
-               em = NULL;
-               if ((em_start >= last) || em_len == (u64)-1 ||
-                  (last == (u64)-1 && isize <= em_end)) {
-                       flags |= FIEMAP_EXTENT_LAST;
-                       end = 1;
+               if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                       flags |= FIEMAP_EXTENT_DATA_INLINE;
+                       flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+                       ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+                                                extent_len, flags);
+               } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                       ret = fiemap_process_hole(inode, fieinfo, &cache,
+                                                 backref_cache,
+                                                 disk_bytenr, extent_offset,
+                                                 extent_gen, roots, tmp_ulist,
+                                                 key.offset, extent_end - 1);
+               } else if (disk_bytenr == 0) {
+                       /* We have an explicit hole. */
+                       ret = fiemap_process_hole(inode, fieinfo, &cache,
+                                                 backref_cache, 0, 0, 0,
+                                                 roots, tmp_ulist,
+                                                 key.offset, extent_end - 1);
+               } else {
+                       /* We have a regular extent. */
+                       if (fieinfo->fi_extents_max) {
+                               ret = btrfs_is_data_extent_shared(root, ino,
+                                                                 disk_bytenr,
+                                                                 extent_gen,
+                                                                 roots,
+                                                                 tmp_ulist,
+                                                                 backref_cache);
+                               if (ret < 0)
+                                       goto out_unlock;
+                               else if (ret > 0)
+                                       flags |= FIEMAP_EXTENT_SHARED;
+                       }
+
+                       ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+                                                disk_bytenr + extent_offset,
+                                                extent_len, flags);
                }
 
-               /* now scan forward to see if this is really the last extent. */
-               em = get_extent_skip_holes(inode, off, last_for_get_extent);
-               if (IS_ERR(em)) {
-                       ret = PTR_ERR(em);
-                       goto out;
+               if (ret < 0) {
+                       goto out_unlock;
+               } else if (ret > 0) {
+                       /* fiemap_fill_next_extent() told us to stop. */
+                       stopped = true;
+                       break;
                }
-               if (!em) {
-                       flags |= FIEMAP_EXTENT_LAST;
-                       end = 1;
+
+               prev_extent_end = extent_end;
+next_item:
+               if (fatal_signal_pending(current)) {
+                       ret = -EINTR;
+                       goto out_unlock;
                }
-               ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
-                                          em_len, flags);
-               if (ret) {
-                       if (ret == 1)
-                               ret = 0;
-                       goto out_free;
+
+               ret = fiemap_next_leaf_item(inode, path);
+               if (ret < 0) {
+                       goto out_unlock;
+               } else if (ret > 0) {
+                       /* No more file extent items for this inode. */
+                       break;
                }
+               cond_resched();
        }
-out_free:
-       if (!ret)
-               ret = emit_last_fiemap_cache(fieinfo, &cache);
-       free_extent_map(em);
-out:
-       unlock_extent_cached(&inode->io_tree, start, start + len - 1,
-                            &cached_state);
 
-out_free_ulist:
+check_eof_delalloc:
+       /*
+        * Release (and free) the path before emitting any final entries to
+        * fiemap_fill_next_extent() to keep lockdep happy. This is because
+        * once we find no more file extent items exist, we may have a
+        * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
+        * faults when copying data to the user space buffer.
+        */
+       btrfs_free_path(path);
+       path = NULL;
+
+       if (!stopped && prev_extent_end < lockend) {
+               ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache,
+                                         0, 0, 0, roots, tmp_ulist,
+                                         prev_extent_end, lockend - 1);
+               if (ret < 0)
+                       goto out_unlock;
+               prev_extent_end = lockend;
+       }
+
+       if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+               const u64 i_size = i_size_read(&inode->vfs_inode);
+
+               if (prev_extent_end < i_size) {
+                       u64 delalloc_start;
+                       u64 delalloc_end;
+                       bool delalloc;
+
+                       delalloc = btrfs_find_delalloc_in_range(inode,
+                                                               prev_extent_end,
+                                                               i_size - 1,
+                                                               &delalloc_start,
+                                                               &delalloc_end);
+                       if (!delalloc)
+                               cache.flags |= FIEMAP_EXTENT_LAST;
+               } else {
+                       cache.flags |= FIEMAP_EXTENT_LAST;
+               }
+       }
+
+       ret = emit_last_fiemap_cache(fieinfo, &cache);
+
+out_unlock:
+       unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+out:
+       kfree(backref_cache);
        btrfs_free_path(path);
        ulist_free(roots);
        ulist_free(tmp_ulist);
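
For context, the reworked extent_fiemap() above is what ultimately backs the generic FS_IOC_FIEMAP ioctl, emitting extents (and flags such as FIEMAP_EXTENT_SHARED, FIEMAP_EXTENT_DELALLOC and FIEMAP_EXTENT_LAST) through fiemap_fill_next_extent(). A minimal userspace sketch of a caller, for illustration only and not part of this series (the 128-extent buffer is an arbitrary choice and error handling is kept to a minimum):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	const unsigned int max_extents = 128;
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + max_extents * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush delalloc first */
	fm->fm_extent_count = max_extents;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		const struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu len %llu%s%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_SHARED) ? " shared" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " last" : "");
	}
	free(fm);
	close(fd);
	return 0;
}
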
@@ -5856,7 +4255,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 {
        btrfs_release_extent_buffer_pages(eb);
-       btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+       btrfs_leak_debug_del_eb(eb);
        __free_extent_buffer(eb);
 }
 
@@ -5873,8 +4272,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
        eb->bflags = 0;
        init_rwsem(&eb->lock);
 
-       btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
-                            &fs_info->allocated_ebs);
+       btrfs_leak_debug_add_eb(eb);
        INIT_LIST_HEAD(&eb->release_list);
 
        spin_lock_init(&eb->refs_lock);
@@ -6342,7 +4740,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
                        spin_unlock(&eb->refs_lock);
                }
 
-               btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
+               btrfs_leak_debug_del_eb(eb);
                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_pages(eb);
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -6362,18 +4760,16 @@ static int release_extent_buffer(struct extent_buffer *eb)
 void free_extent_buffer(struct extent_buffer *eb)
 {
        int refs;
-       int old;
        if (!eb)
                return;
 
+       refs = atomic_read(&eb->refs);
        while (1) {
-               refs = atomic_read(&eb->refs);
                if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
                    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
                        refs == 1))
                        break;
-               old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
-               if (old == refs)
+               if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
                        return;
        }
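
The loop above switches from atomic_cmpxchg() to atomic_try_cmpxchg(), which on failure writes the current counter value back into 'refs', so the reference count only needs to be read once before the loop. A rough illustration of the same pattern using C11 atomics (not kernel code; names are made up for this sketch):

#include <stdatomic.h>
#include <stdbool.h>

/* Try to drop one reference as long as more than 'min_refs' are held. */
bool put_ref_fast(atomic_int *refcount, int min_refs)
{
	int refs = atomic_load(refcount);

	while (refs > min_refs) {
		/* On failure, 'refs' is refreshed with the current value. */
		if (atomic_compare_exchange_weak(refcount, &refs, refs - 1))
			return true;	/* dropped a reference, fast path done */
	}
	return false;	/* caller must take the slow path */
}
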
 
@@ -6569,7 +4965,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
                if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
                        return -EAGAIN;
        } else {
-               ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+               ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
                if (ret < 0)
                        return ret;
        }
@@ -6579,7 +4975,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
            PageUptodate(page) ||
            btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-               unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
+               unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL);
                return ret;
        }
 
@@ -6587,13 +4983,14 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
        eb->read_mirror = 0;
        atomic_set(&eb->io_pages, 1);
        check_buffer_tree_ref(eb);
+       bio_ctrl.end_io_func = end_bio_extent_readpage;
+
        btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
 
        btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
        ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
-                                page, eb->start, eb->len,
-                                eb->start - page_offset(page),
-                                end_bio_extent_readpage, 0, true);
+                                eb->start, page, eb->len,
+                                eb->start - page_offset(page), 0, true);
        if (ret) {
                /*
                 * In the endio function, if we hit something wrong we will
@@ -6684,6 +5081,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
         * set io_pages. See check_buffer_tree_ref for a more detailed comment.
         */
        check_buffer_tree_ref(eb);
+       bio_ctrl.end_io_func = end_bio_extent_readpage;
        for (i = 0; i < num_pages; i++) {
                page = eb->pages[i];
 
@@ -6696,9 +5094,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
 
                        ClearPageError(page);
                        err = submit_extent_page(REQ_OP_READ, NULL,
-                                        &bio_ctrl, page, page_offset(page),
-                                        PAGE_SIZE, 0, end_bio_extent_readpage,
-                                        0, false);
+                                        &bio_ctrl, page_offset(page), page,
+                                        PAGE_SIZE, 0, 0, false);
                        if (err) {
                                /*
                                 * We failed to submit the bio so it's the
index 4bc72a8..7929f05 100644 (file)
@@ -60,11 +60,13 @@ enum {
 struct btrfs_bio;
 struct btrfs_root;
 struct btrfs_inode;
-struct btrfs_io_bio;
 struct btrfs_fs_info;
 struct io_failure_record;
 struct extent_io_tree;
 
+int __init extent_buffer_init_cachep(void);
+void __cold extent_buffer_free_cachep(void);
+
 typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
                                         int mirror_num,
                                         enum btrfs_compression_type compress_type);
@@ -240,10 +242,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
                                  struct page *locked_page,
                                  u32 bits_to_clear, unsigned long page_ops);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+                           struct folio *folio, size_t offset);
 
 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
-struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
-struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
 
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
@@ -257,8 +259,12 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
  * bio end_io callback is called to indicate things have failed.
  */
 struct io_failure_record {
+       /* Use rb_simple_node for search/insert */
+       struct {
+               struct rb_node rb_node;
+               u64 bytenr;
+       };
        struct page *page;
-       u64 start;
        u64 len;
        u64 logical;
        int this_mirror;
@@ -269,6 +275,9 @@ struct io_failure_record {
 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
                            u32 bio_offset, struct page *page, unsigned int pgoff,
                            submit_bio_hook_t *submit_bio_hook);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
+int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
+                          struct page *page, unsigned int pg_offset);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 bool find_lock_delalloc_range(struct inode *inode,
index 6fee14c..6092a4e 100644 (file)
@@ -7,6 +7,7 @@
 #include "volumes.h"
 #include "extent_map.h"
 #include "compression.h"
+#include "btrfs_inode.h"
 
 
 static struct kmem_cache *extent_map_cache;
@@ -54,9 +55,7 @@ struct extent_map *alloc_extent_map(void)
        if (!em)
                return NULL;
        RB_CLEAR_NODE(&em->rb_node);
-       em->flags = 0;
        em->compress_type = BTRFS_COMPRESS_NONE;
-       em->generation = 0;
        refcount_set(&em->refs, 1);
        INIT_LIST_HEAD(&em->list);
        return em;
@@ -73,7 +72,6 @@ void free_extent_map(struct extent_map *em)
 {
        if (!em)
                return;
-       WARN_ON(refcount_read(&em->refs) == 0);
        if (refcount_dec_and_test(&em->refs)) {
                WARN_ON(extent_map_in_tree(em));
                WARN_ON(!list_empty(&em->list));
@@ -143,8 +141,7 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
  * it can't be found, try to find some neighboring extents
  */
 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
-                                    struct rb_node **prev_ret,
-                                    struct rb_node **next_ret)
+                                    struct rb_node **prev_or_next_ret)
 {
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
@@ -152,6 +149,8 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
        struct extent_map *entry;
        struct extent_map *prev_entry = NULL;
 
+       ASSERT(prev_or_next_ret);
+
        while (n) {
                entry = rb_entry(n, struct extent_map, rb_node);
                prev = n;
@@ -165,24 +164,29 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
                        return n;
        }
 
-       if (prev_ret) {
-               orig_prev = prev;
-               while (prev && offset >= extent_map_end(prev_entry)) {
-                       prev = rb_next(prev);
-                       prev_entry = rb_entry(prev, struct extent_map, rb_node);
-               }
-               *prev_ret = prev;
-               prev = orig_prev;
+       orig_prev = prev;
+       while (prev && offset >= extent_map_end(prev_entry)) {
+               prev = rb_next(prev);
+               prev_entry = rb_entry(prev, struct extent_map, rb_node);
+       }
+
+       /*
+        * Previous extent map found, return as in this case the caller does not
+        * care about the next one.
+        */
+       if (prev) {
+               *prev_or_next_ret = prev;
+               return NULL;
        }
 
-       if (next_ret) {
+       prev = orig_prev;
+       prev_entry = rb_entry(prev, struct extent_map, rb_node);
+       while (prev && offset < prev_entry->start) {
+               prev = rb_prev(prev);
                prev_entry = rb_entry(prev, struct extent_map, rb_node);
-               while (prev && offset < prev_entry->start) {
-                       prev = rb_prev(prev);
-                       prev_entry = rb_entry(prev, struct extent_map, rb_node);
-               }
-               *next_ret = prev;
        }
+       *prev_or_next_ret = prev;
+
        return NULL;
 }
 
@@ -336,6 +340,8 @@ out:
 
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
 {
+       lockdep_assert_held_write(&tree->lock);
+
        clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
        if (extent_map_in_tree(em))
                try_merge_map(tree, em);
@@ -382,7 +388,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
 
                __clear_extent_bit(&device->alloc_state, stripe->physical,
                                   stripe->physical + stripe_size - 1, bits,
-                                  0, 0, NULL, GFP_NOWAIT, NULL);
+                                  NULL, GFP_NOWAIT, NULL);
        }
 }
 
@@ -425,16 +431,13 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
 {
        struct extent_map *em;
        struct rb_node *rb_node;
-       struct rb_node *prev = NULL;
-       struct rb_node *next = NULL;
+       struct rb_node *prev_or_next = NULL;
        u64 end = range_end(start, len);
 
-       rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
+       rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
        if (!rb_node) {
-               if (prev)
-                       rb_node = prev;
-               else if (next)
-                       rb_node = next;
+               if (prev_or_next)
+                       rb_node = prev_or_next;
                else
                        return NULL;
        }
@@ -658,3 +661,293 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
        ASSERT(ret == 0 || ret == -EEXIST);
        return ret;
 }
+
+/*
+ * Drop all extent maps from a tree in the fastest possible way, rescheduling
+ * if needed. This avoids searching the tree, from the root down to the first
+ * extent map, before each deletion.
+ */
+static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
+{
+       write_lock(&tree->lock);
+       while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
+               struct extent_map *em;
+               struct rb_node *node;
+
+               node = rb_first_cached(&tree->map);
+               em = rb_entry(node, struct extent_map, rb_node);
+               clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+               clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+               remove_extent_mapping(tree, em);
+               free_extent_map(em);
+               cond_resched_rwlock_write(&tree->lock);
+       }
+       write_unlock(&tree->lock);
+}
+
+/*
+ * Drop all extent maps in a given range.
+ *
+ * @inode:       The target inode.
+ * @start:       Start offset of the range.
+ * @end:         End offset of the range (inclusive value).
+ * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
+ *
+ * This drops all the extent maps that intersect the given range [@start, @end].
+ * Extent maps that partially overlap the range and extend before or beyond it
+ * are split.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
+                                bool skip_pinned)
+{
+       struct extent_map *split;
+       struct extent_map *split2;
+       struct extent_map *em;
+       struct extent_map_tree *em_tree = &inode->extent_tree;
+       u64 len = end - start + 1;
+
+       WARN_ON(end < start);
+       if (end == (u64)-1) {
+               if (start == 0 && !skip_pinned) {
+                       drop_all_extent_maps_fast(em_tree);
+                       return;
+               }
+               len = (u64)-1;
+       } else {
+               /* Make end offset exclusive for use in the loop below. */
+               end++;
+       }
+
+       /*
+        * It's ok if we fail to allocate the extent maps, see the comment near
+        * the bottom of the loop below. We only need two spare extent maps in
+        * the worst case, where the first extent map that intersects our range
+        * starts before the range and the last extent map that intersects our
+        * range ends after our range (and they might be the same extent map),
+        * because we need to split those two extent maps at the boundaries.
+        */
+       split = alloc_extent_map();
+       split2 = alloc_extent_map();
+
+       write_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, start, len);
+
+       while (em) {
+               /* extent_map_end() returns exclusive value (last byte + 1). */
+               const u64 em_end = extent_map_end(em);
+               struct extent_map *next_em = NULL;
+               u64 gen;
+               unsigned long flags;
+               bool modified;
+               bool compressed;
+
+               if (em_end < end) {
+                       next_em = next_extent_map(em);
+                       if (next_em) {
+                               if (next_em->start < end)
+                                       refcount_inc(&next_em->refs);
+                               else
+                                       next_em = NULL;
+                       }
+               }
+
+               if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+                       start = em_end;
+                       if (end != (u64)-1)
+                               len = start + len - em_end;
+                       goto next;
+               }
+
+               flags = em->flags;
+               clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+               clear_bit(EXTENT_FLAG_LOGGING, &flags);
+               modified = !list_empty(&em->list);
+
+               /*
+                * The extent map does not cross our target range, so no need to
+                * split it, we can remove it directly.
+                */
+               if (em->start >= start && em_end <= end)
+                       goto remove_em;
+
+               gen = em->generation;
+               compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+               if (em->start < start) {
+                       if (!split) {
+                               split = split2;
+                               split2 = NULL;
+                               if (!split)
+                                       goto remove_em;
+                       }
+                       split->start = em->start;
+                       split->len = start - em->start;
+
+                       if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+                               split->orig_start = em->orig_start;
+                               split->block_start = em->block_start;
+
+                               if (compressed)
+                                       split->block_len = em->block_len;
+                               else
+                                       split->block_len = split->len;
+                               split->orig_block_len = max(split->block_len,
+                                               em->orig_block_len);
+                               split->ram_bytes = em->ram_bytes;
+                       } else {
+                               split->orig_start = split->start;
+                               split->block_len = 0;
+                               split->block_start = em->block_start;
+                               split->orig_block_len = 0;
+                               split->ram_bytes = split->len;
+                       }
+
+                       split->generation = gen;
+                       split->flags = flags;
+                       split->compress_type = em->compress_type;
+                       replace_extent_mapping(em_tree, em, split, modified);
+                       free_extent_map(split);
+                       split = split2;
+                       split2 = NULL;
+               }
+               if (em_end > end) {
+                       if (!split) {
+                               split = split2;
+                               split2 = NULL;
+                               if (!split)
+                                       goto remove_em;
+                       }
+                       split->start = start + len;
+                       split->len = em_end - (start + len);
+                       split->block_start = em->block_start;
+                       split->flags = flags;
+                       split->compress_type = em->compress_type;
+                       split->generation = gen;
+
+                       if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+                               split->orig_block_len = max(em->block_len,
+                                                   em->orig_block_len);
+
+                               split->ram_bytes = em->ram_bytes;
+                               if (compressed) {
+                                       split->block_len = em->block_len;
+                                       split->orig_start = em->orig_start;
+                               } else {
+                                       const u64 diff = start + len - em->start;
+
+                                       split->block_len = split->len;
+                                       split->block_start += diff;
+                                       split->orig_start = em->orig_start;
+                               }
+                       } else {
+                               split->ram_bytes = split->len;
+                               split->orig_start = split->start;
+                               split->block_len = 0;
+                               split->orig_block_len = 0;
+                       }
+
+                       if (extent_map_in_tree(em)) {
+                               replace_extent_mapping(em_tree, em, split,
+                                                      modified);
+                       } else {
+                               int ret;
+
+                               ret = add_extent_mapping(em_tree, split,
+                                                        modified);
+                               /* Logic error, shouldn't happen. */
+                               ASSERT(ret == 0);
+                               if (WARN_ON(ret != 0) && modified)
+                                       btrfs_set_inode_full_sync(inode);
+                       }
+                       free_extent_map(split);
+                       split = NULL;
+               }
+remove_em:
+               if (extent_map_in_tree(em)) {
+                       /*
+                        * If the extent map is still in the tree it means that
+                        * either of the following is true:
+                        *
+                        * 1) It fits entirely in our range (doesn't end beyond
+                        *    it or starts before it);
+                        *
+                        * 2) It starts before our range and/or ends after our
+                        *    range, and we were not able to allocate the extent
+                        *    maps for split operations, @split and @split2.
+                        *
+                        * If we are in case 2), then we just remove the entire
+                        * extent map - this is fine, since anyone who needs to
+                        * access the subranges outside our range will just load
+                        * it again from the subvolume tree's file extent item.
+                        * However, if the extent map was in the list of
+                        * modified extents, then we must mark the inode for a
+                        * full fsync, otherwise a fast fsync will miss this
+                        * extent if it's new and needs to be logged.
+                        */
+                       if ((em->start < start || em_end > end) && modified) {
+                               ASSERT(!split);
+                               btrfs_set_inode_full_sync(inode);
+                       }
+                       remove_extent_mapping(em_tree, em);
+               }
+
+               /*
+                * Once for the tree reference (we replaced or removed the
+                * extent map from the tree).
+                */
+               free_extent_map(em);
+next:
+               /* Once for us (for our lookup reference). */
+               free_extent_map(em);
+
+               em = next_em;
+       }
+
+       write_unlock(&em_tree->lock);
+
+       free_extent_map(split);
+       free_extent_map(split2);
+}
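For illustration, a minimal sketch of a caller of btrfs_drop_extent_map_range(); the helper name is hypothetical, and per the comment above the caller is assumed to hold the corresponding file range locked in the inode's io tree (this sketch is not part of the kernel changes):

        /* Hypothetical helper: drop cached extent maps for a file range. */
        static void example_drop_cached_range(struct btrfs_inode *inode, u64 start, u64 end)
        {
                struct extent_state *cached = NULL;

                /* The io tree range must be locked by the caller. */
                lock_extent(&inode->io_tree, start, end, &cached);
                /* Extent maps crossing the range boundaries get split, the rest removed. */
                btrfs_drop_extent_map_range(inode, start, end, false);
                unlock_extent(&inode->io_tree, start, end, &cached);
        }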
+
+/*
+ * Replace a range in the inode's extent map tree with a new extent map.
+ *
+ * @inode:      The target inode.
+ * @new_em:     The new extent map to add to the inode's extent map tree.
+ * @modified:   Indicate if the new extent map should be added to the list of
+ *              modified extents (for fast fsync tracking).
+ *
+ * Drops all the extent maps in the inode's extent map tree that intersect the
+ * range of the new extent map and adds the new extent map to the tree.
+ * The caller should have locked an appropriate file range in the inode's io
+ * tree before calling this function.
+ */
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+                                  struct extent_map *new_em,
+                                  bool modified)
+{
+       const u64 end = new_em->start + new_em->len - 1;
+       struct extent_map_tree *tree = &inode->extent_tree;
+       int ret;
+
+       ASSERT(!extent_map_in_tree(new_em));
+
+       /*
+        * The caller has locked an appropriate file range in the inode's io
+        * tree, but getting -EEXIST when adding the new extent map can still
+        * happen in case there are extents that partially cover the range, and
+        * this is due to two tasks operating on different parts of the extent.
+        * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
+        * btrfs_get_extent") for an example and details.
+        */
+       do {
+               btrfs_drop_extent_map_range(inode, new_em->start, end, false);
+               write_lock(&tree->lock);
+               ret = add_extent_mapping(tree, new_em, modified);
+               write_unlock(&tree->lock);
+       } while (ret == -EEXIST);
+
+       return ret;
+}
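As a usage note, the fill_holes() conversion further below follows this pattern: build an extent map describing the new state of the range and let btrfs_replace_extent_map_range() drop whatever it overlaps before inserting it. A simplified, hedged sketch (the helper name is hypothetical and several extent map fields set by the real caller are omitted):

        /* Hypothetical sketch: cache a hole extent map for [offset, offset + len). */
        static int example_cache_hole_em(struct btrfs_inode *inode, u64 offset, u64 len,
                                         u64 transid)
        {
                struct extent_map *hole_em = alloc_extent_map();
                int ret;

                if (!hole_em)
                        return -ENOMEM;
                hole_em->start = offset;
                hole_em->len = len;
                hole_em->block_start = EXTENT_MAP_HOLE;
                hole_em->block_len = 0;
                hole_em->compress_type = BTRFS_COMPRESS_NONE;
                hole_em->generation = transid;

                ret = btrfs_replace_extent_map_range(inode, hole_em, true);
                /* Drop our allocation reference; the tree holds its own. */
                free_extent_map(hole_em);
                return ret;
        }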
index d2fa32f..ad31186 100644 (file)
@@ -63,6 +63,8 @@ struct extent_map_tree {
        rwlock_t lock;
 };
 
+struct btrfs_inode;
+
 static inline int extent_map_in_tree(const struct extent_map *em)
 {
        return !RB_EMPTY_NODE(&em->rb_node);
@@ -104,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
                             struct extent_map_tree *em_tree,
                             struct extent_map **em_in, u64 start, u64 len);
+void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
+                                u64 start, u64 end,
+                                bool skip_pinned);
+int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
+                                  struct extent_map *new_em,
+                                  bool modified);
 
 #endif
index c828f97..6bb9fa9 100644 (file)
@@ -118,7 +118,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
        if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
                return 0;
        return clear_extent_bit(&inode->file_extent_tree, start,
-                               start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
+                               start + len - 1, EXTENT_DIRTY, NULL);
 }
 
 static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
@@ -129,12 +129,20 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
        return ncsums * fs_info->sectorsize;
 }
 
-int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
+/*
+ * Calculate the total size needed to allocate for an ordered sum structure
+ * spanning @bytes in the file.
+ */
+static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
+{
+       int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
+
+       return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
+}
+
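For a concrete sense of the arithmetic above: assuming a 4 KiB sectorsize and crc32c checksums (csum_size of 4 bytes), an ordered sum covering 64 KiB spans DIV_ROUND_UP(65536, 4096) = 16 sectors, so the allocation is sizeof(struct btrfs_ordered_sum) + 16 * 4 bytes.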
+int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
-                            u64 objectid, u64 pos,
-                            u64 disk_offset, u64 disk_num_bytes,
-                            u64 num_bytes, u64 offset, u64 ram_bytes,
-                            u8 compression, u8 encryption, u16 other_encoding)
+                            u64 objectid, u64 pos, u64 num_bytes)
 {
        int ret = 0;
        struct btrfs_file_extent_item *item;
@@ -157,16 +165,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0],
                              struct btrfs_file_extent_item);
-       btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
-       btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
-       btrfs_set_file_extent_offset(leaf, item, offset);
+       btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
+       btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
+       btrfs_set_file_extent_offset(leaf, item, 0);
        btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
-       btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
+       btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);
        btrfs_set_file_extent_generation(leaf, item, trans->transid);
        btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
-       btrfs_set_file_extent_compression(leaf, item, compression);
-       btrfs_set_file_extent_encryption(leaf, item, encryption);
-       btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+       btrfs_set_file_extent_compression(leaf, item, 0);
+       btrfs_set_file_extent_encryption(leaf, item, 0);
+       btrfs_set_file_extent_other_encoding(leaf, item, 0);
 
        btrfs_mark_buffer_dirty(leaf);
 out:
@@ -503,7 +511,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
 }
 
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
-                            struct list_head *list, int search_commit)
+                            struct list_head *list, int search_commit,
+                            bool nowait)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
@@ -525,6 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
        if (!path)
                return -ENOMEM;
 
+       path->nowait = nowait;
        if (search_commit) {
                path->skip_locking = 1;
                path->reada = READA_FORWARD;
index 5a3f6e0..176b432 100644 (file)
@@ -473,7 +473,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
         */
        clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-                        0, 0, cached);
+                        cached);
 
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
                                        extra_bits, cached);
@@ -499,159 +499,6 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 }
 
 /*
- * this drops all the extents in the cache that intersect the range
- * [start, end].  Existing extents are split as required.
- */
-void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
-                            int skip_pinned)
-{
-       struct extent_map *em;
-       struct extent_map *split = NULL;
-       struct extent_map *split2 = NULL;
-       struct extent_map_tree *em_tree = &inode->extent_tree;
-       u64 len = end - start + 1;
-       u64 gen;
-       int ret;
-       int testend = 1;
-       unsigned long flags;
-       int compressed = 0;
-       bool modified;
-
-       WARN_ON(end < start);
-       if (end == (u64)-1) {
-               len = (u64)-1;
-               testend = 0;
-       }
-       while (1) {
-               int no_splits = 0;
-
-               modified = false;
-               if (!split)
-                       split = alloc_extent_map();
-               if (!split2)
-                       split2 = alloc_extent_map();
-               if (!split || !split2)
-                       no_splits = 1;
-
-               write_lock(&em_tree->lock);
-               em = lookup_extent_mapping(em_tree, start, len);
-               if (!em) {
-                       write_unlock(&em_tree->lock);
-                       break;
-               }
-               flags = em->flags;
-               gen = em->generation;
-               if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-                       if (testend && em->start + em->len >= start + len) {
-                               free_extent_map(em);
-                               write_unlock(&em_tree->lock);
-                               break;
-                       }
-                       start = em->start + em->len;
-                       if (testend)
-                               len = start + len - (em->start + em->len);
-                       free_extent_map(em);
-                       write_unlock(&em_tree->lock);
-                       continue;
-               }
-               compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
-               clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-               clear_bit(EXTENT_FLAG_LOGGING, &flags);
-               modified = !list_empty(&em->list);
-               if (no_splits)
-                       goto next;
-
-               if (em->start < start) {
-                       split->start = em->start;
-                       split->len = start - em->start;
-
-                       if (em->block_start < EXTENT_MAP_LAST_BYTE) {
-                               split->orig_start = em->orig_start;
-                               split->block_start = em->block_start;
-
-                               if (compressed)
-                                       split->block_len = em->block_len;
-                               else
-                                       split->block_len = split->len;
-                               split->orig_block_len = max(split->block_len,
-                                               em->orig_block_len);
-                               split->ram_bytes = em->ram_bytes;
-                       } else {
-                               split->orig_start = split->start;
-                               split->block_len = 0;
-                               split->block_start = em->block_start;
-                               split->orig_block_len = 0;
-                               split->ram_bytes = split->len;
-                       }
-
-                       split->generation = gen;
-                       split->flags = flags;
-                       split->compress_type = em->compress_type;
-                       replace_extent_mapping(em_tree, em, split, modified);
-                       free_extent_map(split);
-                       split = split2;
-                       split2 = NULL;
-               }
-               if (testend && em->start + em->len > start + len) {
-                       u64 diff = start + len - em->start;
-
-                       split->start = start + len;
-                       split->len = em->start + em->len - (start + len);
-                       split->flags = flags;
-                       split->compress_type = em->compress_type;
-                       split->generation = gen;
-
-                       if (em->block_start < EXTENT_MAP_LAST_BYTE) {
-                               split->orig_block_len = max(em->block_len,
-                                                   em->orig_block_len);
-
-                               split->ram_bytes = em->ram_bytes;
-                               if (compressed) {
-                                       split->block_len = em->block_len;
-                                       split->block_start = em->block_start;
-                                       split->orig_start = em->orig_start;
-                               } else {
-                                       split->block_len = split->len;
-                                       split->block_start = em->block_start
-                                               + diff;
-                                       split->orig_start = em->orig_start;
-                               }
-                       } else {
-                               split->ram_bytes = split->len;
-                               split->orig_start = split->start;
-                               split->block_len = 0;
-                               split->block_start = em->block_start;
-                               split->orig_block_len = 0;
-                       }
-
-                       if (extent_map_in_tree(em)) {
-                               replace_extent_mapping(em_tree, em, split,
-                                                      modified);
-                       } else {
-                               ret = add_extent_mapping(em_tree, split,
-                                                        modified);
-                               ASSERT(ret == 0); /* Logic error */
-                       }
-                       free_extent_map(split);
-                       split = NULL;
-               }
-next:
-               if (extent_map_in_tree(em))
-                       remove_extent_mapping(em_tree, em);
-               write_unlock(&em_tree->lock);
-
-               /* once for us */
-               free_extent_map(em);
-               /* once for the tree*/
-               free_extent_map(em);
-       }
-       if (split)
-               free_extent_map(split);
-       if (split2)
-               free_extent_map(split2);
-}
-
-/*
  * this is very complex, but the basic idea is to drop all extents
  * in the range start - end.  hint_block is filled in with a block number
  * that would be a good hint to the block allocator for this file.
@@ -708,7 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
        }
 
        if (args->drop_cache)
-               btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
+               btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
 
        if (args->start >= inode->disk_i_size && !args->replace_extent)
                modify_tree = 0;
@@ -1339,26 +1186,54 @@ static int prepare_uptodate_page(struct inode *inode,
        return 0;
 }
 
+static unsigned int get_prepare_fgp_flags(bool nowait)
+{
+       unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+
+       if (nowait)
+               fgp_flags |= FGP_NOWAIT;
+
+       return fgp_flags;
+}
+
+static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
+{
+       gfp_t gfp;
+
+       gfp = btrfs_alloc_write_mask(inode->i_mapping);
+       if (nowait) {
+               gfp &= ~__GFP_DIRECT_RECLAIM;
+               gfp |= GFP_NOWAIT;
+       }
+
+       return gfp;
+}
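Taken together, the two helpers above make page cache allocation non-blocking for nowait writers. An illustrative sketch of how they combine, mirroring the pagecache_get_page() call in prepare_pages() a few lines below:

        unsigned int fgp_flags = get_prepare_fgp_flags(true);  /* adds FGP_NOWAIT */
        gfp_t mask = get_prepare_gfp_flags(inode, true);        /* drops __GFP_DIRECT_RECLAIM, adds GFP_NOWAIT */
        struct page *page;

        page = pagecache_get_page(inode->i_mapping, index, fgp_flags, mask | __GFP_WRITE);
        if (!page)
                return -EAGAIN;         /* nowait: bail out instead of sleeping */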
+
 /*
  * this just gets pages into the page cache and locks them down.
  */
 static noinline int prepare_pages(struct inode *inode, struct page **pages,
                                  size_t num_pages, loff_t pos,
-                                 size_t write_bytes, bool force_uptodate)
+                                 size_t write_bytes, bool force_uptodate,
+                                 bool nowait)
 {
        int i;
        unsigned long index = pos >> PAGE_SHIFT;
-       gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+       gfp_t mask = get_prepare_gfp_flags(inode, nowait);
+       unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
        int err = 0;
        int faili;
 
        for (i = 0; i < num_pages; i++) {
 again:
-               pages[i] = find_or_create_page(inode->i_mapping, index + i,
-                                              mask | __GFP_WRITE);
+               pages[i] = pagecache_get_page(inode->i_mapping, index + i,
+                                             fgp_flags, mask | __GFP_WRITE);
                if (!pages[i]) {
                        faili = i - 1;
-                       err = -ENOMEM;
+                       if (nowait)
+                               err = -EAGAIN;
+                       else
+                               err = -ENOMEM;
                        goto fail;
                }
 
@@ -1376,7 +1251,7 @@ again:
                                                    pos + write_bytes, false);
                if (err) {
                        put_page(pages[i]);
-                       if (err == -EAGAIN) {
+                       if (!nowait && err == -EAGAIN) {
                                err = 0;
                                goto again;
                        }
@@ -1411,7 +1286,7 @@ static noinline int
 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
                                size_t num_pages, loff_t pos,
                                size_t write_bytes,
-                               u64 *lockstart, u64 *lockend,
+                               u64 *lockstart, u64 *lockend, bool nowait,
                                struct extent_state **cached_state)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1426,15 +1301,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
        if (start_pos < inode->vfs_inode.i_size) {
                struct btrfs_ordered_extent *ordered;
 
-               lock_extent_bits(&inode->io_tree, start_pos, last_pos,
-                               cached_state);
+               if (nowait) {
+                       if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
+                               for (i = 0; i < num_pages; i++) {
+                                       unlock_page(pages[i]);
+                                       put_page(pages[i]);
+                                       pages[i] = NULL;
+                               }
+
+                               return -EAGAIN;
+                       }
+               } else {
+                       lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+               }
+
                ordered = btrfs_lookup_ordered_range(inode, start_pos,
                                                     last_pos - start_pos + 1);
                if (ordered &&
                    ordered->file_offset + ordered->num_bytes > start_pos &&
                    ordered->file_offset <= last_pos) {
-                       unlock_extent_cached(&inode->io_tree, start_pos,
-                                       last_pos, cached_state);
+                       unlock_extent(&inode->io_tree, start_pos, last_pos,
+                                     cached_state);
                        for (i = 0; i < num_pages; i++) {
                                unlock_page(pages[i]);
                                put_page(pages[i]);
@@ -1481,7 +1368,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
  */
 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
-                          size_t *write_bytes)
+                          size_t *write_bytes, bool nowait)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_root *root = inode->root;
@@ -1500,17 +1387,22 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
                           fs_info->sectorsize) - 1;
        num_bytes = lockend - lockstart + 1;
 
-       btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+       if (nowait) {
+               if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
+                       btrfs_drew_write_unlock(&root->snapshot_lock);
+                       return -EAGAIN;
+               }
+       } else {
+               btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
+       }
        ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
-                       NULL, NULL, NULL, false);
-       if (ret <= 0) {
-               ret = 0;
+                       NULL, NULL, NULL, nowait, false);
+       if (ret <= 0)
                btrfs_drew_write_unlock(&root->snapshot_lock);
-       } else {
+       else
                *write_bytes = min_t(size_t, *write_bytes ,
                                     num_bytes - pos + lockstart);
-       }
-       unlock_extent(&inode->io_tree, lockstart, lockend);
+       unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
 
        return ret;
 }
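To illustrate the contract from the NOTE above, a hedged sketch of a nowait caller (variable names are hypothetical, error handling trimmed); the buffered write path below uses the same pattern:

        size_t write_bytes = count;
        int can_nocow;

        /* With nowait == true this returns -EAGAIN rather than blocking. */
        can_nocow = btrfs_check_nocow_lock(inode, pos, &write_bytes, true);
        if (can_nocow > 0) {
                /* A NOCOW write of up to write_bytes is allowed here. */
                btrfs_check_nocow_unlock(inode);
        } else if (can_nocow == -EAGAIN) {
                /* Retry from a context that is allowed to block. */
        }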
@@ -1607,8 +1499,10 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
        bool force_page_uptodate = false;
        loff_t old_isize = i_size_read(inode);
        unsigned int ilock_flags = 0;
+       const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
+       unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
 
-       if (iocb->ki_flags & IOCB_NOWAIT)
+       if (nowait)
                ilock_flags |= BTRFS_ILOCK_TRY;
 
        ret = btrfs_inode_lock(inode, ilock_flags);
@@ -1664,18 +1558,29 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
                extent_changeset_release(data_reserved);
                ret = btrfs_check_data_free_space(BTRFS_I(inode),
                                                  &data_reserved, pos,
-                                                 write_bytes);
+                                                 write_bytes, nowait);
                if (ret < 0) {
+                       int can_nocow;
+
+                       if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
+                               ret = -EAGAIN;
+                               break;
+                       }
+
                        /*
                         * If we don't have to COW at the offset, reserve
                         * metadata only. write_bytes may get smaller than
                         * requested here.
                         */
-                       if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
-                                                  &write_bytes) > 0)
-                               only_release_metadata = true;
-                       else
+                       can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
+                                                          &write_bytes, nowait);
+                       if (can_nocow < 0)
+                               ret = can_nocow;
+                       if (can_nocow > 0)
+                               ret = 0;
+                       if (ret)
                                break;
+                       only_release_metadata = true;
                }
 
                num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
@@ -1685,7 +1590,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
                WARN_ON(reserve_bytes == 0);
                ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
                                                      reserve_bytes,
-                                                     reserve_bytes, false);
+                                                     reserve_bytes, nowait);
                if (ret) {
                        if (!only_release_metadata)
                                btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -1698,14 +1603,17 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 
                release_bytes = reserve_bytes;
 again:
+               ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
+               if (ret)
+                       break;
+
                /*
                 * This is going to setup the pages array with the number of
                 * pages we want, so we don't really need to worry about the
                 * contents of pages from loop to loop
                 */
                ret = prepare_pages(inode, pages, num_pages,
-                                   pos, write_bytes,
-                                   force_page_uptodate);
+                                   pos, write_bytes, force_page_uptodate, false);
                if (ret) {
                        btrfs_delalloc_release_extents(BTRFS_I(inode),
                                                       reserve_bytes);
@@ -1715,10 +1623,11 @@ again:
                extents_locked = lock_and_cleanup_extent_if_need(
                                BTRFS_I(inode), pages,
                                num_pages, pos, write_bytes, &lockstart,
-                               &lockend, &cached_state);
+                               &lockend, nowait, &cached_state);
                if (extents_locked < 0) {
-                       if (extents_locked == -EAGAIN)
+                       if (!nowait && extents_locked == -EAGAIN)
                                goto again;
+
                        btrfs_delalloc_release_extents(BTRFS_I(inode),
                                                       reserve_bytes);
                        ret = extents_locked;
@@ -1782,8 +1691,8 @@ again:
                 * possible cached extent state to avoid a memory leak.
                 */
                if (extents_locked)
-                       unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                            lockstart, lockend, &cached_state);
+                       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+                                     lockend, &cached_state);
                else
                        free_extent_state(cached_state);
 
@@ -1801,8 +1710,6 @@ again:
 
                cond_resched();
 
-               balance_dirty_pages_ratelimited(inode->i_mapping);
-
                pos += copied;
                num_written += copied;
        }
@@ -2045,7 +1952,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
        if (BTRFS_FS_ERROR(inode->root->fs_info))
                return -EROFS;
 
-       if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+       if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
                return -EOPNOTSUPP;
 
        if (sync)
@@ -2201,14 +2108,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        atomic_inc(&root->log_batch);
 
        /*
-        * Always check for the full sync flag while holding the inode's lock,
-        * to avoid races with other tasks. The flag must be either set all the
-        * time during logging or always off all the time while logging.
-        */
-       full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
-                            &BTRFS_I(inode)->runtime_flags);
-
-       /*
         * Before we acquired the inode's lock and the mmap lock, someone may
         * have dirtied more pages in the target range. We need to make sure
         * that writeback for any such pages does not start while we are logging
@@ -2233,6 +2132,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        }
 
        /*
+        * Always check for the full sync flag while holding the inode's lock,
+        * to avoid races with other tasks. The flag must be either set all the
+        * time during logging or always off all the time while logging.
+        * We check the flag here after starting delalloc above, because when
+        * running delalloc the full sync flag may be set if we need to drop
+        * extra extent map ranges due to temporary memory allocation failures.
+        */
+       full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                            &BTRFS_I(inode)->runtime_flags);
+
+       /*
         * We have to do this here to avoid the priority inversion of waiting on
         * IO of a lower priority task while holding a transaction open.
         *
@@ -2380,6 +2290,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        ret = btrfs_commit_transaction(trans);
 out:
        ASSERT(list_empty(&ctx.list));
+       ASSERT(list_empty(&ctx.conflict_inodes));
        err = file_check_and_advance_wb_err(file);
        if (!ret)
                ret = err;
@@ -2448,7 +2359,6 @@ static int fill_holes(struct btrfs_trans_handle *trans,
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct extent_map *hole_em;
-       struct extent_map_tree *em_tree = &inode->extent_tree;
        struct btrfs_key key;
        int ret;
 
@@ -2505,8 +2415,8 @@ static int fill_holes(struct btrfs_trans_handle *trans,
        }
        btrfs_release_path(path);
 
-       ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
-                       offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
+       ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
+                                      end - offset);
        if (ret)
                return ret;
 
@@ -2515,7 +2425,7 @@ out:
 
        hole_em = alloc_extent_map();
        if (!hole_em) {
-               btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+               btrfs_drop_extent_map_range(inode, offset, end - 1, false);
                btrfs_set_inode_full_sync(inode);
        } else {
                hole_em->start = offset;
@@ -2529,12 +2439,7 @@ out:
                hole_em->compress_type = BTRFS_COMPRESS_NONE;
                hole_em->generation = trans->transid;
 
-               do {
-                       btrfs_drop_extent_cache(inode, offset, end - 1, 0);
-                       write_lock(&em_tree->lock);
-                       ret = add_extent_mapping(em_tree, hole_em, 1);
-                       write_unlock(&em_tree->lock);
-               } while (ret == -EEXIST);
+               ret = btrfs_replace_extent_map_range(inode, hole_em, true);
                free_extent_map(hole_em);
                if (ret)
                        btrfs_set_inode_full_sync(inode);
@@ -2591,8 +2496,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
        while (1) {
                truncate_pagecache_range(inode, lockstart, lockend);
 
-               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                cached_state);
+               lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                           cached_state);
                /*
                 * We can't have ordered extents in the range, nor dirty/writeback
                 * pages, because we have locked the inode's VFS lock in exclusive
@@ -2607,8 +2512,8 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
                                            page_lockend))
                        break;
 
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockend, cached_state);
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                             cached_state);
        }
 
        btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
@@ -3008,9 +2913,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
        if (ret)
                goto out_only_mutex;
 
-       lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
-       lockend = round_down(offset + len,
-                            btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
+       lockstart = round_up(offset, fs_info->sectorsize);
+       lockend = round_down(offset + len, fs_info->sectorsize) - 1;
        same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
                == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
        /*
@@ -3108,8 +3012,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
        btrfs_end_transaction(trans);
        btrfs_btree_balance_dirty(fs_info);
 out:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                            &cached_state);
+       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                     &cached_state);
 out_only_mutex:
        if (!updated_inode && truncated_block && !ret) {
                /*
@@ -3212,7 +3116,7 @@ enum {
 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
                                                 u64 offset)
 {
-       const u64 sectorsize = btrfs_inode_sectorsize(inode);
+       const u64 sectorsize = inode->root->fs_info->sectorsize;
        struct extent_map *em;
        int ret;
 
@@ -3242,7 +3146,7 @@ static int btrfs_zero_range(struct inode *inode,
        struct extent_changeset *data_reserved = NULL;
        int ret;
        u64 alloc_hint = 0;
-       const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
+       const u64 sectorsize = fs_info->sectorsize;
        u64 alloc_start = round_down(offset, sectorsize);
        u64 alloc_end = round_up(offset + len, sectorsize);
        u64 bytes_to_reserve = 0;
@@ -3382,16 +3286,16 @@ reserve_space:
                ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
                                                alloc_start, bytes_to_reserve);
                if (ret) {
-                       unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-                                            lockend, &cached_state);
+                       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+                                     lockend, &cached_state);
                        goto out;
                }
                ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
                                                alloc_end - alloc_start,
                                                i_blocksize(inode),
                                                offset + len, &alloc_hint);
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockend, &cached_state);
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                             &cached_state);
                /* btrfs_prealloc_file_range releases reserved space on error */
                if (ret) {
                        space_reserved = false;
@@ -3428,7 +3332,7 @@ static long btrfs_fallocate(struct file *file, int mode,
        u64 data_space_reserved = 0;
        u64 qgroup_reserved = 0;
        struct extent_map *em;
-       int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
+       int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
        int ret;
 
        /* Do not allow fallocate in ZONED mode */
@@ -3502,8 +3406,8 @@ static long btrfs_fallocate(struct file *file, int mode,
        }
 
        locked_end = alloc_end - 1;
-       lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-                        &cached_state);
+       lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+                   &cached_state);
 
        btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
 
@@ -3592,31 +3496,290 @@ static long btrfs_fallocate(struct file *file, int mode,
         */
        ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
 out_unlock:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-                            &cached_state);
+       unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+                     &cached_state);
 out:
        btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
        extent_changeset_free(data_reserved);
        return ret;
 }
 
+/*
+ * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
+ * that has unflushed and/or flushing delalloc. There might be other adjacent
+ * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
+ * looping while it gets adjacent subranges, merging them together.
+ */
+static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+                                  u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+       const u64 len = end + 1 - start;
+       struct extent_map_tree *em_tree = &inode->extent_tree;
+       struct extent_map *em;
+       u64 em_end;
+       u64 delalloc_len;
+
+       /*
+        * Search the io tree first for EXTENT_DELALLOC. If we find any, it
+        * means we have delalloc (dirty pages) for which writeback has not
+        * started yet.
+        */
+       *delalloc_start_ret = start;
+       delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
+                                       len, EXTENT_DELALLOC, 1);
+       /*
+        * If delalloc was found then *delalloc_start_ret has a sector size
+        * aligned value (rounded down).
+        */
+       if (delalloc_len > 0)
+               *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+
+       /*
+        * Now also check if there's any extent map in the range that does not
+        * map to a hole or prealloc extent. We do this because:
+        *
+        * 1) When delalloc is flushed, the file range is locked, we clear the
+        *    EXTENT_DELALLOC bit from the io tree and create an extent map for
+        *    an allocated extent. So we might just have been called after
+        *    delalloc is flushed and before the ordered extent completes and
+        *    inserts the new file extent item in the subvolume's btree;
+        *
+        * 2) We may have an extent map created by flushing delalloc for a
+        *    subrange that starts before the subrange we found marked with
+        *    EXTENT_DELALLOC in the io tree.
+        */
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, start, len);
+       read_unlock(&em_tree->lock);
+
+       /* extent_map_end() returns a non-inclusive end offset. */
+       em_end = em ? extent_map_end(em) : 0;
+
+       /*
+        * If we have a hole/prealloc extent map, check the next one if this one
+        * ends before our range's end.
+        */
+       if (em && (em->block_start == EXTENT_MAP_HOLE ||
+                  test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
+               struct extent_map *next_em;
+
+               read_lock(&em_tree->lock);
+               next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
+               read_unlock(&em_tree->lock);
+
+               free_extent_map(em);
+               em_end = next_em ? extent_map_end(next_em) : 0;
+               em = next_em;
+       }
+
+       if (em && (em->block_start == EXTENT_MAP_HOLE ||
+                  test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+               free_extent_map(em);
+               em = NULL;
+       }
+
+       /*
+        * No extent map or one for a hole or prealloc extent. Use the delalloc
+        * range we found in the io tree if we have one.
+        */
+       if (!em)
+               return (delalloc_len > 0);
+
+       /*
+        * We don't have any range as EXTENT_DELALLOC in the io tree, so the
+        * extent map is the only subrange representing delalloc.
+        */
+       if (delalloc_len == 0) {
+               *delalloc_start_ret = em->start;
+               *delalloc_end_ret = min(end, em_end - 1);
+               free_extent_map(em);
+               return true;
+       }
+
+       /*
+        * The extent map represents a delalloc range that starts before the
+        * delalloc range we found in the io tree.
+        */
+       if (em->start < *delalloc_start_ret) {
+               *delalloc_start_ret = em->start;
+               /*
+                * If the ranges are adjacent, return a combined range.
+                * Otherwise return the extent map's range.
+                */
+               if (em_end < *delalloc_start_ret)
+                       *delalloc_end_ret = min(end, em_end - 1);
+
+               free_extent_map(em);
+               return true;
+       }
+
+       /*
+        * The extent map starts after the delalloc range we found in the io
+        * tree. If it's adjacent, return a combined range, otherwise return
+        * the range found in the io tree.
+        */
+       if (*delalloc_end_ret + 1 == em->start)
+               *delalloc_end_ret = min(end, em_end - 1);
+
+       free_extent_map(em);
+       return true;
+}
+
+/*
+ * Check if there's delalloc in a given range.
+ *
+ * @inode:               The inode.
+ * @start:               The start offset of the range. It does not need to be
+ *                       sector size aligned.
+ * @end:                 The end offset (inclusive value) of the search range.
+ *                       It does not need to be sector size aligned.
+ * @delalloc_start_ret:  Output argument, set to the start offset of the
+ *                       subrange found with delalloc (may not be sector size
+ *                       aligned).
+ * @delalloc_end_ret:    Output argument, set to the end offset (inclusive value)
+ *                       of the subrange found with delalloc.
+ *
+ * Returns true if a subrange with delalloc is found within the given range, and
+ * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
+ * end offsets of the subrange.
+ */
+bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
+                                 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
+{
+       u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
+       u64 prev_delalloc_end = 0;
+       bool ret = false;
+
+       while (cur_offset < end) {
+               u64 delalloc_start;
+               u64 delalloc_end;
+               bool delalloc;
+
+               delalloc = find_delalloc_subrange(inode, cur_offset, end,
+                                                 &delalloc_start,
+                                                 &delalloc_end);
+               if (!delalloc)
+                       break;
+
+               if (prev_delalloc_end == 0) {
+                       /* First subrange found. */
+                       *delalloc_start_ret = max(delalloc_start, start);
+                       *delalloc_end_ret = delalloc_end;
+                       ret = true;
+               } else if (delalloc_start == prev_delalloc_end + 1) {
+                       /* Subrange adjacent to the previous one, merge them. */
+                       *delalloc_end_ret = delalloc_end;
+               } else {
+                       /* Subrange not adjacent to the previous one, exit. */
+                       break;
+               }
+
+               prev_delalloc_end = delalloc_end;
+               cur_offset = delalloc_end + 1;
+               cond_resched();
+       }
+
+       return ret;
+}
+
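
A minimal caller sketch for the new btrfs_find_delalloc_in_range() helper, assuming only the signature added above (the dump_delalloc_ranges() wrapper and its btrfs_info() reporting are hypothetical, not part of this patch):

    static void dump_delalloc_ranges(struct btrfs_inode *inode, u64 start, u64 end)
    {
            while (start < end) {
                    u64 d_start;
                    u64 d_end;

                    /* Stop when no further delalloc subrange exists in [start, end]. */
                    if (!btrfs_find_delalloc_in_range(inode, start, end,
                                                      &d_start, &d_end))
                            break;
                    btrfs_info(inode->root->fs_info,
                               "delalloc subrange [%llu, %llu]",
                               d_start, d_end);
                    /* Continue the search right after the reported subrange. */
                    start = d_end + 1;
            }
    }
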
+/*
+ * Check if there's a hole or delalloc range in a range representing a hole (or
+ * prealloc extent) found in the inode's subvolume btree.
+ *
+ * @inode:      The inode.
+ * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
+ * @start:      Start offset of the hole region. It does not need to be sector
+ *              size aligned.
+ * @end:        End offset (inclusive value) of the hole region. It does not
+ *              need to be sector size aligned.
+ * @start_ret:  Return parameter, used to set the start of the subrange in the
+ *              hole that matches the search criteria (seek mode), if such a
+ *              subrange is found (in which case the function returns true).
+ *              The value returned here may not be sector size aligned.
+ *
+ * Returns true if a subrange matching the given seek mode is found, and if one
+ * is found, it updates @start_ret with the start of the subrange.
+ */
+static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
+                                       u64 start, u64 end, u64 *start_ret)
+{
+       u64 delalloc_start;
+       u64 delalloc_end;
+       bool delalloc;
+
+       delalloc = btrfs_find_delalloc_in_range(inode, start, end,
+                                               &delalloc_start, &delalloc_end);
+       if (delalloc && whence == SEEK_DATA) {
+               *start_ret = delalloc_start;
+               return true;
+       }
+
+       if (delalloc && whence == SEEK_HOLE) {
+               /*
+                * We found delalloc but it starts after our start offset. So we
+                * have a hole between our start offset and the delalloc start.
+                */
+               if (start < delalloc_start) {
+                       *start_ret = start;
+                       return true;
+               }
+               /*
+                * Delalloc range starts at our start offset.
+                * If the delalloc range's length is smaller than our range,
+                * then it means we have a hole that starts where the delalloc
+                * subrange ends.
+                */
+               if (delalloc_end < end) {
+                       *start_ret = delalloc_end + 1;
+                       return true;
+               }
+
+               /* There's delalloc for the whole range. */
+               return false;
+       }
+
+       if (!delalloc && whence == SEEK_HOLE) {
+               *start_ret = start;
+               return true;
+       }
+
+       /*
+        * No delalloc in the range and we are seeking for data. The caller has
+        * to iterate to the next extent item in the subvolume btree.
+        */
+       return false;
+}
+
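
The SEEK_DATA/SEEK_HOLE decision made above can be summarized by the following standalone restatement (a hypothetical helper, not part of the patch; it relies on btrfs_find_delalloc_in_range() guaranteeing delalloc_start >= start):

    static bool classify_hole_region(int whence, bool delalloc, u64 start, u64 end,
                                     u64 delalloc_start, u64 delalloc_end,
                                     u64 *start_ret)
    {
            if (whence == SEEK_DATA) {
                    /* In a hole region, data can only come from delalloc. */
                    if (!delalloc)
                            return false;
                    *start_ret = delalloc_start;
                    return true;
            }

            /* SEEK_HOLE: anything not covered by delalloc is a hole. */
            if (!delalloc || start < delalloc_start) {
                    *start_ret = start;
                    return true;
            }
            if (delalloc_end < end) {
                    *start_ret = delalloc_end + 1;
                    return true;
            }
            /* Delalloc covers the whole region. */
            return false;
    }
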
 static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
                                  int whence)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
-       loff_t i_size = inode->vfs_inode.i_size;
+       const loff_t i_size = i_size_read(&inode->vfs_inode);
+       const u64 ino = btrfs_ino(inode);
+       struct btrfs_root *root = inode->root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       u64 last_extent_end;
        u64 lockstart;
        u64 lockend;
        u64 start;
-       u64 len;
-       int ret = 0;
+       int ret;
+       bool found = false;
 
        if (i_size == 0 || offset >= i_size)
                return -ENXIO;
 
        /*
+        * Quick path. If the inode has no prealloc extents and its number of
+        * bytes used matches its i_size, then it can not have holes.
+        */
+       if (whence == SEEK_HOLE &&
+           !(inode->flags & BTRFS_INODE_PREALLOC) &&
+           inode_get_bytes(&inode->vfs_inode) == i_size)
+               return i_size;
+
+       /*
         * offset can be negative, in this case we start finding DATA/HOLE from
         * the very start of the file.
         */
@@ -3627,45 +3790,164 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
        if (lockend <= lockstart)
                lockend = lockstart + fs_info->sectorsize;
        lockend--;
-       len = lockend - lockstart + 1;
 
-       lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->reada = READA_FORWARD;
+
+       key.objectid = ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = start;
+
+       last_extent_end = lockstart;
+
+       lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0) {
+               goto out;
+       } else if (ret > 0 && path->slots[0] > 0) {
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+               if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+                       path->slots[0]--;
+       }
 
        while (start < i_size) {
-               em = btrfs_get_extent_fiemap(inode, start, len);
-               if (IS_ERR(em)) {
-                       ret = PTR_ERR(em);
-                       em = NULL;
-                       break;
+               struct extent_buffer *leaf = path->nodes[0];
+               struct btrfs_file_extent_item *extent;
+               u64 extent_end;
+
+               if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret < 0)
+                               goto out;
+                       else if (ret > 0)
+                               break;
+
+                       leaf = path->nodes[0];
                }
 
-               if (whence == SEEK_HOLE &&
-                   (em->block_start == EXTENT_MAP_HOLE ||
-                    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
-                       break;
-               else if (whence == SEEK_DATA &&
-                          (em->block_start != EXTENT_MAP_HOLE &&
-                           !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
                        break;
 
-               start = em->start + em->len;
-               free_extent_map(em);
-               em = NULL;
+               extent_end = btrfs_file_extent_end(path);
+
+               /*
+                * In the first iteration we may have a slot that points to an
+                * extent that ends before our start offset, so skip it.
+                */
+               if (extent_end <= start) {
+                       path->slots[0]++;
+                       continue;
+               }
+
+               /* We have an implicit hole, NO_HOLES feature is likely set. */
+               if (last_extent_end < key.offset) {
+                       u64 search_start = last_extent_end;
+                       u64 found_start;
+
+                       /*
+                        * First iteration, @start matches @offset and it's
+                        * within the hole.
+                        */
+                       if (start == offset)
+                               search_start = offset;
+
+                       found = find_desired_extent_in_hole(inode, whence,
+                                                           search_start,
+                                                           key.offset - 1,
+                                                           &found_start);
+                       if (found) {
+                               start = found_start;
+                               break;
+                       }
+                       /*
+                        * Didn't find data or a hole (due to delalloc) in the
+                        * implicit hole range, so need to analyze the extent.
+                        */
+               }
+
+               extent = btrfs_item_ptr(leaf, path->slots[0],
+                                       struct btrfs_file_extent_item);
+
+               if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
+                   btrfs_file_extent_type(leaf, extent) ==
+                   BTRFS_FILE_EXTENT_PREALLOC) {
+                       /*
+                        * Explicit hole or prealloc extent, search for delalloc.
+                        * A prealloc extent is treated like a hole.
+                        */
+                       u64 search_start = key.offset;
+                       u64 found_start;
+
+                       /*
+                        * First iteration, @start matches @offset and it's
+                        * within the hole.
+                        */
+                       if (start == offset)
+                               search_start = offset;
+
+                       found = find_desired_extent_in_hole(inode, whence,
+                                                           search_start,
+                                                           extent_end - 1,
+                                                           &found_start);
+                       if (found) {
+                               start = found_start;
+                               break;
+                       }
+                       /*
+                        * Didn't find data or a hole (due to delalloc) in the
+                        * hole range, so need to analyze the next
+                        * extent item.
+                        */
+               } else {
+                       /*
+                        * Found a regular or inline extent.
+                        * If we are seeking for data, adjust the start offset
+                        * and stop, we're done.
+                        */
+                       if (whence == SEEK_DATA) {
+                               start = max_t(u64, key.offset, offset);
+                               found = true;
+                               break;
+                       }
+                       /*
+                        * Else, we are seeking for a hole, check the next file
+                        * extent item.
+                        */
+               }
+
+               start = extent_end;
+               last_extent_end = extent_end;
+               path->slots[0]++;
+               if (fatal_signal_pending(current)) {
+                       ret = -EINTR;
+                       goto out;
+               }
                cond_resched();
        }
-       free_extent_map(em);
-       unlock_extent_cached(&inode->io_tree, lockstart, lockend,
-                            &cached_state);
-       if (ret) {
-               offset = ret;
-       } else {
-               if (whence == SEEK_DATA && start >= i_size)
-                       offset = -ENXIO;
-               else
-                       offset = min_t(loff_t, start, i_size);
+
+       /* We have an implicit hole from the last extent found up to i_size. */
+       if (!found && start < i_size) {
+               found = find_desired_extent_in_hole(inode, whence, start,
+                                                   i_size - 1, &start);
+               if (!found)
+                       start = i_size;
        }
 
-       return offset;
+out:
+       unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+       btrfs_free_path(path);
+
+       if (ret < 0)
+               return ret;
+
+       if (whence == SEEK_DATA && start >= i_size)
+               return -ENXIO;
+
+       return min_t(loff_t, start, i_size);
 }
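
A hedged sketch of how the (otherwise unchanged) llseek entry point is expected to drive find_desired_extent() for SEEK_DATA/SEEK_HOLE; the wrapper name, the shared inode lock and the vfs_setpos() bound are assumptions for illustration, not taken from this hunk:

    static loff_t seek_hole_or_data(struct file *file, loff_t offset, int whence)
    {
            struct inode *inode = file_inode(file);
            loff_t ret;

            /* Seeking only reads metadata, a shared inode lock is enough. */
            btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
            ret = find_desired_extent(BTRFS_I(inode), offset, whence);
            btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);

            if (ret < 0)
                    return ret;
            return vfs_setpos(file, ret, inode->i_sb->s_maxbytes);
    }
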
 
 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -3693,7 +3975,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
 {
        int ret;
 
-       filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+       filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
 
        ret = fsverity_file_open(inode, filp);
        if (ret)
@@ -3810,6 +4092,7 @@ const struct file_operations btrfs_file_operations = {
        .mmap           = btrfs_file_mmap,
        .open           = btrfs_file_open,
        .release        = btrfs_release_file,
+       .get_unmapped_area = thp_get_unmapped_area,
        .fsync          = btrfs_sync_file,
        .fallocate      = btrfs_fallocate,
        .unlocked_ioctl = btrfs_ioctl,
index 996da65..f402365 100644 (file)
@@ -48,6 +48,24 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info, u64 offset,
                              u64 bytes, bool update_stats);
 
+static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+       struct btrfs_free_space *info;
+       struct rb_node *node;
+
+       while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+               info = rb_entry(node, struct btrfs_free_space, offset_index);
+               if (!info->bitmap) {
+                       unlink_free_space(ctl, info, true);
+                       kmem_cache_free(btrfs_free_space_cachep, info);
+               } else {
+                       free_bitmap(ctl, info);
+               }
+
+               cond_resched_lock(&ctl->tree_lock);
+       }
+}
+
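
Note that the moved helper still calls cond_resched_lock(&ctl->tree_lock), so callers must hold ctl->tree_lock; the hunks further down wrap the call sites accordingly, roughly:

    spin_lock(&ctl->tree_lock);
    __btrfs_remove_free_space_cache(ctl);
    spin_unlock(&ctl->tree_lock);
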
 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
                                               struct btrfs_path *path,
                                               u64 offset)
@@ -126,10 +144,8 @@ struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
                block_group->disk_cache_state = BTRFS_DC_CLEAR;
        }
 
-       if (!block_group->iref) {
+       if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
                block_group->inode = igrab(inode);
-               block_group->iref = 1;
-       }
        spin_unlock(&block_group->lock);
 
        return inode;
@@ -241,8 +257,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
        clear_nlink(inode);
        /* One for the block groups ref */
        spin_lock(&block_group->lock);
-       if (block_group->iref) {
-               block_group->iref = 0;
+       if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
                block_group->inode = NULL;
                spin_unlock(&block_group->lock);
                iput(inode);
@@ -333,8 +348,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
        btrfs_i_size_write(inode, 0);
        truncate_pagecache(vfs_inode, 0);
 
-       lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
-       btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+       lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+       btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
 
        /*
         * We skip the throttling logic for free space cache inodes, so we don't
@@ -345,7 +360,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
        inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
        btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
 
-       unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
+       unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
        if (ret)
                goto fail;
 
@@ -693,6 +708,12 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 
        max_bitmaps = max_t(u64, max_bitmaps, 1);
 
+       if (ctl->total_bitmaps > max_bitmaps)
+               btrfs_err(block_group->fs_info,
+"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
+                         block_group->start, block_group->length,
+                         ctl->total_bitmaps, ctl->unit, max_bitmaps,
+                         bytes_per_bg);
        ASSERT(ctl->total_bitmaps <= max_bitmaps);
 
        /*
@@ -875,7 +896,10 @@ out:
        return ret;
 free_cache:
        io_ctl_drop_pages(&io_ctl);
+
+       spin_lock(&ctl->tree_lock);
        __btrfs_remove_free_space_cache(ctl);
+       spin_unlock(&ctl->tree_lock);
        goto out;
 }
 
@@ -914,6 +938,8 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group,
        return ret;
 }
 
+static struct lock_class_key btrfs_free_space_inode_key;
+
 int load_free_space_cache(struct btrfs_block_group *block_group)
 {
        struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -983,6 +1009,14 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
        }
        spin_unlock(&block_group->lock);
 
+       /*
+        * Reinitialize the class of struct inode's mapping->invalidate_lock for
+        * free space inodes to prevent false positives related to locks for normal
+        * inodes.
+        */
+       lockdep_set_class(&(&inode->i_data)->invalidate_lock,
+                         &btrfs_free_space_inode_key);
+
        ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
                                      path, block_group->start);
        btrfs_free_path(path);
@@ -1001,7 +1035,13 @@ int load_free_space_cache(struct btrfs_block_group *block_group)
                if (ret == 0)
                        ret = 1;
        } else {
+               /*
+                * We need to call the _locked variant so we don't try to update
+                * the discard counters.
+                */
+               spin_lock(&tmp_ctl.tree_lock);
                __btrfs_remove_free_space_cache(&tmp_ctl);
+               spin_unlock(&tmp_ctl.tree_lock);
                btrfs_warn(fs_info,
                           "block group %llu has wrong amount of free space",
                           block_group->start);
@@ -1123,7 +1163,7 @@ update_cache_item(struct btrfs_trans_handle *trans,
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret < 0) {
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
-                                EXTENT_DELALLOC, 0, 0, NULL);
+                                EXTENT_DELALLOC, NULL);
                goto fail;
        }
        leaf = path->nodes[0];
@@ -1135,8 +1175,8 @@ update_cache_item(struct btrfs_trans_handle *trans,
                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
                    found_key.offset != offset) {
                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
-                                        inode->i_size - 1, EXTENT_DELALLOC, 0,
-                                        0, NULL);
+                                        inode->i_size - 1, EXTENT_DELALLOC,
+                                        NULL);
                        btrfs_release_path(path);
                        goto fail;
                }
@@ -1232,7 +1272,7 @@ static int flush_dirty_cache(struct inode *inode)
        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
        if (ret)
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
-                                EXTENT_DELALLOC, 0, 0, NULL);
+                                EXTENT_DELALLOC, NULL);
 
        return ret;
 }
@@ -1252,8 +1292,8 @@ cleanup_write_cache_enospc(struct inode *inode,
                           struct extent_state **cached_state)
 {
        io_ctl_drop_pages(io_ctl);
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
-                            i_size_read(inode) - 1, cached_state);
+       unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+                     cached_state);
 }
 
 static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1378,8 +1418,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        if (ret)
                goto out_unlock;
 
-       lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
-                        &cached_state);
+       lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+                   &cached_state);
 
        io_ctl_set_generation(io_ctl, trans->transid);
 
@@ -1434,8 +1474,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        io_ctl_drop_pages(io_ctl);
        io_ctl_free(io_ctl);
 
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
-                            i_size_read(inode) - 1, &cached_state);
+       unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+                     &cached_state);
 
        /*
         * at this point the pages are under IO and we're happy,
@@ -2860,7 +2900,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
        if (btrfs_is_zoned(fs_info)) {
                btrfs_info(fs_info, "free space %llu active %d",
                           block_group->zone_capacity - block_group->alloc_offset,
-                          block_group->zone_is_active);
+                          test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+                                   &block_group->runtime_flags));
                return;
        }
 
@@ -2964,34 +3005,6 @@ static void __btrfs_return_cluster_to_free_space(
        btrfs_put_block_group(block_group);
 }
 
-static void __btrfs_remove_free_space_cache_locked(
-                               struct btrfs_free_space_ctl *ctl)
-{
-       struct btrfs_free_space *info;
-       struct rb_node *node;
-
-       while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
-               info = rb_entry(node, struct btrfs_free_space, offset_index);
-               if (!info->bitmap) {
-                       unlink_free_space(ctl, info, true);
-                       kmem_cache_free(btrfs_free_space_cachep, info);
-               } else {
-                       free_bitmap(ctl, info);
-               }
-
-               cond_resched_lock(&ctl->tree_lock);
-       }
-}
-
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
-{
-       spin_lock(&ctl->tree_lock);
-       __btrfs_remove_free_space_cache_locked(ctl);
-       if (ctl->block_group)
-               btrfs_discard_update_discardable(ctl->block_group);
-       spin_unlock(&ctl->tree_lock);
-}
-
 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3009,7 +3022,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
 
                cond_resched_lock(&ctl->tree_lock);
        }
-       __btrfs_remove_free_space_cache_locked(ctl);
+       __btrfs_remove_free_space_cache(ctl);
        btrfs_discard_update_discardable(block_group);
        spin_unlock(&ctl->tree_lock);
 
@@ -3992,7 +4005,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group,
        *trimmed = 0;
 
        spin_lock(&block_group->lock);
-       if (block_group->removed) {
+       if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
                spin_unlock(&block_group->lock);
                return 0;
        }
@@ -4022,7 +4035,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
        *trimmed = 0;
 
        spin_lock(&block_group->lock);
-       if (block_group->removed) {
+       if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
                spin_unlock(&block_group->lock);
                return 0;
        }
@@ -4044,7 +4057,7 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
        *trimmed = 0;
 
        spin_lock(&block_group->lock);
-       if (block_group->removed) {
+       if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
                spin_unlock(&block_group->lock);
                return 0;
        }
index 15591b2..6d419ba 100644 (file)
@@ -113,7 +113,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
                                       u64 bytenr, u64 size);
 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
                            u64 bytenr, u64 size);
-void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
index 1bf89aa..367bcfc 100644 (file)
@@ -1453,8 +1453,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
                ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
                ASSERT(key.objectid < end && key.objectid + key.offset <= end);
 
-               caching_ctl->progress = key.objectid;
-
                offset = key.objectid;
                while (offset < key.objectid + key.offset) {
                        bit = free_space_test_bit(block_group, path, offset);
@@ -1490,8 +1488,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
                goto out;
        }
 
-       caching_ctl->progress = (u64)-1;
-
        ret = 0;
 out:
        return ret;
@@ -1531,8 +1527,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
                ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
                ASSERT(key.objectid < end && key.objectid + key.offset <= end);
 
-               caching_ctl->progress = key.objectid;
-
                total_found += add_new_free_space(block_group, key.objectid,
                                                  key.objectid + key.offset);
                if (total_found > CACHING_CTL_WAKE_UP) {
@@ -1552,8 +1546,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
                goto out;
        }
 
-       caching_ctl->progress = (u64)-1;
-
        ret = 0;
 out:
        return ret;
index 1372210..45ebef8 100644 (file)
@@ -977,7 +977,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
                if (!(start >= locked_page_end || end <= locked_page_start))
                        locked_page = async_chunk->locked_page;
        }
-       lock_extent(io_tree, start, end);
+       lock_extent(io_tree, start, end, NULL);
 
        /* We have fall back to uncompressed write */
        if (!async_extent->pages)
@@ -1024,7 +1024,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
                                       1 << BTRFS_ORDERED_COMPRESSED,
                                       async_extent->compress_type);
        if (ret) {
-               btrfs_drop_extent_cache(inode, start, end, 0);
+               btrfs_drop_extent_map_range(inode, start, end, false);
                goto out_free_reserve;
        }
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1254,7 +1254,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
        }
 
        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
-       btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
        /*
         * Relocation relies on the relocated extents to have exactly the same
@@ -1319,8 +1318,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
                         * skip current ordered extent.
                         */
                        if (ret)
-                               btrfs_drop_extent_cache(inode, start,
-                                               start + ram_size - 1, 0);
+                               btrfs_drop_extent_map_range(inode, start,
+                                                           start + ram_size - 1,
+                                                           false);
                }
 
                btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1360,7 +1360,7 @@ out:
        return ret;
 
 out_drop_extent_cache:
-       btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
+       btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
 out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1524,7 +1524,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
        unsigned nofs_flag;
        const blk_opf_t write_flags = wbc_to_write_flags(wbc);
 
-       unlock_extent(&inode->io_tree, start, end);
+       unlock_extent(&inode->io_tree, start, end, NULL);
 
        if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
            !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
@@ -1666,7 +1666,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
 }
 
 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
-                                       u64 bytenr, u64 num_bytes)
+                                       u64 bytenr, u64 num_bytes, bool nowait)
 {
        struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
        struct btrfs_ordered_sum *sums;
@@ -1674,7 +1674,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
        LIST_HEAD(list);
 
        ret = btrfs_lookup_csums_range(csum_root, bytenr,
-                                      bytenr + num_bytes - 1, &list, 0);
+                                      bytenr + num_bytes - 1, &list, 0,
+                                      nowait);
        if (ret == 0 && list_empty(&list))
                return 0;
 
@@ -1747,7 +1748,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
 
                if (count > 0)
                        clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
-                                        0, 0, NULL);
+                                        NULL);
        }
 
        return cow_file_range(inode, locked_page, start, end, page_started,
@@ -1800,6 +1801,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
        u8 extent_type;
        int can_nocow = 0;
        int ret = 0;
+       bool nowait = path->nowait;
 
        fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
        extent_type = btrfs_file_extent_type(leaf, fi);
@@ -1876,7 +1878,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
         * Force COW if csums exist in the range. This ensures that csums for a
         * given extent are either valid or do not exist.
         */
-       ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes);
+       ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
+                                 nowait);
        WARN_ON_ONCE(ret > 0 && is_freespace_inode);
        if (ret != 0)
                goto out;
@@ -2099,8 +2102,8 @@ out_check:
                                        1 << BTRFS_ORDERED_PREALLOC,
                                        BTRFS_COMPRESS_NONE);
                        if (ret) {
-                               btrfs_drop_extent_cache(inode, cur_offset,
-                                                       nocow_end, 0);
+                               btrfs_drop_extent_map_range(inode, cur_offset,
+                                                           nocow_end, false);
                                goto error;
                        }
                } else {
@@ -2548,7 +2551,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 
        ASSERT(pre + post < len);
 
-       lock_extent(&inode->io_tree, start, start + len - 1);
+       lock_extent(&inode->io_tree, start, start + len - 1, NULL);
        write_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
        if (!em) {
@@ -2622,7 +2625,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
 
 out_unlock:
        write_unlock(&em_tree->lock);
-       unlock_extent(&inode->io_tree, start, start + len - 1);
+       unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
 out:
        free_extent_map(split_pre);
        free_extent_map(split_mid);
@@ -2700,8 +2703,10 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro
        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                ret = extract_ordered_extent(bi, bio,
                                page_offset(bio_first_bvec_all(bio)->bv_page));
-               if (ret)
-                       goto out;
+               if (ret) {
+                       btrfs_bio_end_io(btrfs_bio(bio), ret);
+                       return;
+               }
        }
 
        /*
@@ -2721,16 +2726,12 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro
                        return;
 
                ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
-               if (ret)
-                       goto out;
+               if (ret) {
+                       btrfs_bio_end_io(btrfs_bio(bio), ret);
+                       return;
+               }
        }
        btrfs_submit_bio(fs_info, bio, mirror_num);
-       return;
-out:
-       if (ret) {
-               bio->bi_status = ret;
-               bio_endio(bio);
-       }
 }
 
 void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
@@ -2757,8 +2758,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
         */
        ret = btrfs_lookup_bio_sums(inode, bio, NULL);
        if (ret) {
-               bio->bi_status = ret;
-               bio_endio(bio);
+               btrfs_bio_end_io(btrfs_bio(bio), ret);
                return;
        }
 
@@ -2818,8 +2818,8 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
 
                ret = set_extent_bit(&inode->io_tree, search_start,
                                     search_start + em_len - 1,
-                                    EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
-                                    GFP_NOFS, NULL);
+                                    EXTENT_DELALLOC_NEW, cached_state,
+                                    GFP_NOFS);
 next:
                search_start = extent_map_end(em);
                free_extent_map(em);
@@ -2931,7 +2931,7 @@ again:
        if (ret)
                goto out_page;
 
-       lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+       lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
 
        /* already ordered? We're done */
        if (PageOrdered(page))
@@ -2939,8 +2939,8 @@ again:
 
        ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
        if (ordered) {
-               unlock_extent_cached(&inode->io_tree, page_start, page_end,
-                                    &cached_state);
+               unlock_extent(&inode->io_tree, page_start, page_end,
+                             &cached_state);
                unlock_page(page);
                btrfs_start_ordered_extent(ordered, 1);
                btrfs_put_ordered_extent(ordered);
@@ -2966,8 +2966,7 @@ out_reserved:
        if (free_delalloc_space)
                btrfs_delalloc_release_space(inode, data_reserved, page_start,
                                             PAGE_SIZE, true);
-       unlock_extent_cached(&inode->io_tree, page_start, page_end,
-                            &cached_state);
+       unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
 out_page:
        if (ret) {
                /*
@@ -3225,6 +3224,8 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                clear_bits |= EXTENT_DELALLOC_NEW;
 
        freespace_inode = btrfs_is_free_space_inode(inode);
+       if (!freespace_inode)
+               btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
 
        if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
                ret = -EIO;
@@ -3269,7 +3270,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        }
 
        clear_bits |= EXTENT_LOCKED;
-       lock_extent_bits(io_tree, start, end, &cached_state);
+       lock_extent(io_tree, start, end, &cached_state);
 
        if (freespace_inode)
                trans = btrfs_join_transaction_spacecache(root);
@@ -3325,7 +3326,7 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
            !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
                clear_extent_bit(&inode->io_tree, start, end,
                                 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
-                                0, 0, &cached_state);
+                                &cached_state);
 
        btrfs_inode_safe_disk_i_size_write(inode, 0);
        ret = btrfs_update_inode_fallback(trans, root, inode);
@@ -3336,7 +3337,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        ret = 0;
 out:
        clear_extent_bit(&inode->io_tree, start, end, clear_bits,
-                        (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
                         &cached_state);
 
        if (trans)
@@ -3361,8 +3361,8 @@ out:
                        unwritten_start += logical_len;
                clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
 
-               /* Drop the cache for the part of the extent we didn't write. */
-               btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
+               /* Drop extent maps for the part of the extent we didn't write. */
+               btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
 
                /*
                 * If the ordered extent had an IOERR or something else went
@@ -3439,6 +3439,13 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
        return 0;
 }
 
+static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset)
+{
+       u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;
+
+       return csums + offset_in_sectors * fs_info->csum_size;
+}
+
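
A minimal usage sketch for the new btrfs_csum_ptr() helper, assuming a csums buffer that stores one checksum per sector of a contiguous file range (variable names are illustrative only):

    /* Checksum slot for the sector at byte offset 'off' into that range. */
    u8 *csum = btrfs_csum_ptr(fs_info, csums, off);
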
 /*
  * check_data_csum - verify checksum of one sector of uncompressed data
  * @inode:     inode
@@ -4878,9 +4885,9 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
        block_end = block_start + blocksize - 1;
 
        ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
-                                         blocksize);
+                                         blocksize, false);
        if (ret < 0) {
-               if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
+               if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
                        /* For nocow case, no need to reserve data space */
                        only_release_metadata = true;
                } else {
@@ -4922,12 +4929,11 @@ again:
        }
        wait_on_page_writeback(page);
 
-       lock_extent_bits(io_tree, block_start, block_end, &cached_state);
+       lock_extent(io_tree, block_start, block_end, &cached_state);
 
        ordered = btrfs_lookup_ordered_extent(inode, block_start);
        if (ordered) {
-               unlock_extent_cached(io_tree, block_start, block_end,
-                                    &cached_state);
+               unlock_extent(io_tree, block_start, block_end, &cached_state);
                unlock_page(page);
                put_page(page);
                btrfs_start_ordered_extent(ordered, 1);
@@ -4937,13 +4943,12 @@ again:
 
        clear_extent_bit(&inode->io_tree, block_start, block_end,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-                        0, 0, &cached_state);
+                        &cached_state);
 
        ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
                                        &cached_state);
        if (ret) {
-               unlock_extent_cached(io_tree, block_start, block_end,
-                                    &cached_state);
+               unlock_extent(io_tree, block_start, block_end, &cached_state);
                goto out_unlock;
        }
 
@@ -4960,11 +4965,11 @@ again:
        btrfs_page_clear_checked(fs_info, page, block_start,
                                 block_end + 1 - block_start);
        btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
-       unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+       unlock_extent(io_tree, block_start, block_end, &cached_state);
 
        if (only_release_metadata)
                set_extent_bit(&inode->io_tree, block_start, block_end,
-                              EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
+                              EXTENT_NORESERVE, NULL, GFP_NOFS);
 
 out_unlock:
        if (ret) {
@@ -5021,8 +5026,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
                return ret;
        }
 
-       ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
-                       offset, 0, 0, len, 0, len, 0, 0, 0);
+       ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
        } else {
@@ -5046,7 +5050,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
-       struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
        u64 block_end = ALIGN(size, fs_info->sectorsize);
        u64 last_byte;
@@ -5094,10 +5097,11 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
                        if (err)
                                break;
 
-                       btrfs_drop_extent_cache(inode, cur_offset,
-                                               cur_offset + hole_size - 1, 0);
                        hole_em = alloc_extent_map();
                        if (!hole_em) {
+                               btrfs_drop_extent_map_range(inode, cur_offset,
+                                                   cur_offset + hole_size - 1,
+                                                   false);
                                btrfs_set_inode_full_sync(inode);
                                goto next;
                        }
@@ -5112,16 +5116,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
                        hole_em->compress_type = BTRFS_COMPRESS_NONE;
                        hole_em->generation = fs_info->generation;
 
-                       while (1) {
-                               write_lock(&em_tree->lock);
-                               err = add_extent_mapping(em_tree, hole_em, 1);
-                               write_unlock(&em_tree->lock);
-                               if (err != -EEXIST)
-                                       break;
-                               btrfs_drop_extent_cache(inode, cur_offset,
-                                                       cur_offset +
-                                                       hole_size - 1, 0);
-                       }
+                       err = btrfs_replace_extent_map_range(inode, hole_em, true);
                        free_extent_map(hole_em);
                } else {
                        err = btrfs_inode_set_file_extent_range(inode,
@@ -5137,7 +5132,7 @@ next:
                        break;
        }
        free_extent_map(em);
-       unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
+       unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
        return err;
 }
 
@@ -5271,7 +5266,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
  * While truncating the inode pages during eviction, we get the VFS
  * calling btrfs_invalidate_folio() against each folio of the inode. This
  * is slow because the calls to btrfs_invalidate_folio() result in a
- * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * huge amount of calls to lock_extent() and clear_extent_bit(),
  * which keep merging and splitting extent_state structures over and over,
  * wasting lots of time.
  *
@@ -5283,29 +5278,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
 static void evict_inode_truncate_pages(struct inode *inode)
 {
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
        struct rb_node *node;
 
        ASSERT(inode->i_state & I_FREEING);
        truncate_inode_pages_final(&inode->i_data);
 
-       write_lock(&map_tree->lock);
-       while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
-               struct extent_map *em;
-
-               node = rb_first_cached(&map_tree->map);
-               em = rb_entry(node, struct extent_map, rb_node);
-               clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-               clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
-               remove_extent_mapping(map_tree, em);
-               free_extent_map(em);
-               if (need_resched()) {
-                       write_unlock(&map_tree->lock);
-                       cond_resched();
-                       write_lock(&map_tree->lock);
-               }
-       }
-       write_unlock(&map_tree->lock);
+       btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
 
        /*
         * Keep looping until we have no more ranges in the io tree.
@@ -5338,7 +5316,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
                state_flags = state->state;
                spin_unlock(&io_tree->lock);
 
-               lock_extent_bits(io_tree, start, end, &cached_state);
+               lock_extent(io_tree, start, end, &cached_state);
 
                /*
                 * If still has DELALLOC flag, the extent didn't reach disk,
@@ -5353,8 +5331,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
                                               end - start + 1);
 
                clear_extent_bit(io_tree, start, end,
-                                EXTENT_LOCKED | EXTENT_DELALLOC |
-                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+                                EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
                                 &cached_state);
 
                cond_resched();
@@ -5707,6 +5684,11 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
        BTRFS_I(inode)->location.offset = 0;
        BTRFS_I(inode)->root = btrfs_grab_root(args->root);
        BUG_ON(args->root && !BTRFS_I(inode)->root);
+
+       if (args->root && args->root == args->root->fs_info->tree_root &&
+           args->ino != BTRFS_BTREE_INODE_OBJECTID)
+               set_bit(BTRFS_INODE_FREE_SPACE_INODE,
+                       &BTRFS_I(inode)->runtime_flags);
        return 0;
 }
 
@@ -6867,7 +6849,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
        struct btrfs_key found_key;
        struct extent_map *em = NULL;
        struct extent_map_tree *em_tree = &inode->extent_tree;
-       struct extent_io_tree *io_tree = &inode->io_tree;
 
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, start, len);
@@ -7030,8 +7011,6 @@ next:
                        }
                        flush_dcache_page(page);
                }
-               set_extent_uptodate(io_tree, em->start,
-                                   extent_map_end(em) - 1, NULL, GFP_NOFS);
                goto insert;
        }
 not_found:
@@ -7065,133 +7044,6 @@ out:
        return em;
 }
 
-struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
-                                          u64 start, u64 len)
-{
-       struct extent_map *em;
-       struct extent_map *hole_em = NULL;
-       u64 delalloc_start = start;
-       u64 end;
-       u64 delalloc_len;
-       u64 delalloc_end;
-       int err = 0;
-
-       em = btrfs_get_extent(inode, NULL, 0, start, len);
-       if (IS_ERR(em))
-               return em;
-       /*
-        * If our em maps to:
-        * - a hole or
-        * - a pre-alloc extent,
-        * there might actually be delalloc bytes behind it.
-        */
-       if (em->block_start != EXTENT_MAP_HOLE &&
-           !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-               return em;
-       else
-               hole_em = em;
-
-       /* check to see if we've wrapped (len == -1 or similar) */
-       end = start + len;
-       if (end < start)
-               end = (u64)-1;
-       else
-               end -= 1;
-
-       em = NULL;
-
-       /* ok, we didn't find anything, lets look for delalloc */
-       delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
-                                end, len, EXTENT_DELALLOC, 1);
-       delalloc_end = delalloc_start + delalloc_len;
-       if (delalloc_end < delalloc_start)
-               delalloc_end = (u64)-1;
-
-       /*
-        * We didn't find anything useful, return the original results from
-        * get_extent()
-        */
-       if (delalloc_start > end || delalloc_end <= start) {
-               em = hole_em;
-               hole_em = NULL;
-               goto out;
-       }
-
-       /*
-        * Adjust the delalloc_start to make sure it doesn't go backwards from
-        * the start they passed in
-        */
-       delalloc_start = max(start, delalloc_start);
-       delalloc_len = delalloc_end - delalloc_start;
-
-       if (delalloc_len > 0) {
-               u64 hole_start;
-               u64 hole_len;
-               const u64 hole_end = extent_map_end(hole_em);
-
-               em = alloc_extent_map();
-               if (!em) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-
-               ASSERT(hole_em);
-               /*
-                * When btrfs_get_extent can't find anything it returns one
-                * huge hole
-                *
-                * Make sure what it found really fits our range, and adjust to
-                * make sure it is based on the start from the caller
-                */
-               if (hole_end <= start || hole_em->start > end) {
-                      free_extent_map(hole_em);
-                      hole_em = NULL;
-               } else {
-                      hole_start = max(hole_em->start, start);
-                      hole_len = hole_end - hole_start;
-               }
-
-               if (hole_em && delalloc_start > hole_start) {
-                       /*
-                        * Our hole starts before our delalloc, so we have to
-                        * return just the parts of the hole that go until the
-                        * delalloc starts
-                        */
-                       em->len = min(hole_len, delalloc_start - hole_start);
-                       em->start = hole_start;
-                       em->orig_start = hole_start;
-                       /*
-                        * Don't adjust block start at all, it is fixed at
-                        * EXTENT_MAP_HOLE
-                        */
-                       em->block_start = hole_em->block_start;
-                       em->block_len = hole_len;
-                       if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
-                               set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
-               } else {
-                       /*
-                        * Hole is out of passed range or it starts after
-                        * delalloc range
-                        */
-                       em->start = delalloc_start;
-                       em->len = delalloc_len;
-                       em->orig_start = delalloc_start;
-                       em->block_start = EXTENT_MAP_DELALLOC;
-                       em->block_len = delalloc_len;
-               }
-       } else {
-               return hole_em;
-       }
-out:
-
-       free_extent_map(hole_em);
-       if (err) {
-               free_extent_map(em);
-               return ERR_PTR(err);
-       }
-       return em;
-}
-
 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
                                                  const u64 start,
                                                  const u64 len,
@@ -7221,7 +7073,8 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
        if (ret) {
                if (em) {
                        free_extent_map(em);
-                       btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+                       btrfs_drop_extent_map_range(inode, start,
+                                                   start + len - 1, false);
                }
                em = ERR_PTR(ret);
        }
@@ -7292,7 +7145,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
  */
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
                              u64 *orig_start, u64 *orig_block_len,
-                             u64 *ram_bytes, bool strict)
+                             u64 *ram_bytes, bool nowait, bool strict)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct can_nocow_file_extent_args nocow_args = { 0 };
@@ -7308,6 +7161,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+       path->nowait = nowait;
 
        ret = btrfs_lookup_file_extent(NULL, root, path,
                        btrfs_ino(BTRFS_I(inode)), offset, 0);
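
With path->nowait set, the btree lookup above can return -EAGAIN instead of blocking on locks or IO. A minimal sketch of how a nowait caller is expected to treat that result follows; the wrapper name is illustrative and not part of this series.

static int check_nocow_nowait_sketch(struct inode *inode, u64 offset, u64 *len)
{
	u64 orig_start, orig_block_len, ram_bytes;
	int ret;

	ret = can_nocow_extent(inode, offset, len, &orig_start,
			       &orig_block_len, &ram_bytes,
			       true /* nowait */, false /* strict */);
	if (ret == -EAGAIN) {
		/* Taking locks or doing IO would block: retry on the blocking path. */
		return -EAGAIN;
	}
	/* ret > 0: NOCOW is possible for *len bytes; ret == 0: fall back to COW. */
	return ret;
}
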
@@ -7404,7 +7258,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                        if (!try_lock_extent(io_tree, lockstart, lockend))
                                return -EAGAIN;
                } else {
-                       lock_extent_bits(io_tree, lockstart, lockend, cached_state);
+                       lock_extent(io_tree, lockstart, lockend, cached_state);
                }
                /*
                 * We're concerned with the entire range that we're going to be
@@ -7426,7 +7280,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
                                                         lockstart, lockend)))
                        break;
 
-               unlock_extent_cached(io_tree, lockstart, lockend, cached_state);
+               unlock_extent(io_tree, lockstart, lockend, cached_state);
 
                if (ordered) {
                        if (nowait) {
@@ -7488,7 +7342,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
                                       u64 ram_bytes, int compress_type,
                                       int type)
 {
-       struct extent_map_tree *em_tree;
        struct extent_map *em;
        int ret;
 
@@ -7497,7 +7350,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
               type == BTRFS_ORDERED_NOCOW ||
               type == BTRFS_ORDERED_REGULAR);
 
-       em_tree = &inode->extent_tree;
        em = alloc_extent_map();
        if (!em)
                return ERR_PTR(-ENOMEM);
@@ -7518,18 +7370,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
                em->compress_type = compress_type;
        }
 
-       do {
-               btrfs_drop_extent_cache(inode, em->start,
-                                       em->start + em->len - 1, 0);
-               write_lock(&em_tree->lock);
-               ret = add_extent_mapping(em_tree, em, 1);
-               write_unlock(&em_tree->lock);
-               /*
-                * The caller has taken lock_extent(), who could race with us
-                * to add em?
-                */
-       } while (ret == -EEXIST);
-
+       ret = btrfs_replace_extent_map_range(inode, em, true);
        if (ret) {
                free_extent_map(em);
                return ERR_PTR(ret);
@@ -7577,7 +7418,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
                block_start = em->block_start + (start - em->start);
 
                if (can_nocow_extent(inode, start, &len, &orig_start,
-                                    &orig_block_len, &ram_bytes, false) == 1) {
+                                    &orig_block_len, &ram_bytes, false, false) == 1) {
                        bg = btrfs_inc_nocow_writers(fs_info, block_start);
                        if (bg)
                                can_nocow = true;
@@ -7762,7 +7603,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
        if (write && !(flags & IOMAP_NOWAIT)) {
                ret = btrfs_check_data_free_space(BTRFS_I(inode),
                                                  &dio_data->data_reserved,
-                                                 start, data_alloc_len);
+                                                 start, data_alloc_len, false);
                if (!ret)
                        dio_data->data_space_reserved = true;
                else if (ret && !(BTRFS_I(inode)->flags &
@@ -7884,8 +7725,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
        }
 
        if (unlock_extents)
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-                                    lockstart, lockend, &cached_state);
+               unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                             &cached_state);
        else
                free_extent_state(cached_state);
 
@@ -7914,8 +7755,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
        return 0;
 
 unlock_err:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                            &cached_state);
+       unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                     &cached_state);
 err:
        if (dio_data->data_space_reserved) {
                btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -7938,7 +7779,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 
        if (!write && (iomap->type == IOMAP_HOLE)) {
                /* If reading from a hole, unlock and return */
-               unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+               unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
+                             NULL);
                return 0;
        }
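
The renames in these hunks follow the unified extent locking convention used for the rest of this diff: lock_extent()/unlock_extent() replace the _bits/_cached variants and always take a cached_state argument, with NULL passed when no cached state is tracked. In outline:

struct extent_state *cached = NULL;

lock_extent(io_tree, start, end, &cached);
/* ... operate on [start, end] ... */
unlock_extent(io_tree, start, end, &cached);

/* when there is no cached state to reuse: */
unlock_extent(io_tree, start, end, NULL);
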
 
@@ -7950,7 +7792,7 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
                                                       pos, length, false);
                else
                        unlock_extent(&BTRFS_I(inode)->io_tree, pos,
-                                     pos + length - 1);
+                                     pos + length - 1, NULL);
                ret = -ENOTBLK;
        }
 
@@ -7975,7 +7817,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
        } else {
                unlock_extent(&BTRFS_I(dip->inode)->io_tree,
                              dip->file_offset,
-                             dip->file_offset + dip->bytes - 1);
+                             dip->file_offset + dip->bytes - 1, NULL);
        }
 
        kfree(dip->csums);
@@ -7986,7 +7828,7 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
                                  int mirror_num,
                                  enum btrfs_compression_type compress_type)
 {
-       struct btrfs_dio_private *dip = bio->bi_private;
+       struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 
        BUG_ON(bio_op(bio) == REQ_OP_WRITE);
@@ -8001,8 +7843,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
 {
        struct inode *inode = dip->inode;
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
-       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
        blk_status_t err = BLK_STS_OK;
        struct bvec_iter iter;
@@ -8015,9 +7855,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
                if (uptodate &&
                    (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
                                               bv.bv_offset))) {
-                       clean_io_failure(fs_info, failure_tree, io_tree, start,
-                                        bv.bv_page, btrfs_ino(BTRFS_I(inode)),
-                                        bv.bv_offset);
+                       btrfs_clean_io_failure(BTRFS_I(inode), start,
+                                              bv.bv_page, bv.bv_offset);
                } else {
                        int ret;
 
@@ -8039,10 +7878,10 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
        return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
 }
 
-static void btrfs_end_dio_bio(struct bio *bio)
+static void btrfs_end_dio_bio(struct btrfs_bio *bbio)
 {
-       struct btrfs_dio_private *dip = bio->bi_private;
-       struct btrfs_bio *bbio = btrfs_bio(bio);
+       struct btrfs_dio_private *dip = bbio->private;
+       struct bio *bio = &bbio->bio;
        blk_status_t err = bio->bi_status;
 
        if (err)
@@ -8068,7 +7907,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                                 u64 file_offset, int async_submit)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_dio_private *dip = bio->bi_private;
+       struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
        blk_status_t ret;
 
        /* Save the original iter for read repair */
@@ -8091,8 +7930,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                 */
                ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
                if (ret) {
-                       bio->bi_status = ret;
-                       bio_endio(bio);
+                       btrfs_bio_end_io(btrfs_bio(bio), ret);
                        return;
                }
        } else {
@@ -8175,9 +8013,8 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
                 * This will never fail as it's passing GFP_NOFS and
                 * the allocation is backed by btrfs_bioset.
                 */
-               bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
-               bio->bi_private = dip;
-               bio->bi_end_io = btrfs_end_dio_bio;
+               bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len,
+                                             btrfs_end_dio_bio, dip);
                btrfs_bio(bio)->file_offset = file_offset;
 
                if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
@@ -8259,6 +8096,25 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (ret)
                return ret;
 
+       /*
+        * fiemap_prep() called filemap_write_and_wait() for the whole possible
+        * file range (0 to LLONG_MAX), but that is not enough if we have
+        * compression enabled. The first filemap_fdatawrite_range() only kicks
+        * in the compression of data (in an async thread) and will return
+        * before the compression is done and writeback is started. A second
+        * filemap_fdatawrite_range() is needed to wait for the compression to
+        * complete and writeback to start. We also need to wait for ordered
+        * extents to complete, because our fiemap implementation uses mainly
+        * file extent items to list the extents, searching for extent maps
+        * only for file ranges with holes or prealloc extents to figure out
+        * if we have delalloc in those ranges.
+        */
+       if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+               ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+               if (ret)
+                       return ret;
+       }
+
        return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
 }
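
The FIEMAP_FLAG_SYNC case handled above corresponds to a userspace call like the following minimal sketch; it uses only the generic fiemap ioctl, nothing btrfs specific is assumed.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data before mapping */
	fm->fm_extent_count = 16;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		printf("%u extents mapped\n", fm->fm_mapped_extents);

	free(fm);
	return 0;
}
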
 
@@ -8391,14 +8247,14 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
        }
 
        if (!inode_evicting)
-               lock_extent_bits(tree, page_start, page_end, &cached_state);
+               lock_extent(tree, page_start, page_end, &cached_state);
 
        cur = page_start;
        while (cur < page_end) {
                struct btrfs_ordered_extent *ordered;
-               bool delete_states;
                u64 range_end;
                u32 range_len;
+               u32 extra_flags = 0;
 
                ordered = btrfs_lookup_first_ordered_range(inode, cur,
                                                           page_end + 1 - cur);
@@ -8408,7 +8264,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                         * No ordered extent covering this range, we are safe
                         * to delete all extent states in the range.
                         */
-                       delete_states = true;
+                       extra_flags = EXTENT_CLEAR_ALL_BITS;
                        goto next;
                }
                if (ordered->file_offset > cur) {
@@ -8419,7 +8275,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                         * the ordered extent in the next iteration.
                         */
                        range_end = ordered->file_offset - 1;
-                       delete_states = true;
+                       extra_flags = EXTENT_CLEAR_ALL_BITS;
                        goto next;
                }
 
@@ -8434,7 +8290,6 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                         * We can't delete the extent states as
                         * btrfs_finish_ordered_io() may still use some of them.
                         */
-                       delete_states = false;
                        goto next;
                }
                btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
@@ -8451,7 +8306,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                        clear_extent_bit(tree, cur, range_end,
                                         EXTENT_DELALLOC |
                                         EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
-                                        EXTENT_DEFRAG, 1, 0, &cached_state);
+                                        EXTENT_DEFRAG, &cached_state);
 
                spin_lock_irq(&inode->ordered_tree.lock);
                set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8459,6 +8314,12 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                                             cur - ordered->file_offset);
                spin_unlock_irq(&inode->ordered_tree.lock);
 
+               /*
+                * If the ordered extent has finished, we're safe to delete all
+                * the extent states of the range, otherwise
+                * btrfs_finish_ordered_io() will get executed by endio for
+                * other pages, so we can't delete extent states.
+                */
                if (btrfs_dec_test_ordered_pending(inode, &ordered,
                                                   cur, range_end + 1 - cur)) {
                        btrfs_finish_ordered_io(ordered);
@@ -8466,14 +8327,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
                         * The ordered extent has finished, now we're again
                         * safe to delete all extent states of the range.
                         */
-                       delete_states = true;
-               } else {
-                       /*
-                        * btrfs_finish_ordered_io() will get executed by endio
-                        * of other pages, thus we can't delete extent states
-                        * anymore
-                        */
-                       delete_states = false;
+                       extra_flags = EXTENT_CLEAR_ALL_BITS;
                }
 next:
                if (ordered)
@@ -8497,8 +8351,8 @@ next:
                if (!inode_evicting) {
                        clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
                                 EXTENT_DELALLOC | EXTENT_UPTODATE |
-                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
-                                delete_states, &cached_state);
+                                EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
+                                extra_flags, &cached_state);
                }
                cur = range_end + 1;
        }
@@ -8589,11 +8443,11 @@ again:
        }
        wait_on_page_writeback(page);
 
-       lock_extent_bits(io_tree, page_start, page_end, &cached_state);
+       lock_extent(io_tree, page_start, page_end, &cached_state);
        ret2 = set_page_extent_mapped(page);
        if (ret2 < 0) {
                ret = vmf_error(ret2);
-               unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+               unlock_extent(io_tree, page_start, page_end, &cached_state);
                goto out_unlock;
        }
 
@@ -8604,8 +8458,7 @@ again:
        ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
                        PAGE_SIZE);
        if (ordered) {
-               unlock_extent_cached(io_tree, page_start, page_end,
-                                    &cached_state);
+               unlock_extent(io_tree, page_start, page_end, &cached_state);
                unlock_page(page);
                up_read(&BTRFS_I(inode)->i_mmap_lock);
                btrfs_start_ordered_extent(ordered, 1);
@@ -8633,13 +8486,12 @@ again:
         */
        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-                         EXTENT_DEFRAG, 0, 0, &cached_state);
+                         EXTENT_DEFRAG, &cached_state);
 
        ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
                                        &cached_state);
        if (ret2) {
-               unlock_extent_cached(io_tree, page_start, page_end,
-                                    &cached_state);
+               unlock_extent(io_tree, page_start, page_end, &cached_state);
                ret = VM_FAULT_SIGBUS;
                goto out_unlock;
        }
@@ -8659,7 +8511,7 @@ again:
 
        btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
 
-       unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+       unlock_extent(io_tree, page_start, page_end, &cached_state);
        up_read(&BTRFS_I(inode)->i_mmap_lock);
 
        btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
@@ -8760,24 +8612,24 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
                const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
 
                control.new_size = new_size;
-               lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
-                                &cached_state);
+               lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+                           &cached_state);
                /*
                 * We want to drop from the next block forward in case this new
                 * size is not block aligned since we will be keeping the last
                 * block of the extent just the way it is.
                 */
-               btrfs_drop_extent_cache(BTRFS_I(inode),
-                                       ALIGN(new_size, fs_info->sectorsize),
-                                       (u64)-1, 0);
+               btrfs_drop_extent_map_range(BTRFS_I(inode),
+                                           ALIGN(new_size, fs_info->sectorsize),
+                                           (u64)-1, false);
 
                ret = btrfs_truncate_inode_items(trans, root, &control);
 
                inode_sub_bytes(inode, control.sub_bytes);
                btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);
 
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
-                                    (u64)-1, &cached_state);
+               unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+                             &cached_state);
 
                trans->block_rsv = &fs_info->trans_block_rsv;
                if (ret != -ENOSPC && ret != -EAGAIN)
@@ -8908,6 +8760,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->last_log_commit = 0;
 
        spin_lock_init(&ei->lock);
+       spin_lock_init(&ei->io_failure_lock);
        ei->outstanding_extents = 0;
        if (sb->s_magic != BTRFS_TEST_MAGIC)
                btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
@@ -8924,12 +8777,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree);
        extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
-       extent_io_tree_init(fs_info, &ei->io_failure_tree,
-                           IO_TREE_INODE_IO_FAILURE, inode);
        extent_io_tree_init(fs_info, &ei->file_extent_tree,
-                           IO_TREE_INODE_FILE_EXTENT, inode);
-       ei->io_tree.track_uptodate = true;
-       ei->io_failure_tree.track_uptodate = true;
+                           IO_TREE_INODE_FILE_EXTENT, NULL);
+       ei->io_failure_tree = RB_ROOT;
        atomic_set(&ei->sync_writers, 0);
        mutex_init(&ei->log_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -8944,7 +8794,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_destroy_inode(struct inode *inode)
 {
-       btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+       btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
        kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 #endif
@@ -8959,6 +8809,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
        struct btrfs_ordered_extent *ordered;
        struct btrfs_inode *inode = BTRFS_I(vfs_inode);
        struct btrfs_root *root = inode->root;
+       bool freespace_inode;
 
        WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
        WARN_ON(vfs_inode->i_data.nrpages);
@@ -8980,6 +8831,12 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
        if (!root)
                return;
 
+       /*
+        * If this is a free space inode do not take the ordered extents lockdep
+        * map.
+        */
+       freespace_inode = btrfs_is_free_space_inode(inode);
+
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
                if (!ordered)
@@ -8988,6 +8845,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
                        btrfs_err(root->fs_info,
                                  "found ordered extent %llu %llu on inode cleanup",
                                  ordered->file_offset, ordered->num_bytes);
+
+                       if (!freespace_inode)
+                               btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
+
                        btrfs_remove_ordered_extent(inode, ordered);
                        btrfs_put_ordered_extent(ordered);
                        btrfs_put_ordered_extent(ordered);
@@ -8995,7 +8856,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode)
        }
        btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
-       btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+       btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
        btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
        btrfs_put_root(inode->root);
 }
@@ -10008,7 +9869,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                                       struct btrfs_trans_handle *trans)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
@@ -10064,11 +9924,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                        break;
                }
 
-               btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
-                                       cur_offset + ins.offset -1, 0);
-
                em = alloc_extent_map();
                if (!em) {
+                       btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
+                                           cur_offset + ins.offset - 1, false);
                        btrfs_set_inode_full_sync(BTRFS_I(inode));
                        goto next;
                }
@@ -10083,16 +9942,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                em->generation = trans->transid;
 
-               while (1) {
-                       write_lock(&em_tree->lock);
-                       ret = add_extent_mapping(em_tree, em, 1);
-                       write_unlock(&em_tree->lock);
-                       if (ret != -EEXIST)
-                               break;
-                       btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
-                                               cur_offset + ins.offset - 1,
-                                               0);
-               }
+               ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
                free_extent_map(em);
 next:
                num_bytes -= ins.offset;
@@ -10346,7 +10196,7 @@ static ssize_t btrfs_encoded_read_inline(
        }
        read_extent_buffer(leaf, tmp, ptr, count);
        btrfs_release_path(path);
-       unlock_extent_cached(io_tree, start, lockend, cached_state);
+       unlock_extent(io_tree, start, lockend, cached_state);
        btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
        *unlocked = true;
 
@@ -10371,7 +10221,7 @@ struct btrfs_encoded_read_private {
 static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
                                            struct bio *bio, int mirror_num)
 {
-       struct btrfs_encoded_read_private *priv = bio->bi_private;
+       struct btrfs_encoded_read_private *priv = btrfs_bio(bio)->private;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        blk_status_t ret;
 
@@ -10389,7 +10239,7 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
 static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
 {
        const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
-       struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+       struct btrfs_encoded_read_private *priv = bbio->private;
        struct btrfs_inode *inode = priv->inode;
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        u32 sectorsize = fs_info->sectorsize;
@@ -10417,10 +10267,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
        return BLK_STS_OK;
 }
 
-static void btrfs_encoded_read_endio(struct bio *bio)
+static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
 {
-       struct btrfs_encoded_read_private *priv = bio->bi_private;
-       struct btrfs_bio *bbio = btrfs_bio(bio);
+       struct btrfs_encoded_read_private *priv = bbio->private;
        blk_status_t status;
 
        status = btrfs_encoded_read_verify_csum(bbio);
@@ -10438,7 +10287,7 @@ static void btrfs_encoded_read_endio(struct bio *bio)
        if (!atomic_dec_return(&priv->pending))
                wake_up(&priv->wait);
        btrfs_bio_free_csum(bbio);
-       bio_put(bio);
+       bio_put(&bbio->bio);
 }
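
These hunks follow the btrfs_bio interface change used throughout the series: the end_io callback and its private pointer are handed to btrfs_bio_alloc()/btrfs_bio_clone_partial() at setup time, the callback receives the struct btrfs_bio, and the private data is read back through it. A minimal sketch of that shape; my_ctx and my_end_io are illustrative names, not part of this diff.

struct my_ctx {
	atomic_t pending;		/* illustrative caller state */
	blk_status_t status;
	wait_queue_head_t wait;
};

static void my_end_io(struct btrfs_bio *bbio)
{
	struct my_ctx *ctx = bbio->private;

	if (bbio->bio.bi_status)
		ctx->status = bbio->bio.bi_status;
	if (atomic_dec_and_test(&ctx->pending))
		wake_up(&ctx->wait);
	bio_put(&bbio->bio);
}

/* submission side: end_io and private are fixed at allocation time */
struct bio *bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, my_end_io, ctx);

bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
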
 
 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
@@ -10485,12 +10334,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
                        size_t bytes = min_t(u64, remaining, PAGE_SIZE);
 
                        if (!bio) {
-                               bio = btrfs_bio_alloc(BIO_MAX_VECS);
+                               bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ,
+                                                     btrfs_encoded_read_endio,
+                                                     &priv);
                                bio->bi_iter.bi_sector =
                                        (disk_bytenr + cur) >> SECTOR_SHIFT;
-                               bio->bi_end_io = btrfs_encoded_read_endio;
-                               bio->bi_private = &priv;
-                               bio->bi_opf = REQ_OP_READ;
                        }
 
                        if (!bytes ||
@@ -10551,7 +10399,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
        if (ret)
                goto out;
 
-       unlock_extent_cached(io_tree, start, lockend, cached_state);
+       unlock_extent(io_tree, start, lockend, cached_state);
        btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
        *unlocked = true;
 
@@ -10621,13 +10469,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
                                               lockend - start + 1);
                if (ret)
                        goto out_unlock_inode;
-               lock_extent_bits(io_tree, start, lockend, &cached_state);
+               lock_extent(io_tree, start, lockend, &cached_state);
                ordered = btrfs_lookup_ordered_range(inode, start,
                                                     lockend - start + 1);
                if (!ordered)
                        break;
                btrfs_put_ordered_extent(ordered);
-               unlock_extent_cached(io_tree, start, lockend, &cached_state);
+               unlock_extent(io_tree, start, lockend, &cached_state);
                cond_resched();
        }
 
@@ -10701,7 +10549,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
        em = NULL;
 
        if (disk_bytenr == EXTENT_MAP_HOLE) {
-               unlock_extent_cached(io_tree, start, lockend, &cached_state);
+               unlock_extent(io_tree, start, lockend, &cached_state);
                btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
                unlocked = true;
                ret = iov_iter_zero(count, iter);
@@ -10722,7 +10570,7 @@ out_em:
        free_extent_map(em);
 out_unlock_extent:
        if (!unlocked)
-               unlock_extent_cached(io_tree, start, lockend, &cached_state);
+               unlock_extent(io_tree, start, lockend, &cached_state);
 out_unlock_inode:
        if (!unlocked)
                btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
@@ -10860,14 +10708,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
                                                    end >> PAGE_SHIFT);
                if (ret)
                        goto out_pages;
-               lock_extent_bits(io_tree, start, end, &cached_state);
+               lock_extent(io_tree, start, end, &cached_state);
                ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
                if (!ordered &&
                    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
                        break;
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
-               unlock_extent_cached(io_tree, start, end, &cached_state);
+               unlock_extent(io_tree, start, end, &cached_state);
                cond_resched();
        }
 
@@ -10921,7 +10769,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
                                       (1 << BTRFS_ORDERED_COMPRESSED),
                                       compression);
        if (ret) {
-               btrfs_drop_extent_cache(inode, start, end, 0);
+               btrfs_drop_extent_map_range(inode, start, end, false);
                goto out_free_reserved;
        }
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -10929,7 +10777,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
        if (start + encoded->len > inode->vfs_inode.i_size)
                i_size_write(&inode->vfs_inode, start + encoded->len);
 
-       unlock_extent_cached(io_tree, start, end, &cached_state);
+       unlock_extent(io_tree, start, end, &cached_state);
 
        btrfs_delalloc_release_extents(inode, num_bytes);
 
@@ -10960,7 +10808,7 @@ out_free_data_space:
        if (!extent_reserved)
                btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
 out_unlock:
-       unlock_extent_cached(io_tree, start, end, &cached_state);
+       unlock_extent(io_tree, start, end, &cached_state);
 out_pages:
        for (i = 0; i < nr_pages; i++) {
                if (pages[i])
@@ -11201,7 +11049,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 
        isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
 
-       lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
+       lock_extent(io_tree, 0, isize - 1, &cached_state);
        start = 0;
        while (start < isize) {
                u64 logical_block_start, physical_block_start;
@@ -11242,7 +11090,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
                free_extent_map(em);
                em = NULL;
 
-               ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
+               ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
                if (ret < 0) {
                        goto out;
                } else if (ret) {
@@ -11338,7 +11186,7 @@ out:
        if (!IS_ERR_OR_NULL(em))
                free_extent_map(em);
 
-       unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);
+       unlock_extent(io_tree, 0, isize - 1, &cached_state);
 
        if (ret)
                btrfs_swap_deactivate(file);
index fe0cc81..d5dd8be 100644 (file)
@@ -1218,10 +1218,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 
                /* get the big lock and read metadata off disk */
                if (!locked)
-                       lock_extent_bits(io_tree, start, end, &cached);
+                       lock_extent(io_tree, start, end, &cached);
                em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
                if (!locked)
-                       unlock_extent_cached(io_tree, start, end, &cached);
+                       unlock_extent(io_tree, start, end, &cached);
 
                if (IS_ERR(em))
                        return NULL;
@@ -1333,10 +1333,10 @@ again:
        while (1) {
                struct btrfs_ordered_extent *ordered;
 
-               lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
+               lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
                ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
-               unlock_extent_cached(&inode->io_tree, page_start, page_end,
-                                    &cached_state);
+               unlock_extent(&inode->io_tree, page_start, page_end,
+                             &cached_state);
                if (!ordered)
                        break;
 
@@ -1616,7 +1616,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
                return ret;
        clear_extent_bit(&inode->io_tree, start, start + len - 1,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
-                        EXTENT_DEFRAG, 0, 0, cached_state);
+                        EXTENT_DEFRAG, cached_state);
        set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
 
        /* Update the page status */
@@ -1666,9 +1666,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
                wait_on_page_writeback(pages[i]);
 
        /* Lock the pages range */
-       lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
-                        (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
-                        &cached_state);
+       lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+                   (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+                   &cached_state);
        /*
         * Now we have a consistent view about the extent map, re-check
         * which range really needs to be defragged.
@@ -1694,9 +1694,9 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
                kfree(entry);
        }
 unlock_extent:
-       unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
-                            (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
-                            &cached_state);
+       unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+                     (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+                     &cached_state);
 free_pages:
        for (i = 0; i < nr_pages; i++) {
                if (pages[i]) {
index 9063072..0eab3cb 100644 (file)
@@ -286,6 +286,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 }
 
 /*
+ * Loop around taking references on and locking the root node of the tree in
+ * nowait mode until we either end up with a read lock on the root node or
+ * give up with -EAGAIN if the lock cannot be taken without blocking.
+ *
+ * Return: root extent buffer with read lock held or -EAGAIN.
+ */
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
+{
+       struct extent_buffer *eb;
+
+       while (1) {
+               eb = btrfs_root_node(root);
+               if (!btrfs_try_tree_read_lock(eb)) {
+                       free_extent_buffer(eb);
+                       return ERR_PTR(-EAGAIN);
+               }
+               if (eb == root->node)
+                       break;
+               btrfs_tree_read_unlock(eb);
+               free_extent_buffer(eb);
+       }
+       return eb;
+}
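
A plausible caller pattern for the new helper, as a minimal sketch of how a nowait tree search could pick its starting extent buffer; the surrounding code is illustrative, only the two root-node helpers come from this file.

struct extent_buffer *b;

if (path->nowait) {
	b = btrfs_try_read_lock_root_node(root);
	if (IS_ERR(b))
		return b;	/* -EAGAIN, the caller must not block */
} else {
	b = btrfs_read_lock_root_node(root);
}
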
+
+/*
  * DREW locks
  * ==========
  *
index ab268be..490c7a7 100644 (file)
@@ -94,6 +94,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb);
 int btrfs_try_tree_write_lock(struct extent_buffer *eb);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
 
 #ifdef CONFIG_BTRFS_DEBUG
 static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
index 340f995..f9850ed 100644 (file)
@@ -88,6 +88,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
        return NULL;
 }
 
+/*
+ * Search @root for the first entry that starts at or comes after @bytenr.
+ *
+ * @root:      the root to search.
+ * @bytenr:    bytenr to search from.
+ *
+ * Return the rb_node that starts at or after @bytenr.  If there is no entry
+ * at or after @bytenr, return NULL.
+ */
+static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+                                                    u64 bytenr)
+{
+       struct rb_node *node = root->rb_node, *ret = NULL;
+       struct rb_simple_node *entry, *ret_entry = NULL;
+
+       while (node) {
+               entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+               if (bytenr < entry->bytenr) {
+                       if (!ret || entry->bytenr < ret_entry->bytenr) {
+                               ret = node;
+                               ret_entry = entry;
+                       }
+
+                       node = node->rb_left;
+               } else if (bytenr > entry->bytenr) {
+                       node = node->rb_right;
+               } else {
+                       return node;
+               }
+       }
+
+       return ret;
+}
+
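
A minimal usage sketch for the new helper: position on the first entry at or after a bytenr and iterate forward with the regular rbtree API; the processing step is left abstract.

struct rb_node *node;

for (node = rb_simple_search_first(root, bytenr); node; node = rb_next(node)) {
	struct rb_simple_node *entry;

	entry = rb_entry(node, struct rb_simple_node, rb_node);
	/* entry->bytenr >= bytenr here; process the entry */
}
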
 static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
                                               struct rb_node *node)
 {
index 1952ac8..e54f828 100644 (file)
@@ -524,7 +524,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct rb_node *node;
        bool pending;
+       bool freespace_inode;
 
+       /*
+        * If this is a free space inode the thread has not acquired the ordered
+        * extents lockdep map.
+        */
+       freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
+
+       btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
        /* This is paired with btrfs_add_ordered_extent. */
        spin_lock(&btrfs_inode->lock);
        btrfs_mod_outstanding_extents(btrfs_inode, -1);
@@ -580,6 +588,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
                }
        }
 
+       btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
+
        spin_lock(&root->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
        root->nr_ordered_extents--;
@@ -594,6 +604,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
        }
        spin_unlock(&root->ordered_extent_lock);
        wake_up(&entry->wait);
+       if (!freespace_inode)
+               btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
 }
 
 static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
@@ -712,10 +724,17 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
        u64 start = entry->file_offset;
        u64 end = start + entry->num_bytes - 1;
        struct btrfs_inode *inode = BTRFS_I(entry->inode);
+       bool freespace_inode;
 
        trace_btrfs_ordered_extent_start(inode, entry);
 
        /*
+        * If this is a free space inode do not take the ordered extents lockdep
+        * map.
+        */
+       freespace_inode = btrfs_is_free_space_inode(inode);
+
+       /*
         * pages in the range can be dirty, clean or writeback.  We
         * start IO on any dirty ones so the wait doesn't stall waiting
         * for the flusher thread to find them
@@ -723,6 +742,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
        if (wait) {
+               if (!freespace_inode)
+                       btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                 &entry->flags));
        }
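
Taken together with the btrfs_lockdep_acquire()/btrfs_lockdep_release() calls in btrfs_remove_ordered_extent() above, the annotation pattern these hunks follow looks roughly like this; it is condensed from hunks in this diff, not new code.

/* waiter side (btrfs_start_ordered_extent): declare we may sleep on the event */
if (!freespace_inode)
	btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));

/* completing side: hold the map across the removal that does the wake_up */
if (!freespace_inode)
	btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
btrfs_remove_ordered_extent(inode, ordered);	/* drops the map after wake_up */
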
@@ -1022,7 +1043,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
                cachedp = cached_state;
 
        while (1) {
-               lock_extent_bits(&inode->io_tree, start, end, cachedp);
+               lock_extent(&inode->io_tree, start, end, cachedp);
                ordered = btrfs_lookup_ordered_range(inode, start,
                                                     end - start + 1);
                if (!ordered) {
@@ -1035,12 +1056,37 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
                                refcount_dec(&cache->refs);
                        break;
                }
-               unlock_extent_cached(&inode->io_tree, start, end, cachedp);
+               unlock_extent(&inode->io_tree, start, end, cachedp);
                btrfs_start_ordered_extent(ordered, 1);
                btrfs_put_ordered_extent(ordered);
        }
 }
 
+/*
+ * Try to lock the passed range without blocking and make sure no ordered
+ * extents are pending in it.
+ *
+ * Return true if the range was locked and no ordered extents were found,
+ * otherwise false (the range is left unlocked in that case).
+ */
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
+{
+       struct btrfs_ordered_extent *ordered;
+
+       if (!try_lock_extent(&inode->io_tree, start, end))
+               return false;
+
+       ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
+       if (!ordered)
+               return true;
+
+       btrfs_put_ordered_extent(ordered);
+       unlock_extent(&inode->io_tree, start, end, NULL);
+
+       return false;
+}
+
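
A minimal sketch of the intended caller split between the blocking and nowait paths; the surrounding condition is illustrative, only the two helpers come from this file.

if (nowait) {
	if (!btrfs_try_lock_ordered_range(inode, start, end))
		return -EAGAIN;		/* locking or ordered IO would block */
} else {
	btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state);
}
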
 static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
                                u64 len)
 {
index 87792f8..f59f2db 100644 (file)
@@ -160,18 +160,6 @@ struct btrfs_ordered_extent {
        struct block_device *bdev;
 };
 
-/*
- * calculates the total size you need to allocate for an ordered sum
- * structure spanning 'bytes' in the file
- */
-static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
-                                        unsigned long bytes)
-{
-       int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
-
-       return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
-}
-
 static inline void
 btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 {
@@ -218,6 +206,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
 void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
                                        u64 end,
                                        struct extent_state **cached_state);
+bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end);
 int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
                               u64 post);
 int __init ordered_data_init(void);
index a2ec8ec..055a631 100644 (file)
@@ -270,11 +270,8 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 ino = btrfs_ino(BTRFS_I(inode));
-       int ret;
-
-       ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
 
-       return ret;
+       return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
 }
 
 static int prop_compression_validate(const struct btrfs_inode *inode,
index db723c0..9334c31 100644 (file)
@@ -275,7 +275,7 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p
 }
 
 /*
- * Add relation specified by two qgoup ids.
+ * Add relation specified by two qgroup ids.
  *
  * Must be called with qgroup_lock held.
  *
@@ -333,6 +333,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 }
 #endif
 
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+{
+       fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
+                                 BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+                                 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+}
+
 /*
  * The full config is read in one go, only called from open_ctree()
  * It doesn't use any locking, as at this point we're still single-threaded
@@ -401,7 +408,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
                        }
                        if (btrfs_qgroup_status_generation(l, ptr) !=
                            fs_info->generation) {
-                               flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+                               qgroup_mark_inconsistent(fs_info);
                                btrfs_err(fs_info,
                                        "qgroup generation mismatch, marked as inconsistent");
                        }
@@ -419,7 +426,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
                if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
                    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
                        btrfs_err(fs_info, "inconsistent qgroup config");
-                       flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+                       qgroup_mark_inconsistent(fs_info);
                }
                if (!qgroup) {
                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
@@ -878,7 +885,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
        l = path->nodes[0];
        slot = path->slots[0];
        ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
-       btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
+       btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags &
+                                     BTRFS_QGROUP_STATUS_FLAGS_MASK);
        btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
        btrfs_set_qgroup_status_rescan(l, ptr,
                                fs_info->qgroup_rescan_progress.objectid);
@@ -1052,7 +1060,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
        btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
        fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
                                BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
-       btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
+       btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags &
+                                     BTRFS_QGROUP_STATUS_FLAGS_MASK);
        btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
 
        btrfs_mark_buffer_dirty(leaf);
@@ -1174,6 +1183,21 @@ out_add_root:
                fs_info->qgroup_rescan_running = true;
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
+       } else {
+               /*
+                * We have set both BTRFS_FS_QUOTA_ENABLED and
+                * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+                * -EINPROGRESS. That can happen because someone started the
+                * rescan worker by calling quota rescan ioctl before we
+                * attempted to initialize the rescan worker. Failure due to
+                * quotas disabled in the meanwhile is not possible, because
+                * we are holding a write lock on fs_info->subvol_sem, which
+                * is also acquired when disabling quotas.
+                * Ignore such error, and any other error would need to undo
+                * everything we did in the transaction we just committed.
+                */
+               ASSERT(ret == -EINPROGRESS);
+               ret = 0;
        }
 
 out_free_path:
@@ -1255,6 +1279,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
+       fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
        spin_unlock(&fs_info->qgroup_lock);
 
        btrfs_free_qgroup_config(fs_info);
@@ -1717,7 +1742,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
 
        ret = update_qgroup_limit_item(trans, qgroup);
        if (ret) {
-               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               qgroup_mark_inconsistent(fs_info);
                btrfs_info(fs_info, "unable to update quota limit for %llu",
                       qgroupid);
        }
@@ -1790,10 +1815,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
         */
        ASSERT(trans != NULL);
 
+       if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+               return 0;
+
        ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
                                   true);
        if (ret < 0) {
-               trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               qgroup_mark_inconsistent(trans->fs_info);
                btrfs_warn(trans->fs_info,
 "error accounting new delayed refs extent (err code: %d), quota inconsistent",
                        ret);
@@ -2269,7 +2297,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 out:
        btrfs_free_path(dst_path);
        if (ret < 0)
-               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               qgroup_mark_inconsistent(fs_info);
        return ret;
 }
 
@@ -2280,6 +2308,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret = 0;
        int level;
+       u8 drop_subtree_thres;
        struct extent_buffer *eb = root_eb;
        struct btrfs_path *path = NULL;
 
@@ -2289,6 +2318,23 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return 0;
 
+       spin_lock(&fs_info->qgroup_lock);
+       drop_subtree_thres = fs_info->qgroup_drop_subtree_thres;
+       spin_unlock(&fs_info->qgroup_lock);
+
+       /*
+        * This function only gets called during snapshot drop. If we hit a
+        * high node here, it means we are going to change ownership for quite
+        * a lot of extents, which will greatly slow down
+        * btrfs_commit_transaction().
+        *
+        * So if the subtree root is at or above the configured threshold, just
+        * skip the accounting and mark the qgroups inconsistent.
+        */
+       if (root_level >= drop_subtree_thres) {
+               qgroup_mark_inconsistent(fs_info);
+               return 0;
+       }
+
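
qgroup_drop_subtree_thres defaults to BTRFS_MAX_LEVEL, so accounting is never skipped unless the admin lowers it. Assuming the threshold is exposed via sysfs under the per-filesystem qgroups directory, a userspace sketch of lowering it before deleting a large snapshot could look like the following; the exact file name is an assumption here and should be verified on the running kernel, and a qgroup rescan is needed afterwards once qgroups are marked inconsistent.

#include <stdio.h>

/*
 * Sketch: lower the subtree-drop accounting threshold to 3 so that deleting a
 * snapshot skips exact qgroup accounting for subtrees rooted at level >= 3.
 * The sysfs path is an assumption; replace <FSID> with the filesystem UUID.
 */
int main(void)
{
	FILE *f = fopen("/sys/fs/btrfs/<FSID>/qgroups/drop_subtree_threshold", "w");

	if (!f)
		return 1;
	fprintf(f, "3\n");
	return fclose(f) ? 1 : 0;
}
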
        if (!extent_buffer_uptodate(root_eb)) {
                ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
                if (ret)
@@ -2604,7 +2650,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
         * If quotas get disabled meanwhile, the resources need to be freed and
         * we can't just exit here.
         */
-       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+           fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
                goto out_free;
 
        if (new_roots) {
@@ -2700,7 +2747,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
                num_dirty_extents++;
                trace_btrfs_qgroup_account_extents(fs_info, record);
 
-               if (!ret) {
+               if (!ret && !(fs_info->qgroup_flags &
+                             BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
                        /*
                         * Old roots should be searched when inserting qgroup
                         * extent record
@@ -2773,12 +2821,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
                spin_unlock(&fs_info->qgroup_lock);
                ret = update_qgroup_info_item(trans, qgroup);
                if (ret)
-                       fs_info->qgroup_flags |=
-                                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+                       qgroup_mark_inconsistent(fs_info);
                ret = update_qgroup_limit_item(trans, qgroup);
                if (ret)
-                       fs_info->qgroup_flags |=
-                                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+                       qgroup_mark_inconsistent(fs_info);
                spin_lock(&fs_info->qgroup_lock);
        }
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
@@ -2789,7 +2835,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
 
        ret = update_qgroup_status_item(trans);
        if (ret)
-               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               qgroup_mark_inconsistent(fs_info);
 
        return ret;
 }
@@ -2907,7 +2953,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
 
                ret = update_qgroup_limit_item(trans, dstgroup);
                if (ret) {
-                       fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+                       qgroup_mark_inconsistent(fs_info);
                        btrfs_info(fs_info,
                                   "unable to update quota limit for %llu",
                                   dstgroup->qgroupid);
@@ -3013,7 +3059,7 @@ out:
        if (!committing)
                mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (need_rescan)
-               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               qgroup_mark_inconsistent(fs_info);
        return ret;
 }
 
@@ -3286,7 +3332,8 @@ static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
 {
        return btrfs_fs_closing(fs_info) ||
                test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
-               !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+               !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+                         fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
 }
 
 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3351,7 +3398,8 @@ out:
        }
 
        mutex_lock(&fs_info->qgroup_rescan_lock);
-       if (!stopped)
+       if (!stopped ||
+           fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        if (trans) {
                ret = update_qgroup_status_item(trans);
@@ -3362,6 +3410,7 @@ out:
                }
        }
        fs_info->qgroup_rescan_running = false;
+       fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
        complete_all(&fs_info->qgroup_rescan_completion);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
 
@@ -3372,6 +3421,8 @@ out:
 
        if (stopped) {
                btrfs_info(fs_info, "qgroup scan paused");
+       } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
+               btrfs_info(fs_info, "qgroup scan cancelled");
        } else if (err >= 0) {
                btrfs_info(fs_info, "qgroup scan completed%s",
                        err > 0 ? " (inconsistency flag cleared)" : "");
@@ -3434,6 +3485,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 
        memset(&fs_info->qgroup_rescan_progress, 0,
                sizeof(fs_info->qgroup_rescan_progress));
+       fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
+                                  BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
        fs_info->qgroup_rescan_progress.objectid = progress_objectid;
        init_completion(&fs_info->qgroup_rescan_completion);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -4231,8 +4284,7 @@ out_unlock:
        spin_unlock(&blocks->lock);
 out:
        if (ret < 0)
-               fs_info->qgroup_flags |=
-                       BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               qgroup_mark_inconsistent(fs_info);
        return ret;
 }
 
@@ -4319,7 +4371,7 @@ out:
                btrfs_err_rl(fs_info,
                             "failed to account subtree at bytenr %llu: %d",
                             subvol_eb->start, ret);
-               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+               qgroup_mark_inconsistent(fs_info);
        }
        return ret;
 }
index 0c4dd2a..578c77e 100644 (file)
  *     subtree rescan for them.
  */
 
+#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN                (1UL << 3)
+#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING                (1UL << 4)
+
 /*
  * Record a dirty extent, and info qgroup to update quota on it
  * TODO: Use kmem cache to alloc it.
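The two RUNTIME flags above share the qgroup_flags word with the persisted BTRFS_QGROUP_STATUS_FLAG_* bits but are meant to stay in memory only: qgroup_rescan_init() clears them and the accounting paths test NO_ACCOUNTING before doing the expensive old-roots lookup. A minimal standalone sketch of keeping runtime-only bits out of what would be persisted; the mask and the status bit values are illustrative assumptions, not the on-disk format.

#include <stdio.h>

#define STATUS_FLAG_ON             (1UL << 0)   /* would be persisted */
#define STATUS_FLAG_INCONSISTENT   (1UL << 1)   /* would be persisted */
#define RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)   /* in-memory only */
#define RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)   /* in-memory only */
#define RUNTIME_FLAGS (RUNTIME_FLAG_CANCEL_RESCAN | RUNTIME_FLAG_NO_ACCOUNTING)

int main(void)
{
        unsigned long qgroup_flags = STATUS_FLAG_ON | RUNTIME_FLAG_NO_ACCOUNTING;

        /* Accounting path: skip the expensive work while the runtime bit is set. */
        if (qgroup_flags & RUNTIME_FLAG_NO_ACCOUNTING)
                printf("extent accounting skipped, rescan needed later\n");

        /* Status-item update: only the persisted bits may reach disk. */
        printf("on-disk flags: 0x%lx\n", qgroup_flags & ~RUNTIME_FLAGS);
        return 0;
}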
index 2feb5c2..f6395e8 100644 (file)
@@ -275,7 +275,6 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
        /* Also inherit the bitmaps from @victim. */
        bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
                  dest->stripe_nsectors);
-       dest->generic_bio_cnt += victim->generic_bio_cnt;
        bio_list_init(&victim->bio_list);
 }
 
@@ -814,8 +813,6 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
        struct bio *cur = bio_list_get(&rbio->bio_list);
        struct bio *extra;
 
-       if (rbio->generic_bio_cnt)
-               btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
        /*
         * Clear the data bitmap, as the rbio may be cached for later usage.
         * do this before unlock_stripe() so there will be no new bio
@@ -946,6 +943,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
        spin_lock_init(&rbio->bio_list_lock);
        INIT_LIST_HEAD(&rbio->stripe_cache);
        INIT_LIST_HEAD(&rbio->hash_list);
+       btrfs_get_bioc(bioc);
        rbio->bioc = bioc;
        rbio->nr_pages = num_pages;
        rbio->nr_sectors = num_sectors;
@@ -1813,15 +1811,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
 
        rbio = alloc_rbio(fs_info, bioc);
        if (IS_ERR(rbio)) {
-               btrfs_put_bioc(bioc);
                ret = PTR_ERR(rbio);
-               goto out_dec_counter;
+               goto fail;
        }
        rbio->operation = BTRFS_RBIO_WRITE;
        rbio_add_bio(rbio, bio);
 
-       rbio->generic_bio_cnt = 1;
-
        /*
         * don't plug on full rbios, just get them out the door
         * as quickly as we can
@@ -1829,7 +1824,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
        if (rbio_is_full(rbio)) {
                ret = full_stripe_write(rbio);
                if (ret)
-                       goto out_dec_counter;
+                       goto fail;
                return;
        }
 
@@ -1844,13 +1839,12 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
        } else {
                ret = __raid56_parity_write(rbio);
                if (ret)
-                       goto out_dec_counter;
+                       goto fail;
        }
 
        return;
 
-out_dec_counter:
-       btrfs_bio_counter_dec(fs_info);
+fail:
        bio->bi_status = errno_to_blk_status(ret);
        bio_endio(bio);
 }
@@ -2198,18 +2192,11 @@ cleanup:
  * of the drive.
  */
 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
-                          int mirror_num, bool generic_io)
+                          int mirror_num)
 {
        struct btrfs_fs_info *fs_info = bioc->fs_info;
        struct btrfs_raid_bio *rbio;
 
-       if (generic_io) {
-               ASSERT(bioc->mirror_num == mirror_num);
-               btrfs_bio(bio)->mirror_num = mirror_num;
-       } else {
-               btrfs_get_bioc(bioc);
-       }
-
        rbio = alloc_rbio(fs_info, bioc);
        if (IS_ERR(rbio)) {
                bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
@@ -2225,14 +2212,11 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
                           __func__, bio->bi_iter.bi_sector << 9,
                           (u64)bio->bi_iter.bi_size, bioc->map_type);
-               kfree(rbio);
+               __free_raid_bio(rbio);
                bio->bi_status = BLK_STS_IOERR;
                goto out_end_bio;
        }
 
-       if (generic_io)
-               rbio->generic_bio_cnt = 1;
-
        /*
         * Loop retry:
         * for 'mirror == 2', reconstruct from all other stripes.
@@ -2261,8 +2245,6 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
        return;
 
 out_end_bio:
-       btrfs_bio_counter_dec(fs_info);
-       btrfs_put_bioc(bioc);
        bio_endio(bio);
 }
 
@@ -2326,13 +2308,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
        ASSERT(i < rbio->real_stripes);
 
        bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
-
-       /*
-        * We have already increased bio_counter when getting bioc, record it
-        * so we can free it at rbio_orig_end_io().
-        */
-       rbio->generic_bio_cnt = 1;
-
        return rbio;
 }
 
@@ -2772,12 +2747,6 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
                return NULL;
        }
 
-       /*
-        * When we get bioc, we have already increased bio_counter, record it
-        * so we can free it at rbio_orig_end_io()
-        */
-       rbio->generic_bio_cnt = 1;
-
        return rbio;
 }
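With alloc_rbio() now taking its own reference on the io context (the btrfs_get_bioc() added just before rbio->bioc = bioc), the generic_bio_cnt bookkeeping goes away: callers keep a plain get/put pairing and the error paths collapse into a single fail label. A standalone sketch of that ownership rule, using illustrative types rather than the btrfs ones.

#include <stdio.h>
#include <stdlib.h>

struct ctx { int refs; };                     /* stands in for the shared io context */

static struct ctx *ctx_get(struct ctx *c) { c->refs++; return c; }

static void ctx_put(struct ctx *c)
{
        if (--c->refs == 0)
                free(c);
}

struct rbio { struct ctx *ctx; };

static struct rbio *rbio_alloc(struct ctx *c)
{
        struct rbio *r = calloc(1, sizeof(*r));

        if (!r)
                return NULL;
        r->ctx = ctx_get(c);                  /* the rbio owns its own reference */
        return r;
}

static void rbio_free(struct rbio *r)
{
        ctx_put(r->ctx);                      /* dropped together with the rbio */
        free(r);
}

int main(void)
{
        struct ctx *c = calloc(1, sizeof(*c));
        struct rbio *r;

        if (!c)
                return 1;
        c->refs = 1;                          /* caller's reference */
        r = rbio_alloc(c);
        ctx_put(c);                           /* caller always drops its own ref */
        if (r)
                rbio_free(r);                 /* error path needs no special put */
        return 0;
}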
 
index 6f48f9e..91d5c0a 100644 (file)
@@ -89,8 +89,6 @@ struct btrfs_raid_bio {
         */
        int bio_list_bytes;
 
-       int generic_bio_cnt;
-
        refcount_t refs;
 
        atomic_t stripes_pending;
@@ -166,7 +164,7 @@ static inline int nr_data_stripes(const struct map_lookup *map)
 struct btrfs_device;
 
 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
-                          int mirror_num, bool generic_io);
+                          int mirror_num);
 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
 
 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
index 9acf47b..f50586f 100644 (file)
@@ -92,7 +92,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
 
        clear_extent_bit(&inode->io_tree, file_offset, range_end,
                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-                        0, 0, NULL);
+                        NULL);
        ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
        if (ret)
                goto out_unlock;
@@ -615,8 +615,8 @@ out:
 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
                                       struct inode *inode2, u64 loff2, u64 len)
 {
-       unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
-       unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+       unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL);
+       unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL);
 }
 
 static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
@@ -634,8 +634,8 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
                swap(range1_end, range2_end);
        }
 
-       lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end);
-       lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end);
+       lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL);
+       lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL);
 
        btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
        btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
index 45c02ab..666a37a 100644 (file)
@@ -1124,10 +1124,10 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                                if (!ret)
                                        continue;
 
-                               btrfs_drop_extent_cache(BTRFS_I(inode),
-                                               key.offset,     end, 1);
+                               btrfs_drop_extent_map_range(BTRFS_I(inode),
+                                                           key.offset, end, true);
                                unlock_extent(&BTRFS_I(inode)->io_tree,
-                                             key.offset, end);
+                                             key.offset, end, NULL);
                        }
                }
 
@@ -1566,9 +1566,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
                }
 
                /* the lock_extent waits for read_folio to complete */
-               lock_extent(&BTRFS_I(inode)->io_tree, start, end);
-               btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
-               unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+               lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+               btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
+               unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
        }
        return 0;
 }
@@ -2869,13 +2869,13 @@ static noinline_for_stack int prealloc_file_extent_cluster(
                else
                        end = cluster->end - offset;
 
-               lock_extent(&inode->io_tree, start, end);
+               lock_extent(&inode->io_tree, start, end, NULL);
                num_bytes = end + 1 - start;
                ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
                                                num_bytes, num_bytes,
                                                end + 1, &alloc_hint);
                cur_offset = end + 1;
-               unlock_extent(&inode->io_tree, start, end);
+               unlock_extent(&inode->io_tree, start, end, NULL);
                if (ret)
                        break;
        }
@@ -2890,7 +2890,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(
 static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
                                u64 start, u64 end, u64 block_start)
 {
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_map *em;
        int ret = 0;
 
@@ -2904,18 +2903,11 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
        em->block_start = block_start;
        set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
-       lock_extent(&BTRFS_I(inode)->io_tree, start, end);
-       while (1) {
-               write_lock(&em_tree->lock);
-               ret = add_extent_mapping(em_tree, em, 0);
-               write_unlock(&em_tree->lock);
-               if (ret != -EEXIST) {
-                       free_extent_map(em);
-                       break;
-               }
-               btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
-       }
-       unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
+       lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+       ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
+       unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
+       free_extent_map(em);
+
        return ret;
 }
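btrfs_replace_extent_map_range() replaces the open-coded loop that setup_relocation_extent_mapping() used to run; judging from the removed lines, the pattern is: try to insert the mapping, and on -EEXIST drop the conflicting cached range and retry. A standalone sketch of that retry shape with stub functions, not the btrfs API.

#include <errno.h>
#include <stdio.h>

static int conflicts = 1;                     /* pretend one stale mapping exists */

static int add_mapping(void)
{
        return conflicts ? -EEXIST : 0;       /* stub for the insert attempt */
}

static void drop_range(void)
{
        conflicts = 0;                        /* stub for dropping the cached range */
}

static int replace_mapping_range(void)
{
        int ret;

        while (1) {
                ret = add_mapping();
                if (ret != -EEXIST)
                        break;                /* inserted, or a real error */
                drop_range();                 /* evict whatever overlapped, retry */
        }
        return ret;
}

int main(void)
{
        printf("replace returned %d\n", replace_mapping_range());
        return 0;
}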
 
@@ -3006,7 +2998,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
                        goto release_page;
 
                /* Mark the range delalloc and dirty for later writeback */
-               lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+               lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
                ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
                                                clamped_end, 0, NULL);
                if (ret) {
@@ -3039,7 +3031,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
                                        boundary_start, boundary_end,
                                        EXTENT_BOUNDARY);
                }
-               unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
+               unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
                btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
                cur += clamped_len;
 
@@ -4339,7 +4331,7 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
        disk_bytenr = file_pos + inode->index_cnt;
        csum_root = btrfs_csum_root(fs_info, disk_bytenr);
        ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
-                                      disk_bytenr + len - 1, &list, 0);
+                                      disk_bytenr + len - 1, &list, 0, false);
        if (ret)
                goto out;
 
index d647cb2..e1f599d 100644 (file)
@@ -337,7 +337,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;
-       int err = 0;
        int ret;
 
        path = btrfs_alloc_path();
@@ -350,7 +349,6 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
 again:
        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
        if (ret < 0) {
-               err = ret;
                goto out;
        } else if (ret == 0) {
                leaf = path->nodes[0];
@@ -360,18 +358,18 @@ again:
                if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
                    (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
                    memcmp_extent_buffer(leaf, name, ptr, name_len)) {
-                       err = -ENOENT;
+                       ret = -ENOENT;
                        goto out;
                }
                *sequence = btrfs_root_ref_sequence(leaf, ref);
 
                ret = btrfs_del_item(trans, tree_root, path);
-               if (ret) {
-                       err = ret;
+               if (ret)
                        goto out;
-               }
-       } else
-               err = -ENOENT;
+       } else {
+               ret = -ENOENT;
+               goto out;
+       }
 
        if (key.type == BTRFS_ROOT_BACKREF_KEY) {
                btrfs_release_path(path);
@@ -383,7 +381,7 @@ again:
 
 out:
        btrfs_free_path(path);
-       return err;
+       return ret;
 }
 
 /*
index 3afe5fa..f260c53 100644 (file)
@@ -54,6 +54,8 @@ struct scrub_ctx;
  */
 #define SCRUB_MAX_SECTORS_PER_BLOCK    (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
 
+#define SCRUB_MAX_PAGES                        (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
+
 struct scrub_recover {
        refcount_t              refs;
        struct btrfs_io_context *bioc;
@@ -62,16 +64,12 @@ struct scrub_recover {
 
 struct scrub_sector {
        struct scrub_block      *sblock;
-       struct page             *page;
-       struct btrfs_device     *dev;
        struct list_head        list;
        u64                     flags;  /* extent flags */
        u64                     generation;
-       u64                     logical;
-       u64                     physical;
-       u64                     physical_for_dev_replace;
+       /* Offset in bytes within @sblock. */
+       u32                     offset;
        atomic_t                refs;
-       u8                      mirror_num;
        unsigned int            have_csum:1;
        unsigned int            io_error:1;
        u8                      csum[BTRFS_CSUM_SIZE];
@@ -94,8 +92,22 @@ struct scrub_bio {
 };
 
 struct scrub_block {
+       /*
+        * Each page will have its page::private used to record the logical
+        * bytenr.
+        */
+       struct page             *pages[SCRUB_MAX_PAGES];
        struct scrub_sector     *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
+       struct btrfs_device     *dev;
+       /* Logical bytenr of the sblock */
+       u64                     logical;
+       u64                     physical;
+       u64                     physical_for_dev_replace;
+       /* Length of sblock in bytes */
+       u32                     len;
        int                     sector_count;
+       int                     mirror_num;
+
        atomic_t                outstanding_sectors;
        refcount_t              refs; /* free mem on transition to zero */
        struct scrub_ctx        *sctx;
@@ -202,8 +214,174 @@ struct full_stripe_lock {
        struct mutex mutex;
 };
 
+#ifndef CONFIG_64BIT
+/* This structure is for architectures whose (void *) is smaller than u64 */
+struct scrub_page_private {
+       u64 logical;
+};
+#endif
+
+static int attach_scrub_page_private(struct page *page, u64 logical)
+{
+#ifdef CONFIG_64BIT
+       attach_page_private(page, (void *)logical);
+       return 0;
+#else
+       struct scrub_page_private *spp;
+
+       spp = kmalloc(sizeof(*spp), GFP_KERNEL);
+       if (!spp)
+               return -ENOMEM;
+       spp->logical = logical;
+       attach_page_private(page, (void *)spp);
+       return 0;
+#endif
+}
+
+static void detach_scrub_page_private(struct page *page)
+{
+#ifdef CONFIG_64BIT
+       detach_page_private(page);
+       return;
+#else
+       struct scrub_page_private *spp;
+
+       spp = detach_page_private(page);
+       kfree(spp);
+       return;
+#endif
+}
+
+static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
+                                            struct btrfs_device *dev,
+                                            u64 logical, u64 physical,
+                                            u64 physical_for_dev_replace,
+                                            int mirror_num)
+{
+       struct scrub_block *sblock;
+
+       sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+       if (!sblock)
+               return NULL;
+       refcount_set(&sblock->refs, 1);
+       sblock->sctx = sctx;
+       sblock->logical = logical;
+       sblock->physical = physical;
+       sblock->physical_for_dev_replace = physical_for_dev_replace;
+       sblock->dev = dev;
+       sblock->mirror_num = mirror_num;
+       sblock->no_io_error_seen = 1;
+       /*
+        * Scrub_block::pages will be allocated at alloc_scrub_sector() when
+        * the corresponding page is not allocated.
+        */
+       return sblock;
+}
+
+/*
+ * Allocate a new scrub sector and attach it to @sblock.
+ *
+ * Will also allocate new pages for @sblock if needed.
+ */
+static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
+                                              u64 logical, gfp_t gfp)
+{
+       const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
+       struct scrub_sector *ssector;
+
+       /* We must never have scrub_block exceed U32_MAX in size. */
+       ASSERT(logical - sblock->logical < U32_MAX);
+
+       ssector = kzalloc(sizeof(*ssector), gfp);
+       if (!ssector)
+               return NULL;
+
+       /* Allocate a new page if the slot is not allocated */
+       if (!sblock->pages[page_index]) {
+               int ret;
+
+               sblock->pages[page_index] = alloc_page(gfp);
+               if (!sblock->pages[page_index]) {
+                       kfree(ssector);
+                       return NULL;
+               }
+               ret = attach_scrub_page_private(sblock->pages[page_index],
+                               sblock->logical + (page_index << PAGE_SHIFT));
+               if (ret < 0) {
+                       kfree(ssector);
+                       __free_page(sblock->pages[page_index]);
+                       sblock->pages[page_index] = NULL;
+                       return NULL;
+               }
+       }
+
+       atomic_set(&ssector->refs, 1);
+       ssector->sblock = sblock;
+       /* The sector to be added should not be used */
+       ASSERT(sblock->sectors[sblock->sector_count] == NULL);
+       ssector->offset = logical - sblock->logical;
+
+       /* The sector count must be smaller than the limit */
+       ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK);
+
+       sblock->sectors[sblock->sector_count] = ssector;
+       sblock->sector_count++;
+       sblock->len += sblock->sctx->fs_info->sectorsize;
+
+       return ssector;
+}
+
+static struct page *scrub_sector_get_page(struct scrub_sector *ssector)
+{
+       struct scrub_block *sblock = ssector->sblock;
+       pgoff_t index;
+       /*
+        * When calling this function, ssector must be already attached to the
+        * parent sblock.
+        */
+       ASSERT(sblock);
+
+       /* The range should be inside the sblock range */
+       ASSERT(ssector->offset < sblock->len);
+
+       index = ssector->offset >> PAGE_SHIFT;
+       ASSERT(index < SCRUB_MAX_PAGES);
+       ASSERT(sblock->pages[index]);
+       ASSERT(PagePrivate(sblock->pages[index]));
+       return sblock->pages[index];
+}
+
+static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector)
+{
+       struct scrub_block *sblock = ssector->sblock;
+
+       /*
+        * When calling this function, ssector must be already attached to the
+        * parent sblock.
+        */
+       ASSERT(sblock);
+
+       /* The range should be inside the sblock range */
+       ASSERT(ssector->offset < sblock->len);
+
+       return offset_in_page(ssector->offset);
+}
+
+static char *scrub_sector_get_kaddr(struct scrub_sector *ssector)
+{
+       return page_address(scrub_sector_get_page(ssector)) +
+              scrub_sector_get_page_offset(ssector);
+}
+
+static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
+                               unsigned int len)
+{
+       return bio_add_page(bio, scrub_sector_get_page(ssector), len,
+                           scrub_sector_get_page_offset(ssector));
+}
+
 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
-                                    struct scrub_block *sblocks_for_recheck);
+                                    struct scrub_block *sblocks_for_recheck[]);
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                struct scrub_block *sblock,
                                int retry_failed_mirror);
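The helpers above replace the per-sector page pointer with offset arithmetic against sblock->pages: the page is pages[offset >> PAGE_SHIFT] and the in-page position is offset_in_page(offset). A tiny standalone check of that arithmetic, assuming 4 KiB sectors and 64 KiB pages so the in-page offsets are visible (on 4 KiB-page systems every sector simply starts a new page).

#include <stdio.h>

#define DEMO_PAGE_SHIFT 16                     /* assume 64 KiB pages for the demo */
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)

int main(void)
{
        unsigned int sectorsize = 4096;

        /* Sector offsets of a 64 KiB block inside its (hypothetical) sblock. */
        for (unsigned int offset = 0; offset < 16 * sectorsize; offset += sectorsize)
                printf("offset %5u -> page index %lu, in-page offset %lu\n",
                       offset, (unsigned long)(offset >> DEMO_PAGE_SHIFT),
                       (unsigned long)(offset & (DEMO_PAGE_SIZE - 1)));
        return 0;
}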
@@ -533,10 +711,8 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
        if (sctx->curr != -1) {
                struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
-               for (i = 0; i < sbio->sector_count; i++) {
-                       WARN_ON(!sbio->sectors[i]->page);
+               for (i = 0; i < sbio->sector_count; i++)
                        scrub_block_put(sbio->sectors[i]->sblock);
-               }
                bio_put(sbio->bio);
        }
 
@@ -726,15 +902,22 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
        int ret;
 
        WARN_ON(sblock->sector_count < 1);
-       dev = sblock->sectors[0]->dev;
+       dev = sblock->dev;
        fs_info = sblock->sctx->fs_info;
 
+       /* Super block error, no need to search extent tree. */
+       if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+               btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+                       errstr, rcu_str_deref(dev->name),
+                       sblock->physical);
+               return;
+       }
        path = btrfs_alloc_path();
        if (!path)
                return;
 
-       swarn.physical = sblock->sectors[0]->physical;
-       swarn.logical = sblock->sectors[0]->logical;
+       swarn.physical = sblock->physical;
+       swarn.logical = sblock->logical;
        swarn.errstr = errstr;
        swarn.dev = NULL;
 
@@ -804,13 +987,14 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
        struct scrub_ctx *sctx = sblock_to_check->sctx;
-       struct btrfs_device *dev;
+       struct btrfs_device *dev = sblock_to_check->dev;
        struct btrfs_fs_info *fs_info;
        u64 logical;
        unsigned int failed_mirror_index;
        unsigned int is_metadata;
        unsigned int have_csum;
-       struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+       /* One scrub_block for each mirror */
+       struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
        struct scrub_block *sblock_bad;
        int ret;
        int mirror_index;
@@ -825,22 +1009,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
        fs_info = sctx->fs_info;
        if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
                /*
-                * if we find an error in a super block, we just report it.
+                * If we find an error in a super block, we just report it.
                 * They will get written with the next transaction commit
                 * anyway
                 */
+               scrub_print_warning("super block error", sblock_to_check);
                spin_lock(&sctx->stat_lock);
                ++sctx->stat.super_errors;
                spin_unlock(&sctx->stat_lock);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
                return 0;
        }
-       logical = sblock_to_check->sectors[0]->logical;
-       BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
-       failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
+       logical = sblock_to_check->logical;
+       ASSERT(sblock_to_check->mirror_num);
+       failed_mirror_index = sblock_to_check->mirror_num - 1;
        is_metadata = !(sblock_to_check->sectors[0]->flags &
                        BTRFS_EXTENT_FLAG_DATA);
        have_csum = sblock_to_check->sectors[0]->have_csum;
-       dev = sblock_to_check->sectors[0]->dev;
 
        if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
                return 0;
@@ -902,17 +1087,28 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
         * repaired area is verified in order to correctly maintain
         * the statistics.
         */
-
-       sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
-                                     sizeof(*sblocks_for_recheck), GFP_KERNEL);
-       if (!sblocks_for_recheck) {
-               spin_lock(&sctx->stat_lock);
-               sctx->stat.malloc_errors++;
-               sctx->stat.read_errors++;
-               sctx->stat.uncorrectable_errors++;
-               spin_unlock(&sctx->stat_lock);
-               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
-               goto out;
+       for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+               /*
+                * Note: the two members refs and outstanding_sectors are not
+                * used in the blocks that are used for the recheck procedure.
+                *
+                * But alloc_scrub_block() will initialize sblock::refs anyway,
+                * so we can use scrub_block_put() to clean them up.
+                *
+                * And here we don't set up the physical/dev for the sblock yet;
+                * they will be correctly initialized in scrub_setup_recheck_block().
+                */
+               sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
+                                                       logical, 0, 0, mirror_index);
+               if (!sblocks_for_recheck[mirror_index]) {
+                       spin_lock(&sctx->stat_lock);
+                       sctx->stat.malloc_errors++;
+                       sctx->stat.read_errors++;
+                       sctx->stat.uncorrectable_errors++;
+                       spin_unlock(&sctx->stat_lock);
+                       btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+                       goto out;
+               }
        }
 
        /* Setup the context, map the logical blocks and alloc the sectors */
@@ -926,7 +1122,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                goto out;
        }
        BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
-       sblock_bad = sblocks_for_recheck + failed_mirror_index;
+       sblock_bad = sblocks_for_recheck[failed_mirror_index];
 
        /* build and submit the bios for the failed mirror, check checksums */
        scrub_recheck_block(fs_info, sblock_bad, 1);
@@ -1011,22 +1207,22 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
                        if (mirror_index >= BTRFS_MAX_MIRRORS)
                                break;
-                       if (!sblocks_for_recheck[mirror_index].sector_count)
+                       if (!sblocks_for_recheck[mirror_index]->sector_count)
                                break;
 
-                       sblock_other = sblocks_for_recheck + mirror_index;
+                       sblock_other = sblocks_for_recheck[mirror_index];
                } else {
                        struct scrub_recover *r = sblock_bad->sectors[0]->recover;
                        int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
 
                        if (mirror_index >= max_allowed)
                                break;
-                       if (!sblocks_for_recheck[1].sector_count)
+                       if (!sblocks_for_recheck[1]->sector_count)
                                break;
 
                        ASSERT(failed_mirror_index == 0);
-                       sblock_other = sblocks_for_recheck + 1;
-                       sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
+                       sblock_other = sblocks_for_recheck[1];
+                       sblock_other->mirror_num = 1 + mirror_index;
                }
 
                /* build and submit the bios, check checksums */
@@ -1097,12 +1293,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                        /* Try to find no-io-error sector in mirrors */
                        for (mirror_index = 0;
                             mirror_index < BTRFS_MAX_MIRRORS &&
-                            sblocks_for_recheck[mirror_index].sector_count > 0;
+                            sblocks_for_recheck[mirror_index]->sector_count > 0;
                             mirror_index++) {
-                               if (!sblocks_for_recheck[mirror_index].
+                               if (!sblocks_for_recheck[mirror_index]->
                                    sectors[sector_num]->io_error) {
-                                       sblock_other = sblocks_for_recheck +
-                                                      mirror_index;
+                                       sblock_other = sblocks_for_recheck[mirror_index];
                                        break;
                                }
                        }
@@ -1176,25 +1371,28 @@ did_not_correct_error:
        }
 
 out:
-       if (sblocks_for_recheck) {
-               for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
-                    mirror_index++) {
-                       struct scrub_block *sblock = sblocks_for_recheck +
-                                                    mirror_index;
-                       struct scrub_recover *recover;
-                       int i;
-
-                       for (i = 0; i < sblock->sector_count; i++) {
-                               sblock->sectors[i]->sblock = NULL;
-                               recover = sblock->sectors[i]->recover;
-                               if (recover) {
-                                       scrub_put_recover(fs_info, recover);
-                                       sblock->sectors[i]->recover = NULL;
-                               }
-                               scrub_sector_put(sblock->sectors[i]);
+       for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
+               struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
+               struct scrub_recover *recover;
+               int sector_index;
+
+               /* Not allocated, continue checking the next mirror */
+               if (!sblock)
+                       continue;
+
+               for (sector_index = 0; sector_index < sblock->sector_count;
+                    sector_index++) {
+                       /*
+                        * Here we just clean up the recover; each sector will be
+                        * properly cleaned up by the later scrub_block_put().
+                        */
+                       recover = sblock->sectors[sector_index]->recover;
+                       if (recover) {
+                               scrub_put_recover(fs_info, recover);
+                               sblock->sectors[sector_index]->recover = NULL;
                        }
                }
-               kfree(sblocks_for_recheck);
+               scrub_block_put(sblock);
        }
 
        ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
@@ -1244,12 +1442,12 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
 }
 
 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
-                                    struct scrub_block *sblocks_for_recheck)
+                                    struct scrub_block *sblocks_for_recheck[])
 {
        struct scrub_ctx *sctx = original_sblock->sctx;
        struct btrfs_fs_info *fs_info = sctx->fs_info;
+       u64 logical = original_sblock->logical;
        u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
-       u64 logical = original_sblock->sectors[0]->logical;
        u64 generation = original_sblock->sectors[0]->generation;
        u64 flags = original_sblock->sectors[0]->flags;
        u64 have_csum = original_sblock->sectors[0]->have_csum;
@@ -1264,11 +1462,6 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
        int nmirrors;
        int ret;
 
-       /*
-        * Note: the two members refs and outstanding_sectors are not used (and
-        * not set) in the blocks that are used for the recheck procedure.
-        */
-
        while (length > 0) {
                sublen = min_t(u64, length, fs_info->sectorsize);
                mapped_length = sublen;
@@ -1307,24 +1500,19 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                        struct scrub_block *sblock;
                        struct scrub_sector *sector;
 
-                       sblock = sblocks_for_recheck + mirror_index;
+                       sblock = sblocks_for_recheck[mirror_index];
                        sblock->sctx = sctx;
 
-                       sector = kzalloc(sizeof(*sector), GFP_NOFS);
+                       sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
                        if (!sector) {
-leave_nomem:
                                spin_lock(&sctx->stat_lock);
                                sctx->stat.malloc_errors++;
                                spin_unlock(&sctx->stat_lock);
                                scrub_put_recover(fs_info, recover);
                                return -ENOMEM;
                        }
-                       scrub_sector_get(sector);
-                       sblock->sectors[sector_index] = sector;
-                       sector->sblock = sblock;
                        sector->flags = flags;
                        sector->generation = generation;
-                       sector->logical = logical;
                        sector->have_csum = have_csum;
                        if (have_csum)
                                memcpy(sector->csum,
@@ -1339,21 +1527,20 @@ leave_nomem:
                                                      mirror_index,
                                                      &stripe_index,
                                                      &stripe_offset);
-                       sector->physical = bioc->stripes[stripe_index].physical +
-                                        stripe_offset;
-                       sector->dev = bioc->stripes[stripe_index].dev;
+                       /*
+                        * We're at the first sector, also populate @sblock
+                        * physical and dev.
+                        */
+                       if (sector_index == 0) {
+                               sblock->physical =
+                                       bioc->stripes[stripe_index].physical +
+                                       stripe_offset;
+                               sblock->dev = bioc->stripes[stripe_index].dev;
+                               sblock->physical_for_dev_replace =
+                                       original_sblock->physical_for_dev_replace;
+                       }
 
                        BUG_ON(sector_index >= original_sblock->sector_count);
-                       sector->physical_for_dev_replace =
-                               original_sblock->sectors[sector_index]->
-                               physical_for_dev_replace;
-                       /* For missing devices, dev->bdev is NULL */
-                       sector->mirror_num = mirror_index + 1;
-                       sblock->sector_count++;
-                       sector->page = alloc_page(GFP_NOFS);
-                       if (!sector->page)
-                               goto leave_nomem;
-
                        scrub_get_recover(recover);
                        sector->recover = recover;
                }
@@ -1377,11 +1564,11 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 {
        DECLARE_COMPLETION_ONSTACK(done);
 
-       bio->bi_iter.bi_sector = sector->logical >> 9;
+       bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
+                                SECTOR_SHIFT;
        bio->bi_private = &done;
        bio->bi_end_io = scrub_bio_wait_endio;
-       raid56_parity_recover(bio, sector->recover->bioc,
-                             sector->sblock->sectors[0]->mirror_num, false);
+       raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
 
        wait_for_completion_io(&done);
        return blk_status_to_errno(bio->bi_status);
@@ -1395,17 +1582,16 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
        int i;
 
        /* All sectors in sblock belong to the same stripe on the same device. */
-       ASSERT(first_sector->dev);
-       if (!first_sector->dev->bdev)
+       ASSERT(sblock->dev);
+       if (!sblock->dev->bdev)
                goto out;
 
-       bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
+       bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
 
        for (i = 0; i < sblock->sector_count; i++) {
                struct scrub_sector *sector = sblock->sectors[i];
 
-               WARN_ON(!sector->page);
-               bio_add_page(bio, sector->page, PAGE_SIZE, 0);
+               bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
        }
 
        if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
@@ -1449,16 +1635,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                struct bio bio;
                struct bio_vec bvec;
 
-               if (sector->dev->bdev == NULL) {
+               if (sblock->dev->bdev == NULL) {
                        sector->io_error = 1;
                        sblock->no_io_error_seen = 0;
                        continue;
                }
 
-               WARN_ON(!sector->page);
-               bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
-               bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
-               bio.bi_iter.bi_sector = sector->physical >> 9;
+               bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
+               bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
+               bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
+                                       SECTOR_SHIFT;
 
                btrfsic_check_bio(&bio);
                if (submit_bio_wait(&bio)) {
@@ -1475,7 +1661,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
 {
-       struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
+       struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
        int ret;
 
        ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -1521,30 +1707,29 @@ static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
        struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
        const u32 sectorsize = fs_info->sectorsize;
 
-       BUG_ON(sector_bad->page == NULL);
-       BUG_ON(sector_good->page == NULL);
        if (force_write || sblock_bad->header_error ||
            sblock_bad->checksum_error || sector_bad->io_error) {
                struct bio bio;
                struct bio_vec bvec;
                int ret;
 
-               if (!sector_bad->dev->bdev) {
+               if (!sblock_bad->dev->bdev) {
                        btrfs_warn_rl(fs_info,
                                "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
                        return -EIO;
                }
 
-               bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
-               bio.bi_iter.bi_sector = sector_bad->physical >> 9;
-               __bio_add_page(&bio, sector_good->page, sectorsize, 0);
+               bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
+               bio.bi_iter.bi_sector = (sblock_bad->physical +
+                                        sector_bad->offset) >> SECTOR_SHIFT;
+               ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
 
                btrfsic_check_bio(&bio);
                ret = submit_bio_wait(&bio);
                bio_uninit(&bio);
 
                if (ret) {
-                       btrfs_dev_stat_inc_and_print(sector_bad->dev,
+                       btrfs_dev_stat_inc_and_print(sblock_bad->dev,
                                BTRFS_DEV_STAT_WRITE_ERRS);
                        atomic64_inc(&fs_info->dev_replace.num_write_errors);
                        return -EIO;
@@ -1577,11 +1762,11 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
 
 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
 {
+       const u32 sectorsize = sblock->sctx->fs_info->sectorsize;
        struct scrub_sector *sector = sblock->sectors[sector_num];
 
-       BUG_ON(sector->page == NULL);
        if (sector->io_error)
-               clear_page(page_address(sector->page));
+               memset(scrub_sector_get_kaddr(sector), 0, sectorsize);
 
        return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
 }
@@ -1608,9 +1793,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
        return ret;
 }
 
+static void scrub_block_get(struct scrub_block *sblock)
+{
+       refcount_inc(&sblock->refs);
+}
+
 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
                                      struct scrub_sector *sector)
 {
+       struct scrub_block *sblock = sector->sblock;
        struct scrub_bio *sbio;
        int ret;
        const u32 sectorsize = sctx->fs_info->sectorsize;
@@ -1629,14 +1820,15 @@ again:
        }
        sbio = sctx->wr_curr_bio;
        if (sbio->sector_count == 0) {
-               ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
+               ret = fill_writer_pointer_gap(sctx, sector->offset +
+                                             sblock->physical_for_dev_replace);
                if (ret) {
                        mutex_unlock(&sctx->wr_lock);
                        return ret;
                }
 
-               sbio->physical = sector->physical_for_dev_replace;
-               sbio->logical = sector->logical;
+               sbio->physical = sblock->physical_for_dev_replace + sector->offset;
+               sbio->logical = sblock->logical + sector->offset;
                sbio->dev = sctx->wr_tgtdev;
                if (!sbio->bio) {
                        sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
@@ -1647,14 +1839,14 @@ again:
                sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
                sbio->status = 0;
        } else if (sbio->physical + sbio->sector_count * sectorsize !=
-                  sector->physical_for_dev_replace ||
+                  sblock->physical_for_dev_replace + sector->offset ||
                   sbio->logical + sbio->sector_count * sectorsize !=
-                  sector->logical) {
+                  sblock->logical + sector->offset) {
                scrub_wr_submit(sctx);
                goto again;
        }
 
-       ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+       ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
        if (ret != sectorsize) {
                if (sbio->sector_count < 1) {
                        bio_put(sbio->bio);
@@ -1668,6 +1860,13 @@ again:
 
        sbio->sectors[sbio->sector_count] = sector;
        scrub_sector_get(sector);
+       /*
+        * Since ssector no longer holds a page but uses sblock::pages, we
+        * have to ensure the sblock is not freed before our write bio
+        * finishes.
+        */
+       scrub_block_get(sector->sblock);
+
        sbio->sector_count++;
        if (sbio->sector_count == sctx->sectors_per_bio)
                scrub_wr_submit(sctx);
@@ -1729,8 +1928,14 @@ static void scrub_wr_bio_end_io_worker(struct work_struct *work)
                }
        }
 
-       for (i = 0; i < sbio->sector_count; i++)
+       /*
+        * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in
+        * endio we should put the sblock.
+        */
+       for (i = 0; i < sbio->sector_count; i++) {
+               scrub_block_put(sbio->sectors[i]->sblock);
                scrub_sector_put(sbio->sectors[i]);
+       }
 
        bio_put(sbio->bio);
        kfree(sbio);
@@ -1762,7 +1967,7 @@ static int scrub_checksum(struct scrub_block *sblock)
        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                ret = scrub_checksum_tree_block(sblock);
        else if (flags & BTRFS_EXTENT_FLAG_SUPER)
-               (void)scrub_checksum_super(sblock);
+               ret = scrub_checksum_super(sblock);
        else
                WARN_ON(1);
        if (ret)
@@ -1785,15 +1990,11 @@ static int scrub_checksum_data(struct scrub_block *sblock)
        if (!sector->have_csum)
                return 0;
 
-       kaddr = page_address(sector->page);
+       kaddr = scrub_sector_get_kaddr(sector);
 
        shash->tfm = fs_info->csum_shash;
        crypto_shash_init(shash);
 
-       /*
-        * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
-        * only contains one sector of data.
-        */
        crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
 
        if (memcmp(csum, sector->csum, fs_info->csum_size))
@@ -1826,7 +2027,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
        ASSERT(sblock->sector_count == num_sectors);
 
        sector = sblock->sectors[0];
-       kaddr = page_address(sector->page);
+       kaddr = scrub_sector_get_kaddr(sector);
        h = (struct btrfs_header *)kaddr;
        memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
 
@@ -1835,7 +2036,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
         * a) don't have an extent buffer and
         * b) the page is already kmapped
         */
-       if (sector->logical != btrfs_stack_header_bytenr(h))
+       if (sblock->logical != btrfs_stack_header_bytenr(h))
                sblock->header_error = 1;
 
        if (sector->generation != btrfs_stack_header_generation(h)) {
@@ -1856,7 +2057,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
                            sectorsize - BTRFS_CSUM_SIZE);
 
        for (i = 1; i < num_sectors; i++) {
-               kaddr = page_address(sblock->sectors[i]->page);
+               kaddr = scrub_sector_get_kaddr(sblock->sectors[i]);
                crypto_shash_update(shash, kaddr, sectorsize);
        }
 
@@ -1881,10 +2082,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
        BUG_ON(sblock->sector_count < 1);
        sector = sblock->sectors[0];
-       kaddr = page_address(sector->page);
+       kaddr = scrub_sector_get_kaddr(sector);
        s = (struct btrfs_super_block *)kaddr;
 
-       if (sector->logical != btrfs_super_bytenr(s))
+       if (sblock->logical != btrfs_super_bytenr(s))
                ++fail_cor;
 
        if (sector->generation != btrfs_super_generation(s))
@@ -1901,31 +2102,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
        if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
                ++fail_cor;
 
-       if (fail_cor + fail_gen) {
-               /*
-                * if we find an error in a super block, we just report it.
-                * They will get written with the next transaction commit
-                * anyway
-                */
-               spin_lock(&sctx->stat_lock);
-               ++sctx->stat.super_errors;
-               spin_unlock(&sctx->stat_lock);
-               if (fail_cor)
-                       btrfs_dev_stat_inc_and_print(sector->dev,
-                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
-               else
-                       btrfs_dev_stat_inc_and_print(sector->dev,
-                               BTRFS_DEV_STAT_GENERATION_ERRS);
-       }
-
        return fail_cor + fail_gen;
 }
 
-static void scrub_block_get(struct scrub_block *sblock)
-{
-       refcount_inc(&sblock->refs);
-}
-
 static void scrub_block_put(struct scrub_block *sblock)
 {
        if (refcount_dec_and_test(&sblock->refs)) {
@@ -1936,6 +2115,12 @@ static void scrub_block_put(struct scrub_block *sblock)
 
                for (i = 0; i < sblock->sector_count; i++)
                        scrub_sector_put(sblock->sectors[i]);
+               for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) {
+                       if (sblock->pages[i]) {
+                               detach_scrub_page_private(sblock->pages[i]);
+                               __free_page(sblock->pages[i]);
+                       }
+               }
                kfree(sblock);
        }
 }
@@ -1947,11 +2132,8 @@ static void scrub_sector_get(struct scrub_sector *sector)
 
 static void scrub_sector_put(struct scrub_sector *sector)
 {
-       if (atomic_dec_and_test(&sector->refs)) {
-               if (sector->page)
-                       __free_page(sector->page);
+       if (atomic_dec_and_test(&sector->refs))
                kfree(sector);
-       }
 }
 
 /*
@@ -2056,9 +2238,9 @@ again:
        }
        sbio = sctx->bios[sctx->curr];
        if (sbio->sector_count == 0) {
-               sbio->physical = sector->physical;
-               sbio->logical = sector->logical;
-               sbio->dev = sector->dev;
+               sbio->physical = sblock->physical + sector->offset;
+               sbio->logical = sblock->logical + sector->offset;
+               sbio->dev = sblock->dev;
                if (!sbio->bio) {
                        sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
                                              REQ_OP_READ, GFP_NOFS);
@@ -2068,16 +2250,16 @@ again:
                sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
                sbio->status = 0;
        } else if (sbio->physical + sbio->sector_count * sectorsize !=
-                  sector->physical ||
+                  sblock->physical + sector->offset ||
                   sbio->logical + sbio->sector_count * sectorsize !=
-                  sector->logical ||
-                  sbio->dev != sector->dev) {
+                  sblock->logical + sector->offset ||
+                  sbio->dev != sblock->dev) {
                scrub_submit(sctx);
                goto again;
        }
 
        sbio->sectors[sbio->sector_count] = sector;
-       ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
+       ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize);
        if (ret != sectorsize) {
                if (sbio->sector_count < 1) {
                        bio_put(sbio->bio);
@@ -2102,6 +2284,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
        struct scrub_block *sblock = bio->bi_private;
        struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
 
+       btrfs_bio_counter_dec(fs_info);
        if (bio->bi_status)
                sblock->no_io_error_seen = 0;
 
@@ -2118,8 +2301,8 @@ static void scrub_missing_raid56_worker(struct work_struct *work)
        u64 logical;
        struct btrfs_device *dev;
 
-       logical = sblock->sectors[0]->logical;
-       dev = sblock->sectors[0]->dev;
+       logical = sblock->logical;
+       dev = sblock->dev;
 
        if (sblock->no_io_error_seen)
                scrub_recheck_block_checksum(sblock);
@@ -2157,7 +2340,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
        struct scrub_ctx *sctx = sblock->sctx;
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        u64 length = sblock->sector_count << fs_info->sectorsize_bits;
-       u64 logical = sblock->sectors[0]->logical;
+       u64 logical = sblock->logical;
        struct btrfs_io_context *bioc = NULL;
        struct bio *bio;
        struct btrfs_raid_bio *rbio;
@@ -2193,17 +2376,16 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
        for (i = 0; i < sblock->sector_count; i++) {
                struct scrub_sector *sector = sblock->sectors[i];
 
-               /*
-                * For now, our scrub is still one page per sector, so pgoff
-                * is always 0.
-                */
-               raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
+               raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector),
+                                      scrub_sector_get_page_offset(sector),
+                                      sector->offset + sector->sblock->logical);
        }
 
        INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
        scrub_block_get(sblock);
        scrub_pending_bio_inc(sctx);
        raid56_submit_missing_rbio(rbio);
+       btrfs_put_bioc(bioc);
        return;
 
 rbio_out:
@@ -2225,7 +2407,8 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
        const u32 sectorsize = sctx->fs_info->sectorsize;
        int index;
 
-       sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+       sblock = alloc_scrub_block(sctx, dev, logical, physical,
+                                  physical_for_dev_replace, mirror_num);
        if (!sblock) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
@@ -2233,12 +2416,6 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
                return -ENOMEM;
        }
 
-       /* one ref inside this function, plus one for each page added to
-        * a bio later on */
-       refcount_set(&sblock->refs, 1);
-       sblock->sctx = sctx;
-       sblock->no_io_error_seen = 1;
-
        for (index = 0; len > 0; index++) {
                struct scrub_sector *sector;
                /*
@@ -2248,36 +2425,22 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
                 */
                u32 l = min(sectorsize, len);
 
-               sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+               sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
                if (!sector) {
-leave_nomem:
                        spin_lock(&sctx->stat_lock);
                        sctx->stat.malloc_errors++;
                        spin_unlock(&sctx->stat_lock);
                        scrub_block_put(sblock);
                        return -ENOMEM;
                }
-               ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
-               scrub_sector_get(sector);
-               sblock->sectors[index] = sector;
-               sector->sblock = sblock;
-               sector->dev = dev;
                sector->flags = flags;
                sector->generation = gen;
-               sector->logical = logical;
-               sector->physical = physical;
-               sector->physical_for_dev_replace = physical_for_dev_replace;
-               sector->mirror_num = mirror_num;
                if (csum) {
                        sector->have_csum = 1;
                        memcpy(sector->csum, csum, sctx->fs_info->csum_size);
                } else {
                        sector->have_csum = 0;
                }
-               sblock->sector_count++;
-               sector->page = alloc_page(GFP_KERNEL);
-               if (!sector->page)
-                       goto leave_nomem;
                len -= l;
                logical += l;
                physical += l;
@@ -2423,8 +2586,9 @@ static void scrub_block_complete(struct scrub_block *sblock)
        }
 
        if (sblock->sparity && corrupted && !sblock->data_corrected) {
-               u64 start = sblock->sectors[0]->logical;
-               u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
+               u64 start = sblock->logical;
+               u64 end = sblock->logical +
+                         sblock->sectors[sblock->sector_count - 1]->offset +
                          sblock->sctx->fs_info->sectorsize;
 
                ASSERT(end - start <= U32_MAX);
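
The hunks above follow the new addressing scheme used throughout this series: the per-sector logical/physical/dev fields are gone, and addresses are derived from the scrub_block base plus a small per-sector offset (sblock->logical + sector->offset). Below is a minimal userspace sketch of that scheme; the struct names mirror the patch, but the field set and helpers are simplified assumptions, not the kernel code.

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for the kernel structures touched by this patch. */
struct scrub_sector {
        uint32_t offset;                /* offset inside the parent block */
};

struct scrub_block {
        uint64_t logical;               /* base logical address of the block */
        uint64_t physical;              /* base physical address on the device */
        struct scrub_sector sectors[16];
        int sector_count;
};

/* Per-sector addresses are now derived from the block base plus the offset. */
static uint64_t sector_logical(const struct scrub_block *sblock, int i)
{
        return sblock->logical + sblock->sectors[i].offset;
}

static uint64_t sector_physical(const struct scrub_block *sblock, int i)
{
        return sblock->physical + sblock->sectors[i].offset;
}

int main(void)
{
        struct scrub_block sblock = { .logical = 1 << 20, .physical = 8 << 20 };

        for (int i = 0; i < 4; i++) {
                sblock.sectors[i].offset = i * 4096;
                sblock.sector_count++;
        }
        for (int i = 0; i < sblock.sector_count; i++)
                printf("sector %d: logical=%llu physical=%llu\n", i,
                       (unsigned long long)sector_logical(&sblock, i),
                       (unsigned long long)sector_physical(&sblock, i));
        return 0;
}
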
@@ -2508,11 +2672,17 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
        u8 csum[BTRFS_CSUM_SIZE];
        u32 blocksize;
 
+       /*
+        * Block size determines how many scrub_blocks will be allocated.  Here
+        * we use BTRFS_STRIPE_LEN (64KiB) as the default limit, so we won't
+        * allocate too many scrub_blocks while still avoiding overly large
+        * bios for large extents.
+        */
        if (flags & BTRFS_EXTENT_FLAG_DATA) {
                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
                        blocksize = map->stripe_len;
                else
-                       blocksize = sctx->fs_info->sectorsize;
+                       blocksize = BTRFS_STRIPE_LEN;
                spin_lock(&sctx->stat_lock);
                sctx->stat.data_extents_scrubbed++;
                sctx->stat.data_bytes_scrubbed += len;
@@ -2578,7 +2748,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
 
        ASSERT(IS_ALIGNED(len, sectorsize));
 
-       sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
+       sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num);
        if (!sblock) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
@@ -2586,51 +2756,32 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
                return -ENOMEM;
        }
 
-       /* one ref inside this function, plus one for each page added to
-        * a bio later on */
-       refcount_set(&sblock->refs, 1);
-       sblock->sctx = sctx;
-       sblock->no_io_error_seen = 1;
        sblock->sparity = sparity;
        scrub_parity_get(sparity);
 
        for (index = 0; len > 0; index++) {
                struct scrub_sector *sector;
 
-               sector = kzalloc(sizeof(*sector), GFP_KERNEL);
+               sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
                if (!sector) {
-leave_nomem:
                        spin_lock(&sctx->stat_lock);
                        sctx->stat.malloc_errors++;
                        spin_unlock(&sctx->stat_lock);
                        scrub_block_put(sblock);
                        return -ENOMEM;
                }
-               ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
-               /* For scrub block */
-               scrub_sector_get(sector);
                sblock->sectors[index] = sector;
                /* For scrub parity */
                scrub_sector_get(sector);
                list_add_tail(&sector->list, &sparity->sectors_list);
-               sector->sblock = sblock;
-               sector->dev = dev;
                sector->flags = flags;
                sector->generation = gen;
-               sector->logical = logical;
-               sector->physical = physical;
-               sector->mirror_num = mirror_num;
                if (csum) {
                        sector->have_csum = 1;
                        memcpy(sector->csum, csum, sctx->fs_info->csum_size);
                } else {
                        sector->have_csum = 0;
                }
-               sblock->sector_count++;
-               sector->page = alloc_page(GFP_KERNEL);
-               if (!sector->page)
-                       goto leave_nomem;
-
 
                /* Iterate over the stripe range in sectorsize steps */
                len -= sectorsize;
@@ -2774,6 +2925,7 @@ static void scrub_parity_bio_endio_worker(struct work_struct *work)
                                                    work);
        struct scrub_ctx *sctx = sparity->sctx;
 
+       btrfs_bio_counter_dec(sctx->fs_info);
        scrub_free_parity(sparity);
        scrub_pending_bio_dec(sctx);
 }
@@ -2824,6 +2976,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
                                              sparity->scrub_dev,
                                              &sparity->dbitmap,
                                              sparity->nsectors);
+       btrfs_put_bioc(bioc);
        if (!rbio)
                goto rbio_out;
 
@@ -2835,7 +2988,6 @@ rbio_out:
        bio_put(bio);
 bioc_out:
        btrfs_bio_counter_dec(fs_info);
-       btrfs_put_bioc(bioc);
        bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
                  sparity->nsectors);
        spin_lock(&sctx->stat_lock);
@@ -3077,7 +3229,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
 
                ret = btrfs_lookup_csums_range(csum_root, extent_start,
                                               extent_start + extent_size - 1,
-                                              &sctx->csum_list, 1);
+                                              &sctx->csum_list, 1, false);
                if (ret) {
                        scrub_parity_mark_sectors_error(sparity, extent_start,
                                                        extent_size);
@@ -3266,7 +3418,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
                }
                /* Block group removed? */
                spin_lock(&bg->lock);
-               if (bg->removed) {
+               if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) {
                        spin_unlock(&bg->lock);
                        ret = 0;
                        break;
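
This hunk, and several later ones in scrub.c, replace dedicated bool members on the block group (bg->removed, cache->to_copy) with bits in a runtime_flags word tested via test_bit(). A hedged userspace sketch of the same pattern follows; the flag names come from the patch, while the non-atomic bit helpers are simplified stand-ins for the kernel's atomic bitops.

#include <stdio.h>

/* Flag bits, following the names used in the patch. */
enum {
        BLOCK_GROUP_FLAG_REMOVED,
        BLOCK_GROUP_FLAG_TO_COPY,
};

struct block_group {
        unsigned long runtime_flags;    /* replaces separate bool members */
};

/* Simplified, non-atomic stand-ins for the kernel's set_bit()/test_bit(). */
static void set_bit(int nr, unsigned long *addr)
{
        *addr |= 1UL << nr;
}

static int test_bit(int nr, const unsigned long *addr)
{
        return (*addr >> nr) & 1UL;
}

int main(void)
{
        struct block_group bg = { 0 };

        set_bit(BLOCK_GROUP_FLAG_REMOVED, &bg.runtime_flags);

        /* Equivalent of the old "if (bg->removed)" check. */
        if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg.runtime_flags))
                printf("block group was removed, skipping\n");
        return 0;
}
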
@@ -3303,7 +3455,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
                if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
                        ret = btrfs_lookup_csums_range(csum_root, cur_logical,
                                        cur_logical + scrub_len - 1,
-                                       &sctx->csum_list, 1);
+                                       &sctx->csum_list, 1, false);
                        if (ret)
                                break;
                }
@@ -3606,7 +3758,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
                 * kthread or relocation.
                 */
                spin_lock(&bg->lock);
-               if (!bg->removed)
+               if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags))
                        ret = -EINVAL;
                spin_unlock(&bg->lock);
 
@@ -3764,13 +3916,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                }
 
                if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
-                       spin_lock(&cache->lock);
-                       if (!cache->to_copy) {
+                       if (!test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags)) {
                                spin_unlock(&cache->lock);
                                btrfs_put_block_group(cache);
                                goto skip;
                        }
-                       spin_unlock(&cache->lock);
                }
 
                /*
@@ -3782,7 +3932,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                 * repair extents.
                 */
                spin_lock(&cache->lock);
-               if (cache->removed) {
+               if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
                        spin_unlock(&cache->lock);
                        btrfs_put_block_group(cache);
                        goto skip;
@@ -3942,8 +4092,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
                 * balance is triggered or it becomes used and unused again.
                 */
                spin_lock(&cache->lock);
-               if (!cache->removed && !cache->ro && cache->reserved == 0 &&
-                   cache->used == 0) {
+               if (!test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags) &&
+                   !cache->ro && cache->reserved == 0 && cache->used == 0) {
                        spin_unlock(&cache->lock);
                        if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
                                btrfs_discard_queue_work(&fs_info->discard_ctl,
@@ -4102,36 +4252,21 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
        int ret;
        struct btrfs_device *dev;
        unsigned int nofs_flag;
+       bool need_commit = false;
 
        if (btrfs_fs_closing(fs_info))
                return -EAGAIN;
 
-       if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
-               /*
-                * in this case scrub is unable to calculate the checksum
-                * the way scrub is implemented. Do not handle this
-                * situation at all because it won't ever happen.
-                */
-               btrfs_err(fs_info,
-                          "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
-                      fs_info->nodesize,
-                      BTRFS_STRIPE_LEN);
-               return -EINVAL;
-       }
+       /* At mount time we have ensured nodesize is in the range of [4K, 64K]. */
+       ASSERT(fs_info->nodesize <= BTRFS_STRIPE_LEN);
 
-       if (fs_info->nodesize >
-           SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
-           fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
-               /*
-                * Would exhaust the array bounds of sectorv member in
-                * struct scrub_block
-                */
-               btrfs_err(fs_info,
-"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
-                      fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
-                      fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
-               return -EINVAL;
-       }
+       /*
+        * SCRUB_MAX_SECTORS_PER_BLOCK is calculated using the largest possible
+        * value (max nodesize / min sectorsize), thus nodesize should always
+        * be fine.
+        */
+       ASSERT(fs_info->nodesize <=
+              SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits);
 
        /* Allocate outside of device_list_mutex */
        sctx = scrub_setup_ctx(fs_info, is_dev_replace);
@@ -4205,6 +4340,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
         */
        nofs_flag = memalloc_nofs_save();
        if (!is_dev_replace) {
+               u64 old_super_errors;
+
+               spin_lock(&sctx->stat_lock);
+               old_super_errors = sctx->stat.super_errors;
+               spin_unlock(&sctx->stat_lock);
+
                btrfs_info(fs_info, "scrub: started on devid %llu", devid);
                /*
                 * by holding device list mutex, we can
@@ -4213,6 +4354,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                mutex_lock(&fs_info->fs_devices->device_list_mutex);
                ret = scrub_supers(sctx, dev);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+               spin_lock(&sctx->stat_lock);
+               /*
+        * Super block errors were found, but we cannot commit a transaction
+        * in the current context, since btrfs_commit_transaction() needs
+        * to pause the currently running scrub (which we hold ourselves).
+                */
+               if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+                       need_commit = true;
+               spin_unlock(&sctx->stat_lock);
        }
 
        if (!ret)
@@ -4239,6 +4390,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
        scrub_workers_put(fs_info);
        scrub_put_ctx(sctx);
 
+       /*
+        * We found super block errors earlier; now that scrub has finished,
+        * try to force a transaction commit to repair them.
+        */
+       if (need_commit) {
+               struct btrfs_trans_handle *trans;
+
+               trans = btrfs_start_transaction(fs_info->tree_root, 0);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       btrfs_err(fs_info,
+       "scrub: failed to start transaction to fix super block errors: %d", ret);
+                       return ret;
+               }
+               ret = btrfs_commit_transaction(trans);
+               if (ret < 0)
+                       btrfs_err(fs_info,
+       "scrub: failed to commit transaction to fix super block errors: %d", ret);
+       }
        return ret;
 out:
        scrub_workers_put(fs_info);
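
The btrfs_scrub_dev() changes above snapshot sctx->stat.super_errors before scrubbing the super blocks and, if new errors showed up on a writable filesystem, set need_commit so the transaction commit happens only after the scrub context is released (committing earlier would need to pause the scrub we ourselves are running). The sketch below shows the same "snapshot a counter under a lock, act after teardown" flow; the stat structure and commit stub are illustrative assumptions.

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

/* Minimal stand-in for the scrub statistics protected by a spinlock. */
struct scrub_stat {
        pthread_mutex_t lock;
        unsigned long long super_errors;
};

/* Hypothetical stubs standing in for scrub_supers() and the commit path. */
static void scrub_supers(struct scrub_stat *stat)
{
        pthread_mutex_lock(&stat->lock);
        stat->super_errors++;           /* pretend one bad copy was found */
        pthread_mutex_unlock(&stat->lock);
}

static void commit_transaction(void)
{
        printf("committing transaction to rewrite super blocks\n");
}

int main(void)
{
        struct scrub_stat stat = { PTHREAD_MUTEX_INITIALIZER, 0 };
        unsigned long long old_super_errors;
        bool need_commit = false;

        pthread_mutex_lock(&stat.lock);
        old_super_errors = stat.super_errors;   /* snapshot before scrubbing */
        pthread_mutex_unlock(&stat.lock);

        scrub_supers(&stat);

        pthread_mutex_lock(&stat.lock);
        if (stat.super_errors > old_super_errors)
                need_commit = true;             /* defer, cannot commit here */
        pthread_mutex_unlock(&stat.lock);

        /* ... scrub finishes and releases its context ... */
        if (need_commit)
                commit_transaction();
        return 0;
}
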
index e7671af..4ef4167 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/string.h>
 #include <linux/compat.h>
 #include <linux/crc32c.h>
+#include <linux/fsverity.h>
 
 #include "send.h"
 #include "ctree.h"
@@ -127,6 +128,8 @@ struct send_ctx {
        bool cur_inode_new_gen;
        bool cur_inode_deleted;
        bool ignore_cur_inode;
+       bool cur_inode_needs_verity;
+       void *verity_descriptor;
 
        u64 send_progress;
 
@@ -624,6 +627,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
                return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));      \
        }
 
+TLV_PUT_DEFINE_INT(8)
 TLV_PUT_DEFINE_INT(32)
 TLV_PUT_DEFINE_INT(64)
 
@@ -842,17 +846,32 @@ out:
        return ret;
 }
 
+struct btrfs_inode_info {
+       u64 size;
+       u64 gen;
+       u64 mode;
+       u64 uid;
+       u64 gid;
+       u64 rdev;
+       u64 fileattr;
+       u64 nlink;
+};
+
 /*
  * Helper function to retrieve some fields from an inode item.
  */
-static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
-                         u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
-                         u64 *gid, u64 *rdev, u64 *fileattr)
+static int get_inode_info(struct btrfs_root *root, u64 ino,
+                         struct btrfs_inode_info *info)
 {
        int ret;
+       struct btrfs_path *path;
        struct btrfs_inode_item *ii;
        struct btrfs_key key;
 
+       path = alloc_path_for_send();
+       if (!path)
+               return -ENOMEM;
+
        key.objectid = ino;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
@@ -860,47 +879,43 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
-               return ret;
+               goto out;
        }
 
+       if (!info)
+               goto out;
+
        ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
                        struct btrfs_inode_item);
-       if (size)
-               *size = btrfs_inode_size(path->nodes[0], ii);
-       if (gen)
-               *gen = btrfs_inode_generation(path->nodes[0], ii);
-       if (mode)
-               *mode = btrfs_inode_mode(path->nodes[0], ii);
-       if (uid)
-               *uid = btrfs_inode_uid(path->nodes[0], ii);
-       if (gid)
-               *gid = btrfs_inode_gid(path->nodes[0], ii);
-       if (rdev)
-               *rdev = btrfs_inode_rdev(path->nodes[0], ii);
+       info->size = btrfs_inode_size(path->nodes[0], ii);
+       info->gen = btrfs_inode_generation(path->nodes[0], ii);
+       info->mode = btrfs_inode_mode(path->nodes[0], ii);
+       info->uid = btrfs_inode_uid(path->nodes[0], ii);
+       info->gid = btrfs_inode_gid(path->nodes[0], ii);
+       info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
+       info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
        /*
         * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
         * otherwise logically split to 32/32 parts.
         */
-       if (fileattr)
-               *fileattr = btrfs_inode_flags(path->nodes[0], ii);
+       info->fileattr = btrfs_inode_flags(path->nodes[0], ii);
 
+out:
+       btrfs_free_path(path);
        return ret;
 }
 
-static int get_inode_info(struct btrfs_root *root,
-                         u64 ino, u64 *size, u64 *gen,
-                         u64 *mode, u64 *uid, u64 *gid,
-                         u64 *rdev, u64 *fileattr)
+static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
 {
-       struct btrfs_path *path;
        int ret;
+       struct btrfs_inode_info info;
 
-       path = alloc_path_for_send();
-       if (!path)
-               return -ENOMEM;
-       ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
-                              rdev, fileattr);
-       btrfs_free_path(path);
+       if (!gen)
+               return -EPERM;
+
+       ret = get_inode_info(root, ino, &info);
+       if (!ret)
+               *gen = info.gen;
        return ret;
 }
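
The send.c refactor above replaces a getter that took seven optional output pointers with a single struct btrfs_inode_info filled in by get_inode_info(), plus a thin get_inode_gen() wrapper for the common "generation only" case. A small sketch of the same shape, using hypothetical lookup data, is below.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

/* Mirrors the shape of struct btrfs_inode_info from the patch (trimmed). */
struct inode_info {
        uint64_t size;
        uint64_t gen;
        uint64_t mode;
        uint64_t nlink;
};

/* Hypothetical lookup: a real implementation would search the fs tree. */
static int get_inode_info(uint64_t ino, struct inode_info *info)
{
        if (ino != 257)
                return -ENOENT;
        if (!info)              /* caller only wanted an existence check */
                return 0;
        info->size = 4096;
        info->gen = 42;
        info->mode = 0100644;
        info->nlink = 1;
        return 0;
}

/* Thin wrapper for the common "I only need the generation" case. */
static int get_inode_gen(uint64_t ino, uint64_t *gen)
{
        struct inode_info info;
        int ret = get_inode_info(ino, &info);

        if (!ret)
                *gen = info.gen;
        return ret;
}

int main(void)
{
        uint64_t gen;

        if (!get_inode_gen(257, &gen))
                printf("inode 257 generation: %llu\n", (unsigned long long)gen);
        return 0;
}
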
 
@@ -1643,21 +1658,22 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
        int right_ret;
        u64 left_gen;
        u64 right_gen;
+       struct btrfs_inode_info info;
 
-       ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
-                       NULL, NULL, NULL);
+       ret = get_inode_info(sctx->send_root, ino, &info);
        if (ret < 0 && ret != -ENOENT)
                goto out;
-       left_ret = ret;
+       left_ret = (info.nlink == 0) ? -ENOENT : ret;
+       left_gen = info.gen;
 
        if (!sctx->parent_root) {
                right_ret = -ENOENT;
        } else {
-               ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
-                               NULL, NULL, NULL, NULL, NULL);
+               ret = get_inode_info(sctx->parent_root, ino, &info);
                if (ret < 0 && ret != -ENOENT)
                        goto out;
-               right_ret = ret;
+               right_ret = (info.nlink == 0) ? -ENOENT : ret;
+               right_gen = info.gen;
        }
 
        if (!left_ret && !right_ret) {
@@ -1816,8 +1832,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
        btrfs_release_path(path);
 
        if (dir_gen) {
-               ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
-                                    NULL, NULL, NULL, NULL);
+               ret = get_inode_gen(root, parent_dir, dir_gen);
                if (ret < 0)
                        goto out;
        }
@@ -1874,6 +1889,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
        int ret = 0;
        u64 gen;
        u64 other_inode = 0;
+       struct btrfs_inode_info info;
 
        if (!sctx->parent_root)
                goto out;
@@ -1888,8 +1904,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
         * and we can just unlink this entry.
         */
        if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
-               ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
-                                    NULL, NULL, NULL, NULL);
+               ret = get_inode_gen(sctx->parent_root, dir, &gen);
                if (ret < 0 && ret != -ENOENT)
                        goto out;
                if (ret) {
@@ -1916,13 +1931,14 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
         */
        if (other_inode > sctx->send_progress ||
            is_waiting_for_move(sctx, other_inode)) {
-               ret = get_inode_info(sctx->parent_root, other_inode, NULL,
-                               who_gen, who_mode, NULL, NULL, NULL, NULL);
+               ret = get_inode_info(sctx->parent_root, other_inode, &info);
                if (ret < 0)
                        goto out;
 
                ret = 1;
                *who_ino = other_inode;
+               *who_gen = info.gen;
+               *who_mode = info.mode;
        } else {
                ret = 0;
        }
@@ -1955,8 +1971,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
                goto out;
 
        if (dir != BTRFS_FIRST_FREE_OBJECTID) {
-               ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
-                                    NULL, NULL, NULL, NULL);
+               ret = get_inode_gen(sctx->send_root, dir, &gen);
                if (ret < 0 && ret != -ENOENT)
                        goto out;
                if (ret) {
@@ -1978,8 +1993,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
                goto out;
        }
 
-       ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
-                       NULL, NULL, NULL);
+       ret = get_inode_gen(sctx->send_root, ow_inode, &gen);
        if (ret < 0)
                goto out;
 
@@ -2645,6 +2659,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
        int ret = 0;
        struct fs_path *p;
        int cmd;
+       struct btrfs_inode_info info;
        u64 gen;
        u64 mode;
        u64 rdev;
@@ -2656,10 +2671,12 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
                return -ENOMEM;
 
        if (ino != sctx->cur_ino) {
-               ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
-                                    NULL, NULL, &rdev, NULL);
+               ret = get_inode_info(sctx->send_root, ino, &info);
                if (ret < 0)
                        goto out;
+               gen = info.gen;
+               mode = info.mode;
+               rdev = info.rdev;
        } else {
                gen = sctx->cur_inode_gen;
                mode = sctx->cur_inode_mode;
@@ -3359,8 +3376,7 @@ finish:
                /*
                 * The parent inode might have been deleted in the send snapshot
                 */
-               ret = get_inode_info(sctx->send_root, cur->dir, NULL,
-                                    NULL, NULL, NULL, NULL, NULL, NULL);
+               ret = get_inode_info(sctx->send_root, cur->dir, NULL);
                if (ret == -ENOENT) {
                        ret = 0;
                        continue;
@@ -3534,12 +3550,10 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
                goto out;
        }
 
-       ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
-                            &left_gen, NULL, NULL, NULL, NULL, NULL);
+       ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
        if (ret < 0)
                goto out;
-       ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
-                            &right_gen, NULL, NULL, NULL, NULL, NULL);
+       ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
        if (ret < 0) {
                if (ret == -ENOENT)
                        ret = 0;
@@ -3669,8 +3683,7 @@ static int is_ancestor(struct btrfs_root *root,
                                cur_offset = item_size;
                        }
 
-                       ret = get_inode_info(root, parent, NULL, &parent_gen,
-                                            NULL, NULL, NULL, NULL, NULL);
+                       ret = get_inode_gen(root, parent, &parent_gen);
                        if (ret < 0)
                                goto out;
                        ret = check_ino_in_path(root, ino1, ino1_gen,
@@ -3760,9 +3773,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
                     memcmp(path_before->start, path_after->start, len1))) {
                        u64 parent_ino_gen;
 
-                       ret = get_inode_info(sctx->parent_root, ino, NULL,
-                                            &parent_ino_gen, NULL, NULL, NULL,
-                                            NULL, NULL);
+                       ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
                        if (ret < 0)
                                goto out;
                        if (ino_gen == parent_ino_gen) {
@@ -4441,8 +4452,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index,
        struct recorded_ref *ref;
        u64 dir_gen;
 
-       ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
-                            NULL, NULL, NULL, NULL);
+       ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
        if (ret < 0)
                goto out;
 
@@ -4472,8 +4482,7 @@ static int record_deleted_ref_if_needed(int num, u64 dir, int index,
        struct recorded_ref *ref;
        u64 dir_gen;
 
-       ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
-                            NULL, NULL, NULL, NULL);
+       ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
        if (ret < 0)
                goto out;
 
@@ -4886,6 +4895,84 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
        return ret;
 }
 
+static int send_verity(struct send_ctx *sctx, struct fs_path *path,
+                      struct fsverity_descriptor *desc)
+{
+       int ret;
+
+       ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
+       if (ret < 0)
+               goto out;
+
+       TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+       TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
+                       le8_to_cpu(desc->hash_algorithm));
+       TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
+                       1U << le8_to_cpu(desc->log_blocksize));
+       TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
+                       le8_to_cpu(desc->salt_size));
+       TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
+                       le32_to_cpu(desc->sig_size));
+
+       ret = send_cmd(sctx);
+
+tlv_put_failure:
+out:
+       return ret;
+}
+
+static int process_verity(struct send_ctx *sctx)
+{
+       int ret = 0;
+       struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+       struct inode *inode;
+       struct fs_path *p;
+
+       inode = btrfs_iget(fs_info->sb, sctx->cur_ino, sctx->send_root);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+       if (ret < 0)
+               goto iput;
+
+       if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
+               ret = -EMSGSIZE;
+               goto iput;
+       }
+       if (!sctx->verity_descriptor) {
+               sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
+                                                  GFP_KERNEL);
+               if (!sctx->verity_descriptor) {
+                       ret = -ENOMEM;
+                       goto iput;
+               }
+       }
+
+       ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+       if (ret < 0)
+               goto iput;
+
+       p = fs_path_alloc();
+       if (!p) {
+               ret = -ENOMEM;
+               goto iput;
+       }
+       ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+       if (ret < 0)
+               goto free_path;
+
+       ret = send_verity(sctx, p, sctx->verity_descriptor);
+       if (ret < 0)
+               goto free_path;
+
+free_path:
+       fs_path_free(p);
+iput:
+       iput(inode);
+       return ret;
+}
+
 static inline u64 max_send_read_size(const struct send_ctx *sctx)
 {
        return sctx->send_max_size - SZ_16K;
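
process_verity() above uses a two-call pattern: btrfs_get_verity_descriptor() is first invoked with a NULL buffer to learn the descriptor size, a buffer up to FS_VERITY_MAX_DESCRIPTOR_SIZE is allocated and cached in the send context, and the call is repeated to fetch the data. The userspace sketch below reproduces that pattern against a hypothetical get_descriptor() helper.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define MAX_DESCRIPTOR_SIZE 256 /* illustrative cap, not the fs-verity limit */

/* Hypothetical helper: with buf == NULL it only reports the required size. */
static int get_descriptor(void *buf, size_t buf_size)
{
        static const char desc[] = "fake-verity-descriptor";

        if (!buf)
                return (int)sizeof(desc);
        if (buf_size < sizeof(desc))
                return -ERANGE;
        memcpy(buf, desc, sizeof(desc));
        return (int)sizeof(desc);
}

int main(void)
{
        int size = get_descriptor(NULL, 0);     /* step 1: query the size */
        void *buf;

        if (size < 0 || size > MAX_DESCRIPTOR_SIZE)
                return 1;

        buf = malloc(MAX_DESCRIPTOR_SIZE);      /* step 2: allocate once */
        if (!buf)
                return 1;

        size = get_descriptor(buf, size);       /* step 3: fetch the data */
        if (size > 0)
                printf("descriptor (%d bytes): %s\n", size, (char *)buf);
        free(buf);
        return 0;
}
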
@@ -5056,8 +5143,7 @@ static int send_clone(struct send_ctx *sctx,
        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 
        if (clone_root->root == sctx->send_root) {
-               ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
-                               &gen, NULL, NULL, NULL, NULL, NULL);
+               ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
                if (ret < 0)
                        goto out;
                ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -5536,6 +5622,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;
+       struct btrfs_inode_info info;
        u64 clone_src_i_size = 0;
 
        /*
@@ -5565,12 +5652,11 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
         * There are inodes that have extents that lie behind its i_size. Don't
         * accept clones from these extents.
         */
-       ret = __get_inode_info(clone_root->root, path, clone_root->ino,
-                              &clone_src_i_size, NULL, NULL, NULL, NULL, NULL,
-                              NULL);
+       ret = get_inode_info(clone_root->root, clone_root->ino, &info);
        btrfs_release_path(path);
        if (ret < 0)
                goto out;
+       clone_src_i_size = info.size;
 
        /*
         * We can't send a clone operation for the entire range if we find
@@ -6259,6 +6345,7 @@ out:
 static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 {
        int ret = 0;
+       struct btrfs_inode_info info;
        u64 left_mode;
        u64 left_uid;
        u64 left_gid;
@@ -6301,11 +6388,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
                goto out;
        if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
                goto out;
-
-       ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
-                       &left_mode, &left_uid, &left_gid, NULL, &left_fileattr);
+       ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
        if (ret < 0)
                goto out;
+       left_mode = info.mode;
+       left_uid = info.uid;
+       left_gid = info.gid;
+       left_fileattr = info.fileattr;
 
        if (!sctx->parent_root || sctx->cur_inode_new) {
                need_chown = 1;
@@ -6316,11 +6405,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
        } else {
                u64 old_size;
 
-               ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
-                               &old_size, NULL, &right_mode, &right_uid,
-                               &right_gid, NULL, &right_fileattr);
+               ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
                if (ret < 0)
                        goto out;
+               old_size = info.size;
+               right_mode = info.mode;
+               right_uid = info.uid;
+               right_gid = info.gid;
+               right_fileattr = info.fileattr;
 
                if (left_uid != right_uid || left_gid != right_gid)
                        need_chown = 1;
@@ -6377,6 +6469,11 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
                if (ret < 0)
                        goto out;
        }
+       if (sctx->cur_inode_needs_verity) {
+               ret = process_verity(sctx);
+               if (ret < 0)
+                       goto out;
+       }
 
        ret = send_capabilities(sctx);
        if (ret < 0)
@@ -6407,86 +6504,6 @@ out:
        return ret;
 }
 
-struct parent_paths_ctx {
-       struct list_head *refs;
-       struct send_ctx *sctx;
-};
-
-static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
-                            void *ctx)
-{
-       struct parent_paths_ctx *ppctx = ctx;
-
-       /*
-        * Pass 0 as the generation for the directory, we don't care about it
-        * here as we have no new references to add, we just want to delete all
-        * references for an inode.
-        */
-       return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs,
-                                 name, dir, 0, ppctx->sctx);
-}
-
-/*
- * Issue unlink operations for all paths of the current inode found in the
- * parent snapshot.
- */
-static int btrfs_unlink_all_paths(struct send_ctx *sctx)
-{
-       LIST_HEAD(deleted_refs);
-       struct btrfs_path *path;
-       struct btrfs_root *root = sctx->parent_root;
-       struct btrfs_key key;
-       struct btrfs_key found_key;
-       struct parent_paths_ctx ctx;
-       int iter_ret = 0;
-       int ret;
-
-       path = alloc_path_for_send();
-       if (!path)
-               return -ENOMEM;
-
-       key.objectid = sctx->cur_ino;
-       key.type = BTRFS_INODE_REF_KEY;
-       key.offset = 0;
-
-       ctx.refs = &deleted_refs;
-       ctx.sctx = sctx;
-
-       btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
-               if (found_key.objectid != key.objectid)
-                       break;
-               if (found_key.type != key.type &&
-                   found_key.type != BTRFS_INODE_EXTREF_KEY)
-                       break;
-
-               ret = iterate_inode_ref(root, path, &found_key, 1,
-                                       record_parent_ref, &ctx);
-               if (ret < 0)
-                       goto out;
-       }
-       /* Catch error found during iteration */
-       if (iter_ret < 0) {
-               ret = iter_ret;
-               goto out;
-       }
-
-       while (!list_empty(&deleted_refs)) {
-               struct recorded_ref *ref;
-
-               ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
-               ret = send_unlink(sctx, ref->full_path);
-               if (ret < 0)
-                       goto out;
-               recorded_ref_free(ref);
-       }
-       ret = 0;
-out:
-       btrfs_free_path(path);
-       if (ret)
-               __free_recorded_refs(&deleted_refs);
-       return ret;
-}
-
 static void close_current_inode(struct send_ctx *sctx)
 {
        u64 i_size;
@@ -6577,25 +6594,36 @@ static int changed_inode(struct send_ctx *sctx,
         * file descriptor against it or turning a RO snapshot into RW mode,
         * keep an open file descriptor against a file, delete it and then
         * turn the snapshot back to RO mode before using it for a send
-        * operation. So if we find such cases, ignore the inode and all its
-        * items completely if it's a new inode, or if it's a changed inode
-        * make sure all its previous paths (from the parent snapshot) are all
-        * unlinked and all other the inode items are ignored.
+        * operation. The former is what the receiver operation does.
+        * Therefore, if we want to send these snapshots soon after they're
+        * received, we need to handle orphan inodes as well. Moreover, orphans
+        * can appear not only in the send snapshot but also in the parent
+        * snapshot. Here are several cases:
+        *
+        * Case 1: BTRFS_COMPARE_TREE_NEW
+        *       |  send snapshot  | action
+        * --------------------------------
+        * nlink |        0        | ignore
+        *
+        * Case 2: BTRFS_COMPARE_TREE_DELETED
+        *       | parent snapshot | action
+        * ----------------------------------
+        * nlink |        0        | as usual
+        * Note: No unlinks will be sent because there are no paths for it.
+        *
+        * Case 3: BTRFS_COMPARE_TREE_CHANGED
+        *           |       | parent snapshot | send snapshot | action
+        * -----------------------------------------------------------------------
+        * subcase 1 | nlink |        0        |       0       | ignore
+        * subcase 2 | nlink |       >0        |       0       | new_gen(deletion)
+        * subcase 3 | nlink |        0        |      >0       | new_gen(creation)
+        *
         */
-       if (result == BTRFS_COMPARE_TREE_NEW ||
-           result == BTRFS_COMPARE_TREE_CHANGED) {
-               u32 nlinks;
-
-               nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
-               if (nlinks == 0) {
+       if (result == BTRFS_COMPARE_TREE_NEW) {
+               if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
                        sctx->ignore_cur_inode = true;
-                       if (result == BTRFS_COMPARE_TREE_CHANGED)
-                               ret = btrfs_unlink_all_paths(sctx);
                        goto out;
                }
-       }
-
-       if (result == BTRFS_COMPARE_TREE_NEW) {
                sctx->cur_inode_gen = left_gen;
                sctx->cur_inode_new = true;
                sctx->cur_inode_deleted = false;
@@ -6616,6 +6644,16 @@ static int changed_inode(struct send_ctx *sctx,
                sctx->cur_inode_mode = btrfs_inode_mode(
                                sctx->right_path->nodes[0], right_ii);
        } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+               u32 new_nlinks, old_nlinks;
+
+               new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
+               old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
+               if (new_nlinks == 0 && old_nlinks == 0) {
+                       sctx->ignore_cur_inode = true;
+                       goto out;
+               } else if (new_nlinks == 0 || old_nlinks == 0) {
+                       sctx->cur_inode_new_gen = 1;
+               }
                /*
                 * We need to do some special handling in case the inode was
                 * reported as changed with a changed generation number. This
@@ -6642,38 +6680,44 @@ static int changed_inode(struct send_ctx *sctx,
                        /*
                         * Now process the inode as if it was new.
                         */
-                       sctx->cur_inode_gen = left_gen;
-                       sctx->cur_inode_new = true;
-                       sctx->cur_inode_deleted = false;
-                       sctx->cur_inode_size = btrfs_inode_size(
-                                       sctx->left_path->nodes[0], left_ii);
-                       sctx->cur_inode_mode = btrfs_inode_mode(
-                                       sctx->left_path->nodes[0], left_ii);
-                       sctx->cur_inode_rdev = btrfs_inode_rdev(
-                                       sctx->left_path->nodes[0], left_ii);
-                       ret = send_create_inode_if_needed(sctx);
-                       if (ret < 0)
-                               goto out;
+                       if (new_nlinks > 0) {
+                               sctx->cur_inode_gen = left_gen;
+                               sctx->cur_inode_new = true;
+                               sctx->cur_inode_deleted = false;
+                               sctx->cur_inode_size = btrfs_inode_size(
+                                               sctx->left_path->nodes[0],
+                                               left_ii);
+                               sctx->cur_inode_mode = btrfs_inode_mode(
+                                               sctx->left_path->nodes[0],
+                                               left_ii);
+                               sctx->cur_inode_rdev = btrfs_inode_rdev(
+                                               sctx->left_path->nodes[0],
+                                               left_ii);
+                               ret = send_create_inode_if_needed(sctx);
+                               if (ret < 0)
+                                       goto out;
 
-                       ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
-                       if (ret < 0)
-                               goto out;
-                       /*
-                        * Advance send_progress now as we did not get into
-                        * process_recorded_refs_if_needed in the new_gen case.
-                        */
-                       sctx->send_progress = sctx->cur_ino + 1;
+                               ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
+                               if (ret < 0)
+                                       goto out;
+                               /*
+                                * Advance send_progress now as we did not get
+                                * into process_recorded_refs_if_needed in the
+                                * new_gen case.
+                                */
+                               sctx->send_progress = sctx->cur_ino + 1;
 
-                       /*
-                        * Now process all extents and xattrs of the inode as if
-                        * they were all new.
-                        */
-                       ret = process_all_extents(sctx);
-                       if (ret < 0)
-                               goto out;
-                       ret = process_all_new_xattrs(sctx);
-                       if (ret < 0)
-                               goto out;
+                               /*
+                                * Now process all extents and xattrs of the
+                                * inode as if they were all new.
+                                */
+                               ret = process_all_extents(sctx);
+                               if (ret < 0)
+                                       goto out;
+                               ret = process_all_new_xattrs(sctx);
+                               if (ret < 0)
+                                       goto out;
+                       }
                } else {
                        sctx->cur_inode_gen = left_gen;
                        sctx->cur_inode_new = false;
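
The orphan-handling table in the changed_inode() comment above reduces to a small decision on the old and new link counts: both zero means ignore the inode, exactly one zero means treat it as a generation change (a deletion or a creation), and anything else is processed as usual. A compact sketch of that table, with illustrative names, follows.

#include <stdio.h>

enum orphan_action {
        ACTION_IGNORE,          /* orphan on both sides, nothing to send */
        ACTION_NEW_GEN,         /* treat as deletion or creation */
        ACTION_AS_USUAL,        /* regular changed-inode processing */
};

/* Decision table from the changed_inode() comment, for the CHANGED case. */
static enum orphan_action classify_changed_inode(unsigned int old_nlinks,
                                                 unsigned int new_nlinks)
{
        if (new_nlinks == 0 && old_nlinks == 0)
                return ACTION_IGNORE;
        if (new_nlinks == 0 || old_nlinks == 0)
                return ACTION_NEW_GEN;
        return ACTION_AS_USUAL;
}

int main(void)
{
        static const char *names[] = { "ignore", "new_gen", "as usual" };
        unsigned int cases[][2] = { {0, 0}, {1, 0}, {0, 1}, {1, 1} };

        for (unsigned int i = 0; i < 4; i++)
                printf("old=%u new=%u -> %s\n", cases[i][0], cases[i][1],
                       names[classify_changed_inode(cases[i][0], cases[i][1])]);
        return 0;
}
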
@@ -6785,18 +6829,27 @@ static int changed_extent(struct send_ctx *sctx,
        return ret;
 }
 
+static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
+{
+       int ret = 0;
+
+       if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
+               if (result == BTRFS_COMPARE_TREE_NEW)
+                       sctx->cur_inode_needs_verity = true;
+       }
+       return ret;
+}
+
 static int dir_changed(struct send_ctx *sctx, u64 dir)
 {
        u64 orig_gen, new_gen;
        int ret;
 
-       ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
-                            NULL, NULL, NULL);
+       ret = get_inode_gen(sctx->send_root, dir, &new_gen);
        if (ret)
                return ret;
 
-       ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
-                            NULL, NULL, NULL, NULL);
+       ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
        if (ret)
                return ret;
 
@@ -6939,6 +6992,9 @@ static int changed_cb(struct btrfs_path *left_path,
                        ret = changed_xattr(sctx, result);
                else if (key->type == BTRFS_EXTENT_DATA_KEY)
                        ret = changed_extent(sctx, result);
+               else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
+                        key->offset == 0)
+                       ret = changed_verity(sctx, result);
        }
 
 out:
@@ -8036,6 +8092,7 @@ out:
                kvfree(sctx->clone_roots);
                kfree(sctx->send_buf_pages);
                kvfree(sctx->send_buf);
+               kvfree(sctx->verity_descriptor);
 
                name_cache_free(sctx);
 
index 4bb4e6a..0a45377 100644 (file)
@@ -92,8 +92,11 @@ enum btrfs_send_cmd {
        BTRFS_SEND_C_ENCODED_WRITE      = 25,
        BTRFS_SEND_C_MAX_V2             = 25,
 
+       /* Version 3 */
+       BTRFS_SEND_C_ENABLE_VERITY      = 26,
+       BTRFS_SEND_C_MAX_V3             = 26,
        /* End */
-       BTRFS_SEND_C_MAX                = 25,
+       BTRFS_SEND_C_MAX                = 26,
 };
 
 /* attributes in send stream */
@@ -160,8 +163,14 @@ enum {
        BTRFS_SEND_A_ENCRYPTION         = 31,
        BTRFS_SEND_A_MAX_V2             = 31,
 
-       /* End */
-       BTRFS_SEND_A_MAX                = 31,
+       /* Version 3 */
+       BTRFS_SEND_A_VERITY_ALGORITHM   = 32,
+       BTRFS_SEND_A_VERITY_BLOCK_SIZE  = 33,
+       BTRFS_SEND_A_VERITY_SALT_DATA   = 34,
+       BTRFS_SEND_A_VERITY_SIG_DATA    = 35,
+       BTRFS_SEND_A_MAX_V3             = 35,
+
+       __BTRFS_SEND_A_MAX              = 35,
 };
 
 long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
index 435559b..f171bf8 100644 (file)
@@ -293,32 +293,36 @@ out:
        return ret;
 }
 
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
-                            u64 total_bytes, u64 bytes_used,
-                            u64 bytes_readonly, u64 bytes_zone_unusable,
-                            bool active, struct btrfs_space_info **space_info)
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+                               struct btrfs_block_group *block_group)
 {
        struct btrfs_space_info *found;
-       int factor;
+       int factor, index;
 
-       factor = btrfs_bg_type_to_factor(flags);
+       factor = btrfs_bg_type_to_factor(block_group->flags);
 
-       found = btrfs_find_space_info(info, flags);
+       found = btrfs_find_space_info(info, block_group->flags);
        ASSERT(found);
        spin_lock(&found->lock);
-       found->total_bytes += total_bytes;
-       if (active)
-               found->active_total_bytes += total_bytes;
-       found->disk_total += total_bytes * factor;
-       found->bytes_used += bytes_used;
-       found->disk_used += bytes_used * factor;
-       found->bytes_readonly += bytes_readonly;
-       found->bytes_zone_unusable += bytes_zone_unusable;
-       if (total_bytes > 0)
+       found->total_bytes += block_group->length;
+       if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+               found->active_total_bytes += block_group->length;
+       found->disk_total += block_group->length * factor;
+       found->bytes_used += block_group->used;
+       found->disk_used += block_group->used * factor;
+       found->bytes_readonly += block_group->bytes_super;
+       found->bytes_zone_unusable += block_group->zone_unusable;
+       if (block_group->length > 0)
                found->full = 0;
        btrfs_try_granting_tickets(info, found);
        spin_unlock(&found->lock);
-       *space_info = found;
+
+       block_group->space_info = found;
+
+       index = btrfs_bg_flags_to_raid_index(block_group->flags);
+       down_write(&found->groups_sem);
+       list_add_tail(&block_group->list, &found->block_groups[index]);
+       up_write(&found->groups_sem);
 }
 
 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -472,28 +476,47 @@ do {                                                                      \
        spin_unlock(&__rsv->lock);                                      \
 } while (0)
 
+static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
+{
+       switch (space_info->flags) {
+       case BTRFS_BLOCK_GROUP_SYSTEM:
+               return "SYSTEM";
+       case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
+               return "DATA+METADATA";
+       case BTRFS_BLOCK_GROUP_DATA:
+               return "DATA";
+       case BTRFS_BLOCK_GROUP_METADATA:
+               return "METADATA";
+       default:
+               return "UNKNOWN";
+       }
+}
+
+static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+       DUMP_BLOCK_RSV(fs_info, global_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
+       DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
+}
+
 static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *info)
 {
+       const char *flag_str = space_info_flag_to_str(info);
        lockdep_assert_held(&info->lock);
 
        /* The free space could be negative in case of overcommit */
-       btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
-                  info->flags,
+       btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
+                  flag_str,
                   (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
                   info->full ? "" : "not ");
        btrfs_info(fs_info,
-               "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
+"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
                info->total_bytes, info->bytes_used, info->bytes_pinned,
                info->bytes_reserved, info->bytes_may_use,
                info->bytes_readonly, info->bytes_zone_unusable);
-
-       DUMP_BLOCK_RSV(fs_info, global_block_rsv);
-       DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
-       DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
-       DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
-       DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
-
 }
 
 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
@@ -505,6 +528,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
 
        spin_lock(&info->lock);
        __btrfs_dump_space_info(fs_info, info);
+       dump_global_block_rsv(fs_info);
        spin_unlock(&info->lock);
 
        if (!dump_block_groups)
@@ -1662,7 +1686,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
                                      &space_info->priority_tickets);
                }
        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
-               used += orig_bytes;
                /*
                 * We will do the space reservation dance during log replay,
                 * which means we won't have fs_info->fs_root set, so don't do
@@ -1737,7 +1760,8 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
        int ret;
 
        ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
-              flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+              flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
+              flush == BTRFS_RESERVE_NO_FLUSH);
        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
 
        ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
@@ -1749,3 +1773,17 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
        }
        return ret;
 }
+
+/* Dump all the space infos when we abort a transaction due to ENOSPC. */
+__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_space_info *space_info;
+
+       btrfs_info(fs_info, "dumping space info:");
+       list_for_each_entry(space_info, &fs_info->space_info, list) {
+               spin_lock(&space_info->lock);
+               __btrfs_dump_space_info(fs_info, space_info);
+               spin_unlock(&space_info->lock);
+       }
+       dump_global_block_rsv(fs_info);
+}
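
space_info_flag_to_str(), added above, switches on the raw block group type flags, including the mixed DATA+METADATA combination, so the dump messages print a readable name instead of a bare flags value. The sketch below reproduces that mapping with illustrative flag constants (the real values live in the btrfs UAPI headers).

#include <stdio.h>

/* Illustrative flag values; the real ones come from the btrfs UAPI headers. */
#define BLOCK_GROUP_DATA        (1ULL << 0)
#define BLOCK_GROUP_SYSTEM      (1ULL << 1)
#define BLOCK_GROUP_METADATA    (1ULL << 2)

static const char *flags_to_str(unsigned long long flags)
{
        switch (flags) {
        case BLOCK_GROUP_SYSTEM:
                return "SYSTEM";
        case BLOCK_GROUP_METADATA | BLOCK_GROUP_DATA:
                return "DATA+METADATA";         /* mixed block groups */
        case BLOCK_GROUP_DATA:
                return "DATA";
        case BLOCK_GROUP_METADATA:
                return "METADATA";
        default:
                return "UNKNOWN";
        }
}

int main(void)
{
        printf("%s\n", flags_to_str(BLOCK_GROUP_METADATA | BLOCK_GROUP_DATA));
        printf("%s\n", flags_to_str(BLOCK_GROUP_SYSTEM));
        return 0;
}
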
index 12fd614..ce66023 100644 (file)
@@ -123,10 +123,8 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info");
 DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned");
 
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
-void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
-                            u64 total_bytes, u64 bytes_used,
-                            u64 bytes_readonly, u64 bytes_zone_unusable,
-                            bool active, struct btrfs_space_info **space_info);
+void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+                               struct btrfs_block_group *block_group);
 void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
                                        u64 chunk_size);
 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -159,4 +157,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
 }
 int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
                             enum btrfs_reserve_flush_enum flush);
+void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
+
 #endif /* BTRFS_SPACE_INFO_H */
index f89beac..9be4fd2 100644 (file)
@@ -346,12 +346,14 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
 __cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                               const char *function,
-                              unsigned int line, int errno)
+                              unsigned int line, int errno, bool first_hit)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
 
        WRITE_ONCE(trans->aborted, errno);
        WRITE_ONCE(trans->transaction->aborted, errno);
+       if (first_hit && errno == -ENOSPC)
+               btrfs_dump_space_info_for_trans_abort(fs_info);
        /* Wake up anybody who may be waiting on this transaction */
        wake_up(&fs_info->transaction_wait);
        wake_up(&fs_info->transaction_blocked_wait);
@@ -626,6 +628,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
        int saved_compress_level;
        bool saved_compress_force;
        int no_compress = 0;
+       const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
 
        if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
                btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
@@ -1137,10 +1140,12 @@ out:
        }
        if (!ret)
                ret = btrfs_check_mountopts_zoned(info);
-       if (!ret && btrfs_test_opt(info, SPACE_CACHE))
-               btrfs_info(info, "disk space caching is enabled");
-       if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
-               btrfs_info(info, "using free space tree");
+       if (!ret && !remounting) {
+               if (btrfs_test_opt(info, SPACE_CACHE))
+                       btrfs_info(info, "disk space caching is enabled");
+               if (btrfs_test_opt(info, FREE_SPACE_TREE))
+                       btrfs_info(info, "using free space tree");
+       }
        return ret;
 }
 
@@ -2009,14 +2014,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        if (ret)
                goto restore;
 
-       /* V1 cache is not supported for subpage mount. */
-       if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
-               btrfs_warn(fs_info,
-       "v1 space cache is not supported for page size %lu with sectorsize %u",
-                          PAGE_SIZE, fs_info->sectorsize);
-               ret = -EINVAL;
+       ret = btrfs_check_features(fs_info, sb);
+       if (ret < 0)
                goto restore;
-       }
+
        btrfs_remount_begin(fs_info, old_opts, *flags);
        btrfs_resize_thread_pool(fs_info,
                fs_info->thread_pool_size, old_thread_pool_size);
@@ -2550,11 +2551,71 @@ static int btrfs_freeze(struct super_block *sb)
        return btrfs_commit_transaction(trans);
 }
 
+static int check_dev_super(struct btrfs_device *dev)
+{
+       struct btrfs_fs_info *fs_info = dev->fs_info;
+       struct btrfs_super_block *sb;
+       int ret = 0;
+
+       /* This should be called with fs still frozen. */
+       ASSERT(test_bit(BTRFS_FS_FROZEN, &fs_info->flags));
+
+       /* Missing dev, no need to check. */
+       if (!dev->bdev)
+               return 0;
+
+       /* Only need to check the primary super block. */
+       sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+       if (IS_ERR(sb))
+               return PTR_ERR(sb);
+
+       /* Btrfs_validate_super() includes fsid check against super->fsid. */
+       ret = btrfs_validate_super(fs_info, sb, 0);
+       if (ret < 0)
+               goto out;
+
+       if (btrfs_super_generation(sb) != fs_info->last_trans_committed) {
+               btrfs_err(fs_info, "transid mismatch, has %llu expect %llu",
+                       btrfs_super_generation(sb),
+                       fs_info->last_trans_committed);
+               ret = -EUCLEAN;
+               goto out;
+       }
+out:
+       btrfs_release_disk_super(sb);
+       return ret;
+}
+
 static int btrfs_unfreeze(struct super_block *sb)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_device *device;
+       int ret = 0;
 
+       /*
+        * Make sure the fs was not changed by accident (e.g. hibernated and
+        * then modified by another OS). If anything looks wrong, mark the fs
+        * with an error immediately.
+        *
+        * And since the fs is still frozen, no one can modify it yet, thus
+        * we don't need to hold device_list_mutex.
+        */
+       list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+               ret = check_dev_super(device);
+               if (ret < 0) {
+                       btrfs_handle_fs_error(fs_info, ret,
+                               "super block on devid %llu got modified unexpectedly",
+                               device->devid);
+                       break;
+               }
+       }
        clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
+
+       /*
+        * We still return 0 to allow the VFS layer to unfreeze the fs even if
+        * the above checks failed. Since the fs is either fine or read-only,
+        * we're safe to continue without causing further damage.
+        */
        return 0;
 }
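
The check runs while the filesystem is still frozen, so the generic freeze/thaw ioctls are one way to exercise it: on thaw the primary super block of each device is re-read and validated before the frozen flag is cleared. A minimal sketch, assuming a btrfs filesystem mounted at a placeholder path and CAP_SYS_ADMIN:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        const char *mnt = "/mnt/btrfs";         /* placeholder mount point */
        int fd = open(mnt, O_RDONLY | O_DIRECTORY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, FIFREEZE, 0) < 0)         /* filesystem is now frozen */
                perror("FIFREEZE");
        /* Thawing triggers the super block verification added above. */
        if (ioctl(fd, FITHAW, 0) < 0)
                perror("FITHAW");
        close(fd);
        return 0;
}
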
 
@@ -2662,17 +2723,21 @@ static int __init init_btrfs_fs(void)
        if (err)
                goto free_compress;
 
-       err = extent_io_init();
+       err = extent_state_init_cachep();
        if (err)
                goto free_cachep;
 
-       err = extent_state_cache_init();
+       err = extent_buffer_init_cachep();
+       if (err)
+               goto free_extent_cachep;
+
+       err = btrfs_bioset_init();
        if (err)
-               goto free_extent_io;
+               goto free_eb_cachep;
 
        err = extent_map_init();
        if (err)
-               goto free_extent_state_cache;
+               goto free_bioset;
 
        err = ordered_data_init();
        if (err)
@@ -2724,10 +2789,12 @@ free_ordered_data:
        ordered_data_exit();
 free_extent_map:
        extent_map_exit();
-free_extent_state_cache:
-       extent_state_cache_exit();
-free_extent_io:
-       extent_io_exit();
+free_bioset:
+       btrfs_bioset_exit();
+free_eb_cachep:
+       extent_buffer_free_cachep();
+free_extent_cachep:
+       extent_state_free_cachep();
 free_cachep:
        btrfs_destroy_cachep();
 free_compress:
@@ -2746,8 +2813,9 @@ static void __exit exit_btrfs_fs(void)
        btrfs_prelim_ref_exit();
        ordered_data_exit();
        extent_map_exit();
-       extent_state_cache_exit();
-       extent_io_exit();
+       btrfs_bioset_exit();
+       extent_state_free_cachep();
+       extent_buffer_free_cachep();
        btrfs_interface_exit();
        unregister_filesystem(&btrfs_fs_type);
        btrfs_exit_sysfs();
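
The reshuffled labels above follow the usual goto-unwind convention: each failure path tears down only what has already been initialized, in reverse order, and module exit mirrors the same reverse order. A generic sketch of the pattern (step names are illustrative):

#include <stdio.h>

static int init_state(void)    { return 0; }
static int init_buffers(void)  { return 0; }
static int init_bioset(void)   { return -1; }   /* pretend the last step fails */
static void free_buffers(void) { puts("free buffers"); }
static void free_state(void)   { puts("free state"); }

static int init_all(void)
{
        int err;

        err = init_state();
        if (err)
                goto out;
        err = init_buffers();
        if (err)
                goto undo_state;
        err = init_bioset();
        if (err)
                goto undo_buffers;
        return 0;

undo_buffers:
        free_buffers();
undo_state:
        free_state();
out:
        return err;
}

int main(void)
{
        return init_all() ? 1 : 0;
}
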
index d5d0717..699b54b 100644 (file)
  * qgroup_attrs                                /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>
  * space_info_attrs                    /sys/fs/btrfs/<uuid>/allocation/<bg-type>
  * raid_attrs                          /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>
+ * discard_attrs                       /sys/fs/btrfs/<uuid>/discard
  *
  * When built with BTRFS_CONFIG_DEBUG:
  *
  * btrfs_debug_feature_attrs           /sys/fs/btrfs/debug
  * btrfs_debug_mount_attrs             /sys/fs/btrfs/<uuid>/debug
- * discard_debug_attrs                 /sys/fs/btrfs/<uuid>/debug/discard
  */
 
 struct btrfs_feature_attr {
@@ -286,6 +286,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
 BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
 BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
 BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
 BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
 #ifdef CONFIG_BLK_DEV_ZONED
 BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
@@ -317,6 +318,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
        BTRFS_FEAT_ATTR_PTR(metadata_uuid),
        BTRFS_FEAT_ATTR_PTR(free_space_tree),
        BTRFS_FEAT_ATTR_PTR(raid1c34),
+       BTRFS_FEAT_ATTR_PTR(block_group_tree),
 #ifdef CONFIG_BLK_DEV_ZONED
        BTRFS_FEAT_ATTR_PTR(zoned),
 #endif
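
Whether the running kernel advertises the new block-group-tree feature can be checked from userspace; a small sketch assuming the standard global /sys/fs/btrfs/features directory:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/fs/btrfs/features/block_group_tree";

        if (access(path, F_OK) == 0)
                puts("kernel supports block-group-tree");
        else
                puts("block-group-tree not advertised by this kernel");
        return 0;
}
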
@@ -429,12 +431,10 @@ static const struct attribute_group btrfs_static_feature_attr_group = {
        .attrs = btrfs_supported_static_feature_attrs,
 };
 
-#ifdef CONFIG_BTRFS_DEBUG
-
 /*
  * Discard statistics and tunables
  */
-#define discard_to_fs_info(_kobj)      to_fs_info((_kobj)->parent->parent)
+#define discard_to_fs_info(_kobj)      to_fs_info(get_btrfs_kobj(_kobj))
 
 static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj,
                                            struct kobj_attribute *a,
@@ -583,11 +583,11 @@ BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show,
              btrfs_discard_max_discard_size_store);
 
 /*
- * Per-filesystem debugging of discard (when mounted with discard=async).
+ * Per-filesystem stats for discard (when mounted with discard=async).
  *
- * Path: /sys/fs/btrfs/<uuid>/debug/discard/
+ * Path: /sys/fs/btrfs/<uuid>/discard/
  */
-static const struct attribute *discard_debug_attrs[] = {
+static const struct attribute *discard_attrs[] = {
        BTRFS_ATTR_PTR(discard, discardable_bytes),
        BTRFS_ATTR_PTR(discard, discardable_extents),
        BTRFS_ATTR_PTR(discard, discard_bitmap_bytes),
@@ -599,6 +599,8 @@ static const struct attribute *discard_debug_attrs[] = {
        NULL,
 };
 
+#ifdef CONFIG_BTRFS_DEBUG
+
 /*
  * Per-filesystem runtime debugging exported via sysfs.
  *
@@ -837,11 +839,8 @@ static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
                                                     char *buf)
 {
        struct btrfs_space_info *space_info = to_space_info(kobj);
-       ssize_t ret;
 
-       ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
-
-       return ret;
+       return sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
 }
 
 static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -1150,25 +1149,6 @@ static ssize_t btrfs_generation_show(struct kobject *kobj,
 }
 BTRFS_ATTR(, generation, btrfs_generation_show);
 
-/*
- * Look for an exact string @string in @buffer with possible leading or
- * trailing whitespace
- */
-static bool strmatch(const char *buffer, const char *string)
-{
-       const size_t len = strlen(string);
-
-       /* Skip leading whitespace */
-       buffer = skip_spaces(buffer);
-
-       /* Match entire string, check if the rest is whitespace or empty */
-       if (strncmp(string, buffer, len) == 0 &&
-           strlen(skip_spaces(buffer + len)) == 0)
-               return true;
-
-       return false;
-}
-
 static const char * const btrfs_read_policy_name[] = { "pid" };
 
 static ssize_t btrfs_read_policy_show(struct kobject *kobj,
@@ -1202,7 +1182,7 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
        int i;
 
        for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
-               if (strmatch(buf, btrfs_read_policy_name[i])) {
+               if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
                        if (i != fs_devices->read_policy) {
                                fs_devices->read_policy = i;
                                btrfs_info(fs_devices->fs_info,
@@ -1222,11 +1202,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
                                               char *buf)
 {
        struct btrfs_fs_info *fs_info = to_fs_info(kobj);
-       ssize_t ret;
 
-       ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
-
-       return ret;
+       return sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold));
 }
 
 static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
@@ -1427,13 +1404,12 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
                kobject_del(fs_info->space_info_kobj);
                kobject_put(fs_info->space_info_kobj);
        }
-#ifdef CONFIG_BTRFS_DEBUG
-       if (fs_info->discard_debug_kobj) {
-               sysfs_remove_files(fs_info->discard_debug_kobj,
-                                  discard_debug_attrs);
-               kobject_del(fs_info->discard_debug_kobj);
-               kobject_put(fs_info->discard_debug_kobj);
+       if (fs_info->discard_kobj) {
+               sysfs_remove_files(fs_info->discard_kobj, discard_attrs);
+               kobject_del(fs_info->discard_kobj);
+               kobject_put(fs_info->discard_kobj);
        }
+#ifdef CONFIG_BTRFS_DEBUG
        if (fs_info->debug_kobj) {
                sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
                kobject_del(fs_info->debug_kobj);
@@ -2001,20 +1977,18 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
        error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
        if (error)
                goto failure;
+#endif
 
        /* Discard directory */
-       fs_info->discard_debug_kobj = kobject_create_and_add("discard",
-                                                    fs_info->debug_kobj);
-       if (!fs_info->discard_debug_kobj) {
+       fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
+       if (!fs_info->discard_kobj) {
                error = -ENOMEM;
                goto failure;
        }
 
-       error = sysfs_create_files(fs_info->discard_debug_kobj,
-                                  discard_debug_attrs);
+       error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
        if (error)
                goto failure;
-#endif
 
        error = addrm_unknown_feature_attrs(fs_info, true);
        if (error)
@@ -2041,6 +2015,98 @@ failure:
        return error;
 }
 
+static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
+                                  struct kobj_attribute *a,
+                                  char *buf)
+{
+       struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+       bool enabled;
+
+       spin_lock(&fs_info->qgroup_lock);
+       enabled = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON;
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return sysfs_emit(buf, "%d\n", enabled);
+}
+BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show);
+
+static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj,
+                                       struct kobj_attribute *a,
+                                       char *buf)
+{
+       struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+       bool inconsistent;
+
+       spin_lock(&fs_info->qgroup_lock);
+       inconsistent = (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT);
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return sysfs_emit(buf, "%d\n", inconsistent);
+}
+BTRFS_ATTR(qgroups, inconsistent, qgroup_inconsistent_show);
+
+static ssize_t qgroup_drop_subtree_thres_show(struct kobject *qgroups_kobj,
+                                             struct kobj_attribute *a,
+                                             char *buf)
+{
+       struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+       u8 result;
+
+       spin_lock(&fs_info->qgroup_lock);
+       result = fs_info->qgroup_drop_subtree_thres;
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return sysfs_emit(buf, "%d\n", result);
+}
+
+static ssize_t qgroup_drop_subtree_thres_store(struct kobject *qgroups_kobj,
+                                              struct kobj_attribute *a,
+                                              const char *buf, size_t len)
+{
+       struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent);
+       u8 new_thres;
+       int ret;
+
+       ret = kstrtou8(buf, 10, &new_thres);
+       if (ret)
+               return -EINVAL;
+
+       if (new_thres > BTRFS_MAX_LEVEL)
+               return -EINVAL;
+
+       spin_lock(&fs_info->qgroup_lock);
+       fs_info->qgroup_drop_subtree_thres = new_thres;
+       spin_unlock(&fs_info->qgroup_lock);
+
+       return len;
+}
+BTRFS_ATTR_RW(qgroups, drop_subtree_threshold, qgroup_drop_subtree_thres_show,
+             qgroup_drop_subtree_thres_store);
+
+/*
+ * Qgroups global info
+ *
+ * Path: /sys/fs/btrfs/<uuid>/qgroups/
+ */
+static struct attribute *qgroups_attrs[] = {
+       BTRFS_ATTR_PTR(qgroups, enabled),
+       BTRFS_ATTR_PTR(qgroups, inconsistent),
+       BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold),
+       NULL
+};
+ATTRIBUTE_GROUPS(qgroups);
+
+static void qgroups_release(struct kobject *kobj)
+{
+       kfree(kobj);
+}
+
+static struct kobj_type qgroups_ktype = {
+       .sysfs_ops = &kobj_sysfs_ops,
+       .default_groups = qgroups_groups,
+       .release = qgroups_release,
+};
+
 static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj)
 {
        return to_fs_info(kobj->parent->parent);
@@ -2166,11 +2232,15 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info)
        if (fs_info->qgroups_kobj)
                return 0;
 
-       fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj);
-       if (!fs_info->qgroups_kobj) {
-               ret = -ENOMEM;
+       fs_info->qgroups_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+       if (!fs_info->qgroups_kobj)
+               return -ENOMEM;
+
+       ret = kobject_init_and_add(fs_info->qgroups_kobj, &qgroups_ktype,
+                                  fsid_kobj, "qgroups");
+       if (ret < 0)
                goto out;
-       }
+
        rbtree_postorder_for_each_entry_safe(qgroup, next,
                                             &fs_info->qgroup_tree, node) {
                ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
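
Both new per-filesystem directories are plain sysfs attributes and can be read with ordinary file I/O. A minimal sketch printing the qgroup status files and two of the relocated discard counters (the UUID is a placeholder for the real filesystem UUID):

#include <stdio.h>

static void show(const char *uuid, const char *rel)
{
        char path[256];
        char buf[64];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/fs/btrfs/%s/%s", uuid, rel);
        f = fopen(path, "r");
        if (!f)
                return;                         /* attribute not present */
        if (fgets(buf, sizeof(buf), f))
                printf("%-32s %s", rel, buf);
        fclose(f);
}

int main(void)
{
        const char *uuid = "00000000-0000-0000-0000-000000000000";

        show(uuid, "qgroups/enabled");
        show(uuid, "qgroups/inconsistent");
        show(uuid, "qgroups/drop_subtree_threshold");
        show(uuid, "discard/discardable_bytes");
        show(uuid, "discard/discardable_extents");
        return 0;
}
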
index cc9377c..9c478fa 100644 (file)
@@ -243,7 +243,7 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
 {
        if (!cache)
                return;
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
        kfree(cache->free_space_ctl);
        kfree(cache);
 }
index a232b15..f69ec4d 100644 (file)
@@ -80,7 +80,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
        PRINT_ONE_FLAG(state, dest, cur, NODATASUM);
        PRINT_ONE_FLAG(state, dest, cur, CLEAR_META_RESV);
        PRINT_ONE_FLAG(state, dest, cur, NEED_WAIT);
-       PRINT_ONE_FLAG(state, dest, cur, DAMAGED);
        PRINT_ONE_FLAG(state, dest, cur, NORESERVE);
        PRINT_ONE_FLAG(state, dest, cur, QGROUP_RESERVED);
        PRINT_ONE_FLAG(state, dest, cur, CLEAR_DATA_RESV);
@@ -172,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize)
                        sectorsize - 1, start, end);
                goto out_bits;
        }
-       unlock_extent(tmp, start, end);
+       unlock_extent(tmp, start, end, NULL);
        unlock_page(locked_page);
        put_page(locked_page);
 
@@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize)
                test_err("there were unlocked pages in the range");
                goto out_bits;
        }
-       unlock_extent(tmp, start, end);
+       unlock_extent(tmp, start, end, NULL);
        /* locked_page was unlocked above */
        put_page(locked_page);
 
@@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize)
                test_err("pages in range were not all locked");
                goto out_bits;
        }
-       unlock_extent(tmp, start, end);
+       unlock_extent(tmp, start, end, NULL);
 
        /*
         * Now to test where we run into a page that is no longer dirty in the
index 5930cdc..ebf68fc 100644 (file)
@@ -82,7 +82,7 @@ static int test_extents(struct btrfs_block_group *cache)
        }
 
        /* Cleanup */
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
 
        return 0;
 }
@@ -149,7 +149,7 @@ static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
                return -1;
        }
 
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
 
        return 0;
 }
@@ -230,7 +230,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
                return -1;
        }
 
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
 
        /* Now with the extent entry offset into the bitmap */
        ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
         *      [ bitmap ]
         *        [ del ]
         */
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
        ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
        if (ret) {
                test_err("couldn't add bitmap %d", ret);
@@ -291,7 +291,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
                return -1;
        }
 
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
 
        /*
         * This blew up before, we have part of the free space in a bitmap and
@@ -317,7 +317,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
                return ret;
        }
 
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
        return 0;
 }
 
@@ -629,7 +629,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
        if (ret)
                return ret;
 
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
 
        /*
         * Now test a similar scenario, but where our extent entry is located
@@ -819,7 +819,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
                return ret;
 
        cache->free_space_ctl->op = orig_free_space_ops;
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
 
        return 0;
 }
@@ -868,7 +868,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
        }
 
        /* Now validate bitmaps do the correct thing. */
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
        for (i = 0; i < 2; i++) {
                offset = i * BITS_PER_BITMAP * sectorsize;
                bytes = (i + 1) * SZ_1M;
@@ -891,7 +891,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
        }
 
        /* Now validate bitmaps with different ->max_extent_size. */
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
        orig_free_space_ops = cache->free_space_ctl->op;
        cache->free_space_ctl->op = &test_free_space_ops;
 
@@ -998,7 +998,7 @@ static int test_bytes_index(struct btrfs_block_group *cache, u32 sectorsize)
        }
 
        cache->free_space_ctl->op = orig_free_space_ops;
-       __btrfs_remove_free_space_cache(cache->free_space_ctl);
+       btrfs_remove_free_space_cache(cache);
        return 0;
 }
 
index cac89c3..625f7d3 100644 (file)
@@ -267,7 +267,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
                goto out;
        }
        free_extent_map(em);
-       btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
+       btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
 
        /*
         * All of the magic numbers are based on the mapping setup in
@@ -975,7 +975,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
                               BTRFS_MAX_EXTENT_SIZE >> 1,
                               (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
                               EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
-                              EXTENT_UPTODATE, 0, 0, NULL);
+                              EXTENT_UPTODATE, NULL);
        if (ret) {
                test_err("clear_extent_bit returned %d", ret);
                goto out;
@@ -1043,7 +1043,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
                               BTRFS_MAX_EXTENT_SIZE + sectorsize,
                               BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
                               EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
-                              EXTENT_UPTODATE, 0, 0, NULL);
+                              EXTENT_UPTODATE, NULL);
        if (ret) {
                test_err("clear_extent_bit returned %d", ret);
                goto out;
@@ -1076,7 +1076,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
        /* Empty */
        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
                               EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
-                              EXTENT_UPTODATE, 0, 0, NULL);
+                              EXTENT_UPTODATE, NULL);
        if (ret) {
                test_err("clear_extent_bit returned %d", ret);
                goto out;
@@ -1092,7 +1092,7 @@ out:
        if (ret)
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
                                 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
-                                EXTENT_UPTODATE, 0, 0, NULL);
+                                EXTENT_UPTODATE, NULL);
        iput(inode);
        btrfs_free_dummy_root(root);
        btrfs_free_dummy_fs_info(fs_info);
index 0bec107..d1f1da6 100644 (file)
@@ -161,7 +161,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root, *tmp;
-       struct btrfs_caching_control *caching_ctl, *next;
 
        /*
         * At this point no one can be using this transaction to modify any tree
@@ -196,46 +195,6 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
        }
        spin_unlock(&cur_trans->dropped_roots_lock);
 
-       /*
-        * We have to update the last_byte_to_unpin under the commit_root_sem,
-        * at the same time we swap out the commit roots.
-        *
-        * This is because we must have a real view of the last spot the caching
-        * kthreads were while caching.  Consider the following views of the
-        * extent tree for a block group
-        *
-        * commit root
-        * +----+----+----+----+----+----+----+
-        * |\\\\|    |\\\\|\\\\|    |\\\\|\\\\|
-        * +----+----+----+----+----+----+----+
-        * 0    1    2    3    4    5    6    7
-        *
-        * new commit root
-        * +----+----+----+----+----+----+----+
-        * |    |    |    |\\\\|    |    |\\\\|
-        * +----+----+----+----+----+----+----+
-        * 0    1    2    3    4    5    6    7
-        *
-        * If the cache_ctl->progress was at 3, then we are only allowed to
-        * unpin [0,1) and [2,3], because the caching thread has already
-        * processed those extents.  We are not allowed to unpin [5,6), because
-        * the caching thread will re-start it's search from 3, and thus find
-        * the hole from [4,6) to add to the free space cache.
-        */
-       write_lock(&fs_info->block_group_cache_lock);
-       list_for_each_entry_safe(caching_ctl, next,
-                                &fs_info->caching_block_groups, list) {
-               struct btrfs_block_group *cache = caching_ctl->block_group;
-
-               if (btrfs_block_group_done(cache)) {
-                       cache->last_byte_to_unpin = (u64)-1;
-                       list_del_init(&caching_ctl->list);
-                       btrfs_put_caching_control(caching_ctl);
-               } else {
-                       cache->last_byte_to_unpin = caching_ctl->progress;
-               }
-       }
-       write_unlock(&fs_info->block_group_cache_lock);
        up_write(&fs_info->commit_root_sem);
 }
 
@@ -313,6 +272,8 @@ loop:
                atomic_inc(&cur_trans->num_writers);
                extwriter_counter_inc(cur_trans, type);
                spin_unlock(&fs_info->trans_lock);
+               btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+               btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
                return 0;
        }
        spin_unlock(&fs_info->trans_lock);
@@ -334,16 +295,23 @@ loop:
        if (!cur_trans)
                return -ENOMEM;
 
+       btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
+       btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
+
        spin_lock(&fs_info->trans_lock);
        if (fs_info->running_transaction) {
                /*
                 * someone started a transaction after we unlocked.  Make sure
                 * to redo the checks above
                 */
+               btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+               btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
                kfree(cur_trans);
                goto loop;
        } else if (BTRFS_FS_ERROR(fs_info)) {
                spin_unlock(&fs_info->trans_lock);
+               btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+               btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
                kfree(cur_trans);
                return -EROFS;
        }
@@ -397,7 +365,7 @@ loop:
        spin_lock_init(&cur_trans->releasing_ebs_lock);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
-                       IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
+                       IO_TREE_TRANS_DIRTY_PAGES, NULL);
        extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
                        IO_TREE_FS_PINNED_EXTENTS, NULL);
        fs_info->generation++;
@@ -541,6 +509,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
                refcount_inc(&cur_trans->use_count);
                spin_unlock(&fs_info->trans_lock);
 
+               btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
                wait_event(fs_info->transaction_wait,
                           cur_trans->state >= TRANS_STATE_UNBLOCKED ||
                           TRANS_ABORTED(cur_trans));
@@ -625,7 +594,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
                 */
                num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
                if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-                   delayed_refs_rsv->full == 0) {
+                   btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
                        delayed_refs_bytes = num_bytes;
                        num_bytes <<= 1;
                }
@@ -650,7 +619,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
                if (rsv->space_info->force_alloc)
                        do_chunk_alloc = true;
        } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
-                  !delayed_refs_rsv->full) {
+                  !btrfs_block_rsv_full(delayed_refs_rsv)) {
                /*
                 * Some people call with btrfs_start_transaction(root, 0)
                 * because they can be throttled, but have some other mechanism
@@ -859,6 +828,15 @@ static noinline void wait_for_commit(struct btrfs_transaction *commit,
        u64 transid = commit->transid;
        bool put = false;
 
+       /*
+        * At the moment this function is called with min_state either being
+        * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
+        */
+       if (min_state == TRANS_STATE_COMPLETED)
+               btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+       else
+               btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+
        while (1) {
                wait_event(commit->commit_wait, commit->state >= min_state);
                if (put)
@@ -1022,6 +1000,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        extwriter_counter_dec(cur_trans, trans->type);
 
        cond_wake_up(&cur_trans->writer_wait);
+
+       btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
+       btrfs_lockdep_release(info, btrfs_trans_num_writers);
+
        btrfs_put_transaction(cur_trans);
 
        if (current->journal_info == trans)
@@ -1134,7 +1116,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
                 * it's safe to do it (through extent_io_tree_release()).
                 */
                err = clear_extent_bit(dirty_pages, start, end,
-                                      EXTENT_NEED_WAIT, 0, 0, &cached_state);
+                                      EXTENT_NEED_WAIT, &cached_state);
                if (err == -ENOMEM)
                        err = 0;
                if (!err)
@@ -1912,14 +1894,6 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
                super->cache_generation = 0;
        if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
                super->uuid_tree_generation = root_item->generation;
-
-       if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
-               root_item = &fs_info->block_group_root->root_item;
-
-               super->block_group_root = root_item->bytenr;
-               super->block_group_root_generation = root_item->generation;
-               super->block_group_root_level = root_item->level;
-       }
 }
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -1967,6 +1941,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
         * Wait for the current transaction commit to start and block
         * subsequent transaction joins
         */
+       btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
        wait_event(fs_info->transaction_blocked_wait,
                   cur_trans->state >= TRANS_STATE_COMMIT_START ||
                   TRANS_ABORTED(cur_trans));
@@ -1994,6 +1969,12 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
        if (cur_trans == fs_info->running_transaction) {
                cur_trans->state = TRANS_STATE_COMMIT_DOING;
                spin_unlock(&fs_info->trans_lock);
+
+               /*
+                * The thread has already released the lockdep map as a reader
+                * in btrfs_commit_transaction().
+                */
+               btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
                wait_event(cur_trans->writer_wait,
                           atomic_read(&cur_trans->num_writers) == 1);
 
@@ -2118,12 +2099,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
        ktime_t interval;
 
        ASSERT(refcount_read(&trans->use_count) == 1);
+       btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
 
        /* Stop the commit early if ->aborted is set */
        if (TRANS_ABORTED(cur_trans)) {
                ret = cur_trans->aborted;
-               btrfs_end_transaction(trans);
-               return ret;
+               goto lockdep_trans_commit_start_release;
        }
 
        btrfs_trans_release_metadata(trans);
@@ -2140,10 +2121,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
                 * Any running threads may add more while we are here.
                 */
                ret = btrfs_run_delayed_refs(trans, 0);
-               if (ret) {
-                       btrfs_end_transaction(trans);
-                       return ret;
-               }
+               if (ret)
+                       goto lockdep_trans_commit_start_release;
        }
 
        btrfs_create_pending_block_groups(trans);
@@ -2172,10 +2151,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
                if (run_it) {
                        ret = btrfs_start_dirty_block_groups(trans);
-                       if (ret) {
-                               btrfs_end_transaction(trans);
-                               return ret;
-                       }
+                       if (ret)
+                               goto lockdep_trans_commit_start_release;
                }
        }
 
@@ -2190,6 +2167,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
                if (trans->in_fsync)
                        want_state = TRANS_STATE_SUPER_COMMITTED;
+
+               btrfs_trans_state_lockdep_release(fs_info,
+                                                 BTRFS_LOCKDEP_TRANS_COMMIT_START);
                ret = btrfs_end_transaction(trans);
                wait_for_commit(cur_trans, want_state);
 
@@ -2203,6 +2183,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
        cur_trans->state = TRANS_STATE_COMMIT_START;
        wake_up(&fs_info->transaction_blocked_wait);
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
 
        if (cur_trans->list.prev != &fs_info->trans_list) {
                enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
@@ -2222,7 +2203,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
                        btrfs_put_transaction(prev_trans);
                        if (ret)
-                               goto cleanup_transaction;
+                               goto lockdep_release;
                } else {
                        spin_unlock(&fs_info->trans_lock);
                }
@@ -2236,7 +2217,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
                 */
                if (BTRFS_FS_ERROR(fs_info)) {
                        ret = -EROFS;
-                       goto cleanup_transaction;
+                       goto lockdep_release;
                }
        }
 
@@ -2250,19 +2231,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
        ret = btrfs_start_delalloc_flush(fs_info);
        if (ret)
-               goto cleanup_transaction;
+               goto lockdep_release;
 
        ret = btrfs_run_delayed_items(trans);
        if (ret)
-               goto cleanup_transaction;
+               goto lockdep_release;
 
+       /*
+        * The thread has started/joined the transaction, thus it holds the
+        * lockdep map as a reader. It has to release it before acquiring the
+        * lockdep map as a writer.
+        */
+       btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+       btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
        wait_event(cur_trans->writer_wait,
                   extwriter_counter_read(cur_trans) == 0);
 
        /* some pending stuffs might be added after the previous flush. */
        ret = btrfs_run_delayed_items(trans);
-       if (ret)
+       if (ret) {
+               btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
                goto cleanup_transaction;
+       }
 
        btrfs_wait_delalloc_flush(fs_info);
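
The annotations in this area model transaction writers as readers of a lockdep map and the committer as the eventual writer, so the committer must drop its own reader hold before waiting for the rest to drain. A rough userspace analogue of that ordering discipline using a pthread rwlock (this illustrates the idea only; it is not the kernel lockdep API):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t trans_writers = PTHREAD_RWLOCK_INITIALIZER;

static void *writer_thread(void *arg)
{
        pthread_rwlock_rdlock(&trans_writers);  /* join the transaction */
        /* ... do transactional work ... */
        pthread_rwlock_unlock(&trans_writers);  /* leave the transaction */
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, writer_thread, NULL);

        /* The committer joined as a reader too ... */
        pthread_rwlock_rdlock(&trans_writers);
        /* ... and must release that hold first ... */
        pthread_rwlock_unlock(&trans_writers);
        /* ... or this write acquisition would deadlock against itself. */
        pthread_rwlock_wrlock(&trans_writers);
        puts("all transaction writers have finished");
        pthread_rwlock_unlock(&trans_writers);

        pthread_join(t, NULL);
        return 0;
}
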
 
@@ -2271,6 +2261,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
         * transaction. Otherwise if this transaction commits before the ordered
         * extents complete we lose logged data after a power failure.
         */
+       btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
        wait_event(cur_trans->pending_wait,
                   atomic_read(&cur_trans->pending_ordered) == 0);
 
@@ -2284,10 +2275,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
        add_pending_snapshot(trans);
        cur_trans->state = TRANS_STATE_COMMIT_DOING;
        spin_unlock(&fs_info->trans_lock);
+
+       /*
+        * The thread has started/joined the transaction, thus it holds the
+        * lockdep map as a reader. It has to release it before acquiring the
+        * lockdep map as a writer.
+        */
+       btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+       btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
        wait_event(cur_trans->writer_wait,
                   atomic_read(&cur_trans->num_writers) == 1);
 
        /*
+        * Make lockdep happy by acquiring the state locks after
+        * btrfs_trans_num_writers is released. If we acquired the state locks
+        * before releasing the btrfs_trans_num_writers lock then lockdep would
+        * complain because we did not follow the reverse order unlocking rule.
+        */
+       btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
+       btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+       btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
+
+       /*
         * We've started the commit, clear the flag in case we were triggered to
         * do an async commit but somebody else started before the transaction
         * kthread could do the work.
@@ -2296,6 +2305,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
        if (TRANS_ABORTED(cur_trans)) {
                ret = cur_trans->aborted;
+               btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
                goto scrub_continue;
        }
        /*
@@ -2430,6 +2440,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
        mutex_unlock(&fs_info->reloc_mutex);
 
        wake_up(&fs_info->transaction_wait);
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
 
        ret = btrfs_write_and_wait_transaction(trans);
        if (ret) {
@@ -2461,6 +2472,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
         */
        cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
        wake_up(&cur_trans->commit_wait);
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
 
        btrfs_finish_extent_commit(trans);
 
@@ -2474,6 +2486,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
         */
        cur_trans->state = TRANS_STATE_COMPLETED;
        wake_up(&cur_trans->commit_wait);
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
 
        spin_lock(&fs_info->trans_lock);
        list_del_init(&cur_trans->list);
@@ -2502,7 +2515,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 
 unlock_reloc:
        mutex_unlock(&fs_info->reloc_mutex);
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
 scrub_continue:
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
        btrfs_scrub_continue(fs_info);
 cleanup_transaction:
        btrfs_trans_release_metadata(trans);
@@ -2515,6 +2531,16 @@ cleanup_transaction:
        cleanup_transaction(trans, ret);
 
        return ret;
+
+lockdep_release:
+       btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
+       btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
+       goto cleanup_transaction;
+
+lockdep_trans_commit_start_release:
+       btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
+       btrfs_end_transaction(trans);
+       return ret;
 }
 
 /*
index 9205c4a..813986e 100644 (file)
@@ -22,6 +22,8 @@
 #include "zoned.h"
 #include "inode-item.h"
 
+#define MAX_CONFLICT_INODES 10
+
 /* magic values for the inode_only field in btrfs_log_inode:
  *
  * LOG_INODE_ALL means to log everything
@@ -31,8 +33,6 @@
 enum {
        LOG_INODE_ALL,
        LOG_INODE_EXISTS,
-       LOG_OTHER_INODE,
-       LOG_OTHER_INODE_ALL,
 };
 
 /*
@@ -801,7 +801,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
                        ret = btrfs_lookup_csums_range(root->log_root,
                                                csum_start, csum_end - 1,
-                                               &ordered_sums, 0);
+                                               &ordered_sums, 0, false);
                        if (ret)
                                goto out;
                        /*
@@ -1063,8 +1063,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_inode *dir,
                                  struct btrfs_inode *inode,
                                  u64 inode_objectid, u64 parent_objectid,
-                                 u64 ref_index, char *name, int namelen,
-                                 int *search_done)
+                                 u64 ref_index, char *name, int namelen)
 {
        int ret;
        char *victim_name;
@@ -1126,19 +1125,12 @@ again:
                                kfree(victim_name);
                                if (ret)
                                        return ret;
-                               *search_done = 1;
                                goto again;
                        }
                        kfree(victim_name);
 
                        ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
                }
-
-               /*
-                * NOTE: we have searched root tree and checked the
-                * corresponding ref, it does not need to check again.
-                */
-               *search_done = 1;
        }
        btrfs_release_path(path);
 
@@ -1202,14 +1194,12 @@ again:
                                kfree(victim_name);
                                if (ret)
                                        return ret;
-                               *search_done = 1;
                                goto again;
                        }
                        kfree(victim_name);
 next:
                        cur_offset += victim_name_len + sizeof(*extref);
                }
-               *search_done = 1;
        }
        btrfs_release_path(path);
 
@@ -1373,103 +1363,6 @@ again:
        return ret;
 }
 
-static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
-                                 const u8 ref_type, const char *name,
-                                 const int namelen)
-{
-       struct btrfs_key key;
-       struct btrfs_path *path;
-       const u64 parent_id = btrfs_ino(BTRFS_I(dir));
-       int ret;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       key.objectid = btrfs_ino(BTRFS_I(inode));
-       key.type = ref_type;
-       if (key.type == BTRFS_INODE_REF_KEY)
-               key.offset = parent_id;
-       else
-               key.offset = btrfs_extref_hash(parent_id, name, namelen);
-
-       ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
-       if (ret < 0)
-               goto out;
-       if (ret > 0) {
-               ret = 0;
-               goto out;
-       }
-       if (key.type == BTRFS_INODE_EXTREF_KEY)
-               ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
-                               path->slots[0], parent_id, name, namelen);
-       else
-               ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
-                                                  name, namelen);
-
-out:
-       btrfs_free_path(path);
-       return ret;
-}
-
-static int add_link(struct btrfs_trans_handle *trans,
-                   struct inode *dir, struct inode *inode, const char *name,
-                   int namelen, u64 ref_index)
-{
-       struct btrfs_root *root = BTRFS_I(dir)->root;
-       struct btrfs_dir_item *dir_item;
-       struct btrfs_key key;
-       struct btrfs_path *path;
-       struct inode *other_inode = NULL;
-       int ret;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       dir_item = btrfs_lookup_dir_item(NULL, root, path,
-                                        btrfs_ino(BTRFS_I(dir)),
-                                        name, namelen, 0);
-       if (!dir_item) {
-               btrfs_release_path(path);
-               goto add_link;
-       } else if (IS_ERR(dir_item)) {
-               ret = PTR_ERR(dir_item);
-               goto out;
-       }
-
-       /*
-        * Our inode's dentry collides with the dentry of another inode which is
-        * in the log but not yet processed since it has a higher inode number.
-        * So delete that other dentry.
-        */
-       btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
-       btrfs_release_path(path);
-       other_inode = read_one_inode(root, key.objectid);
-       if (!other_inode) {
-               ret = -ENOENT;
-               goto out;
-       }
-       ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
-                                         name, namelen);
-       if (ret)
-               goto out;
-       /*
-        * If we dropped the link count to 0, bump it so that later the iput()
-        * on the inode will not free it. We will fixup the link count later.
-        */
-       if (other_inode->i_nlink == 0)
-               set_nlink(other_inode, 1);
-add_link:
-       ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
-                            name, namelen, 0, ref_index);
-out:
-       iput(other_inode);
-       btrfs_free_path(path);
-
-       return ret;
-}
-
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -1490,7 +1383,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
        char *name = NULL;
        int namelen;
        int ret;
-       int search_done = 0;
        int log_ref_ver = 0;
        u64 parent_objectid;
        u64 inode_objectid;
@@ -1565,51 +1457,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                         * overwrite any existing back reference, and we don't
                         * want to create dangling pointers in the directory.
                         */
-
-                       if (!search_done) {
-                               ret = __add_inode_ref(trans, root, path, log,
-                                                     BTRFS_I(dir),
-                                                     BTRFS_I(inode),
-                                                     inode_objectid,
-                                                     parent_objectid,
-                                                     ref_index, name, namelen,
-                                                     &search_done);
-                               if (ret) {
-                                       if (ret == 1)
-                                               ret = 0;
-                                       goto out;
-                               }
-                       }
-
-                       /*
-                        * If a reference item already exists for this inode
-                        * with the same parent and name, but different index,
-                        * drop it and the corresponding directory index entries
-                        * from the parent before adding the new reference item
-                        * and dir index entries, otherwise we would fail with
-                        * -EEXIST returned from btrfs_add_link() below.
-                        */
-                       ret = btrfs_inode_ref_exists(inode, dir, key->type,
-                                                    name, namelen);
-                       if (ret > 0) {
-                               ret = unlink_inode_for_log_replay(trans,
-                                                        BTRFS_I(dir),
-                                                        BTRFS_I(inode),
-                                                        name, namelen);
-                               /*
-                                * If we dropped the link count to 0, bump it so
-                                * that later the iput() on the inode will not
-                                * free it. We will fixup the link count later.
-                                */
-                               if (!ret && inode->i_nlink == 0)
-                                       set_nlink(inode, 1);
-                       }
-                       if (ret < 0)
+                       ret = __add_inode_ref(trans, root, path, log,
+                                             BTRFS_I(dir), BTRFS_I(inode),
+                                             inode_objectid, parent_objectid,
+                                             ref_index, name, namelen);
+                       if (ret) {
+                               if (ret == 1)
+                                       ret = 0;
                                goto out;
+                       }
 
                        /* insert our name */
-                       ret = add_link(trans, dir, inode, name, namelen,
-                                      ref_index);
+                       ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+                                            name, namelen, 0, ref_index);
                        if (ret)
                                goto out;
 
@@ -3875,6 +3735,11 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
                        *last_old_dentry_offset = key.offset;
                        continue;
                }
+
+               /* If we logged this dir index item before, we can skip it. */
+               if (key.offset <= inode->last_dir_index_offset)
+                       continue;
+
                /*
                 * We must make sure that when we log a directory entry, the
                 * corresponding inode, after log replay, has a matching link
@@ -3905,51 +3770,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
                                ctx->log_new_dentries = true;
                }
 
-               if (!ctx->logged_before)
-                       goto add_to_batch;
-
-               /*
-                * If we were logged before and have logged dir items, we can skip
-                * checking if any item with a key offset larger than the last one
-                * we logged is in the log tree, saving time and avoiding adding
-                * contention on the log tree. We can only rely on the value of
-                * last_dir_index_offset when we know for sure that the inode was
-                * previously logged in the current transaction.
-                */
-               if (key.offset > inode->last_dir_index_offset)
-                       goto add_to_batch;
-               /*
-                * Check if the key was already logged before. If not we can add
-                * it to a batch for bulk insertion.
-                */
-               ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0);
-               if (ret < 0) {
-                       return ret;
-               } else if (ret > 0) {
-                       btrfs_release_path(dst_path);
-                       goto add_to_batch;
-               }
-
-               /*
-                * Item exists in the log. Overwrite the item in the log if it
-                * has different content or do nothing if it has exactly the same
-                * content. And then flush the current batch if any - do it after
-                * overwriting the current item, or we would deadlock otherwise,
-                * since we are holding a path for the existing item.
-                */
-               ret = do_overwrite_item(trans, log, dst_path, src, i, &key);
-               if (ret < 0)
-                       return ret;
-
-               if (batch_size > 0) {
-                       ret = flush_dir_items_batch(trans, log, src, dst_path,
-                                                   batch_start, batch_size);
-                       if (ret < 0)
-                               return ret;
-                       batch_size = 0;
-               }
-               continue;
-add_to_batch:
                if (batch_size == 0)
                        batch_start = i;
                batch_size++;
@@ -4136,6 +3956,71 @@ done:
 }
 
 /*
+ * If the inode was logged before and then evicted, its last_dir_index_offset
+ * is (u64)-1, so we don't know the value of the last index key offset. If
+ * that's the case, search for it and update the inode. This avoids lookups in
+ * the log tree every time we try to insert a dir index key from a leaf changed
+ * in the current transaction, and allows us to always do batch insertions of
+ * dir index keys.
+ */
+static int update_last_dir_index_offset(struct btrfs_inode *inode,
+                                       struct btrfs_path *path,
+                                       const struct btrfs_log_ctx *ctx)
+{
+       const u64 ino = btrfs_ino(inode);
+       struct btrfs_key key;
+       int ret;
+
+       lockdep_assert_held(&inode->log_mutex);
+
+       if (inode->last_dir_index_offset != (u64)-1)
+               return 0;
+
+       if (!ctx->logged_before) {
+               inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+               return 0;
+       }
+
+       key.objectid = ino;
+       key.type = BTRFS_DIR_INDEX_KEY;
+       key.offset = (u64)-1;
+
+       ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+       /*
+        * An error happened or we actually have an index key with an offset
+        * value of (u64)-1. Bail out, we're done.
+        */
+       if (ret <= 0)
+               goto out;
+
+       ret = 0;
+       inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
+
+       /*
+        * No dir index items, bail out and leave last_dir_index_offset with
+        * the value right before the first valid index value.
+        */
+       if (path->slots[0] == 0)
+               goto out;
+
+       /*
+        * btrfs_search_slot() left us at one slot beyond the slot with the last
+        * index key, or beyond the last key of the directory that is not an
+        * index key. If we have an index key before, set last_dir_index_offset
+        * to its offset value, otherwise leave it with a value right before the
+        * first valid index value, as it means we have an empty directory.
+        */
+       btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+       if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
+               inode->last_dir_index_offset = key.offset;
+
+out:
+       btrfs_release_path(path);
+
+       return ret;
+}
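
Together with the skip check added to process_dir_items_leaf(), keeping last_dir_index_offset accurate means dir index keys at or below it are never re-logged and everything newer can be gathered into a single bulk insertion. A tiny standalone sketch of that idea (values are made up):

#include <stdio.h>

int main(void)
{
        unsigned long long last_logged = 5;             /* last index already in the log */
        unsigned long long keys[] = { 3, 5, 7, 9, 12 }; /* index keys found in the leaf */
        unsigned long long batch[5];
        int batch_len = 0;

        for (int i = 0; i < 5; i++) {
                if (keys[i] <= last_logged)             /* already logged, skip */
                        continue;
                batch[batch_len++] = keys[i];           /* defer for one bulk insert */
        }
        for (int i = 0; i < batch_len; i++)
                printf("log dir index %llu\n", batch[i]);
        return 0;
}
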
+
+/*
  * logging directories is very similar to logging inodes, We find all the items
  * from the current transaction and write them to the log.
  *
@@ -4157,6 +4042,10 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
        u64 max_key;
        int ret;
 
+       ret = update_last_dir_index_offset(inode, path, ctx);
+       if (ret)
+               return ret;
+
        min_key = BTRFS_DIR_START_INDEX;
        max_key = 0;
        ctx->last_dir_item_offset = inode->last_dir_index_offset;
@@ -4382,8 +4271,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
         * file which happens to refer to the same extent as well. Such races
         * can leave checksum items in the log with overlapping ranges.
         */
-       ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
-                              lock_end, &cached_state);
+       ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+                         &cached_state);
        if (ret)
                return ret;
        /*
@@ -4399,8 +4288,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
        if (!ret)
                ret = btrfs_csum_file_blocks(trans, log_root, sums);
 
-       unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
-                            &cached_state);
+       unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end,
+                     &cached_state);
 
        return ret;
 }
@@ -4513,7 +4402,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
                disk_bytenr += extent_offset;
                ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
                                               disk_bytenr + extent_num_bytes - 1,
-                                              &ordered_sums, 0);
+                                              &ordered_sums, 0, false);
                if (ret)
                        goto out;
 
@@ -4709,7 +4598,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
        ret = btrfs_lookup_csums_range(csum_root,
                                       em->block_start + csum_offset,
                                       em->block_start + csum_offset +
-                                      csum_len - 1, &ordered_sums, 0);
+                                      csum_len - 1, &ordered_sums, 0, false);
        if (ret)
                return ret;
 
@@ -5221,10 +5110,9 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
                         * leafs from the log root.
                         */
                        btrfs_release_path(path);
-                       ret = btrfs_insert_file_extent(trans, root->log_root,
-                                                      ino, prev_extent_end, 0,
-                                                      0, hole_len, 0, hole_len,
-                                                      0, 0, 0);
+                       ret = btrfs_insert_hole_extent(trans, root->log_root,
+                                                      ino, prev_extent_end,
+                                                      hole_len);
                        if (ret < 0)
                                return ret;
 
@@ -5253,10 +5141,8 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
 
                btrfs_release_path(path);
                hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
-               ret = btrfs_insert_file_extent(trans, root->log_root,
-                                              ino, prev_extent_end, 0, 0,
-                                              hole_len, 0, hole_len,
-                                              0, 0, 0);
+               ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
+                                              prev_extent_end, hole_len);
                if (ret < 0)
                        return ret;
        }
@@ -5399,111 +5285,461 @@ out:
        return ret;
 }
 
-struct btrfs_ino_list {
+/*
+ * Check if we need to log an inode. This is used in contexts where while
+ * logging an inode we need to log another inode (either that it exists or in
+ * full mode). This is used instead of btrfs_inode_in_log() because the latter
+ * requires the inode to be in the log and have the log transaction committed,
+ * while here we do not care if the log transaction was already committed - our
+ * caller will commit the log later - and we want to avoid logging an inode
+ * multiple times when multiple tasks have joined the same log transaction.
+ */
+static bool need_log_inode(const struct btrfs_trans_handle *trans,
+                          const struct btrfs_inode *inode)
+{
+       /*
+        * If a directory was not modified, no dentries added or removed, we can
+        * and should avoid logging it.
+        */
+       if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
+               return false;
+
+       /*
+        * If this inode does not have new/updated/deleted xattrs since the last
+        * time it was logged and is flagged as logged in the current transaction,
+        * we can skip logging it. As for new/deleted names, those are updated in
+        * the log by link/unlink/rename operations.
+        * In case the inode was logged and then evicted and reloaded, its
+        * logged_trans will be 0, in which case we have to fully log it since
+        * logged_trans is a transient field, not persisted.
+        */
+       if (inode->logged_trans == trans->transid &&
+           !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
+               return false;
+
+       return true;
+}
+
+struct btrfs_dir_list {
        u64 ino;
-       u64 parent;
        struct list_head list;
 };
 
-static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *root,
-                                 struct btrfs_path *path,
-                                 struct btrfs_log_ctx *ctx,
-                                 u64 ino, u64 parent)
+/*
+ * Log the inodes of the new dentries of a directory.
+ * See process_dir_items_leaf() for details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not acquire their VFS lock, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *        CPU0                                        CPU1
+ *        ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not acquiring the VFS lock of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new index items (key type
+ *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names - this is ok, not a problem, because at log replay time we set the
+ *    directory's i_size to the correct value (see replay_one_name() and
+ *    do_overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+                               struct btrfs_inode *start_inode,
+                               struct btrfs_log_ctx *ctx)
 {
-       struct btrfs_ino_list *ino_elem;
-       LIST_HEAD(inode_list);
+       struct btrfs_root *root = start_inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_path *path;
+       LIST_HEAD(dir_list);
+       struct btrfs_dir_list *dir_elem;
+       u64 ino = btrfs_ino(start_inode);
        int ret = 0;
 
-       ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
-       if (!ino_elem)
-               return -ENOMEM;
-       ino_elem->ino = ino;
-       ino_elem->parent = parent;
-       list_add_tail(&ino_elem->list, &inode_list);
+       /*
+        * If we are logging a new name, as part of a link or rename operation,
+        * don't bother logging new dentries, as we just want to log the names
+        * of an inode and that any new parents exist.
+        */
+       if (ctx->logging_new_name)
+               return 0;
 
-       while (!list_empty(&inode_list)) {
-               struct btrfs_fs_info *fs_info = root->fs_info;
-               struct btrfs_key key;
-               struct inode *inode;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
 
-               ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
-                                           list);
-               ino = ino_elem->ino;
-               parent = ino_elem->parent;
-               list_del(&ino_elem->list);
-               kfree(ino_elem);
-               if (ret)
-                       continue;
+       while (true) {
+               struct extent_buffer *leaf;
+               struct btrfs_key min_key;
+               bool continue_curr_inode = true;
+               int nritems;
+               int i;
 
+               min_key.objectid = ino;
+               min_key.type = BTRFS_DIR_INDEX_KEY;
+               min_key.offset = 0;
+again:
                btrfs_release_path(path);
-
-               inode = btrfs_iget(fs_info->sb, ino, root);
-               /*
-                * If the other inode that had a conflicting dir entry was
+               ret = btrfs_search_forward(root, &min_key, path, trans->transid);
+               if (ret < 0) {
+                       break;
+               } else if (ret > 0) {
+                       ret = 0;
+                       goto next;
+               }
+
+               leaf = path->nodes[0];
+               nritems = btrfs_header_nritems(leaf);
+               for (i = path->slots[0]; i < nritems; i++) {
+                       struct btrfs_dir_item *di;
+                       struct btrfs_key di_key;
+                       struct inode *di_inode;
+                       int log_mode = LOG_INODE_EXISTS;
+                       int type;
+
+                       btrfs_item_key_to_cpu(leaf, &min_key, i);
+                       if (min_key.objectid != ino ||
+                           min_key.type != BTRFS_DIR_INDEX_KEY) {
+                               continue_curr_inode = false;
+                               break;
+                       }
+
+                       di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+                       type = btrfs_dir_type(leaf, di);
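+                       /* Skip dentries created in past transactions. */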
+                       if (btrfs_dir_transid(leaf, di) < trans->transid)
+                               continue;
+                       btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
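+                       /* Dentries pointing to subvolume roots are not logged. */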
+                       if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+                               continue;
+
+                       btrfs_release_path(path);
+                       di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
+                       if (IS_ERR(di_inode)) {
+                               ret = PTR_ERR(di_inode);
+                               goto out;
+                       }
+
+                       if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+                               btrfs_add_delayed_iput(di_inode);
+                               break;
+                       }
+
+                       ctx->log_new_dentries = false;
+                       if (type == BTRFS_FT_DIR)
+                               log_mode = LOG_INODE_ALL;
+                       ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
+                                             log_mode, ctx);
+                       btrfs_add_delayed_iput(di_inode);
+                       if (ret)
+                               goto out;
+                       if (ctx->log_new_dentries) {
+                               dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+                               if (!dir_elem) {
+                                       ret = -ENOMEM;
+                                       goto out;
+                               }
+                               dir_elem->ino = di_key.objectid;
+                               list_add_tail(&dir_elem->list, &dir_list);
+                       }
+                       break;
+               }
+
+               if (continue_curr_inode && min_key.offset < (u64)-1) {
+                       min_key.offset++;
+                       goto again;
+               }
+
+next:
+               if (list_empty(&dir_list))
+                       break;
+
+               dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
+               ino = dir_elem->ino;
+               list_del(&dir_elem->list);
+               kfree(dir_elem);
+       }
+out:
+       btrfs_free_path(path);
+       if (ret) {
+               struct btrfs_dir_list *next;
+
+               list_for_each_entry_safe(dir_elem, next, &dir_list, list)
+                       kfree(dir_elem);
+       }
+
+       return ret;
+}
+
+struct btrfs_ino_list {
+       u64 ino;
+       u64 parent;
+       struct list_head list;
+};
+
+static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
+{
+       struct btrfs_ino_list *curr;
+       struct btrfs_ino_list *next;
+
+       list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
+               list_del(&curr->list);
+               kfree(curr);
+       }
+}
+
+static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
+                                   struct btrfs_path *path)
+{
+       struct btrfs_key key;
+       int ret;
+
+       key.objectid = ino;
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+
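+       /*
+        * Search the commit root to find out if the conflicting inode is a
+        * directory. The commit root is never modified, so we can skip locking.
+        */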
+       path->search_commit_root = 1;
+       path->skip_locking = 1;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (WARN_ON_ONCE(ret > 0)) {
+               /*
+                * We have previously found the inode through the commit root
+                * so this should not happen. If it does, just error out and
+                * fallback to a transaction commit.
+                */
+               ret = -ENOENT;
+       } else if (ret == 0) {
+               struct btrfs_inode_item *item;
+
+               item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                     struct btrfs_inode_item);
+               if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
+                       ret = 1;
+       }
+
+       btrfs_release_path(path);
+       path->search_commit_root = 0;
+       path->skip_locking = 0;
+
+       return ret;
+}
+
+static int add_conflicting_inode(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                u64 ino, u64 parent,
+                                struct btrfs_log_ctx *ctx)
+{
+       struct btrfs_ino_list *ino_elem;
+       struct inode *inode;
+
+       /*
+        * It's rare to have a lot of conflicting inodes; in practice it is not
+        * common to have more than 1 or 2. We don't want to collect too many,
+        * as we could end up logging too many inodes (even if only in
+        * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+        * commits.
+        */
+       if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+               return BTRFS_LOG_FORCE_COMMIT;
+
+       inode = btrfs_iget(root->fs_info->sb, ino, root);
+       /*
+        * If the other inode that had a conflicting dir entry was deleted in
+        * the current transaction then we either:
+        *
+        * 1) Log the parent directory (later after adding it to the list) if
+        *    the inode is a directory. This is because it may be a deleted
+        *    subvolume/snapshot or it may be a regular directory that had
+        *    deleted subvolumes/snapshots (or subdirectories that had them),
+        *    and at the moment we can't deal with dropping subvolumes/snapshots
+        *    during log replay. So we just log the parent, which will result in
+        *    a fallback to a transaction commit if we are dealing with those
+        *    cases (last_unlink_trans will match the current transaction);
+        *
+        * 2) Do nothing if it's not a directory. During log replay we simply
+        *    unlink the conflicting dentry from the parent directory and then
+        *    add the dentry for our inode. Like this we can avoid logging the
+        *    parent directory (and maybe fallback to a transaction commit in
+        *    case it has a last_unlink_trans == trans->transid, due to moving
+        *    some inode from it to some other directory).
+        */
+       if (IS_ERR(inode)) {
+               int ret = PTR_ERR(inode);
+
+               if (ret != -ENOENT)
+                       return ret;
+
+               ret = conflicting_inode_is_dir(root, ino, path);
+               /* Not a directory or we got an error. */
+               if (ret <= 0)
+                       return ret;
+
+               /* Conflicting inode is a directory, so we'll log its parent. */
+               ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+               if (!ino_elem)
+                       return -ENOMEM;
+               ino_elem->ino = ino;
+               ino_elem->parent = parent;
+               list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+               ctx->num_conflict_inodes++;
+
+               return 0;
+       }
+
+       /*
+        * If the inode was already logged skip it - otherwise we can hit an
+        * infinite loop. Example:
+        *
+        * From the commit root (previous transaction) we have the following
+        * inodes:
+        *
+        * inode 257 a directory
+        * inode 258 with references "zz" and "zz_link" on inode 257
+        * inode 259 with reference "a" on inode 257
+        *
+        * And in the current (uncommitted) transaction we have:
+        *
+        * inode 257 a directory, unchanged
+        * inode 258 with references "a" and "a2" on inode 257
+        * inode 259 with reference "zz_link" on inode 257
+        * inode 261 with reference "zz" on inode 257
+        *
+        * When logging inode 261 the following infinite loop could
+        * happen if we don't skip already logged inodes:
+        *
+        * - we detect inode 258 as a conflicting inode, with inode 261
+        *   on reference "zz", and log it;
+        *
+        * - we detect inode 259 as a conflicting inode, with inode 258
+        *   on reference "a", and log it;
+        *
+        * - we detect inode 258 as a conflicting inode, with inode 259
+        *   on reference "zz_link", and log it - again! After this we
+        *   repeat the above steps forever.
+        *
+        * Here we can use need_log_inode() because we only need to log the
+        * inode in LOG_INODE_EXISTS mode and rename operations update the log,
+        * so that the log ends up with the new name and without the old name.
+        */
+       if (!need_log_inode(trans, BTRFS_I(inode))) {
+               btrfs_add_delayed_iput(inode);
+               return 0;
+       }
+
+       btrfs_add_delayed_iput(inode);
+
+       ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
+       if (!ino_elem)
+               return -ENOMEM;
+       ino_elem->ino = ino;
+       ino_elem->parent = parent;
+       list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
+       ctx->num_conflict_inodes++;
+
+       return 0;
+}
+
+static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 struct btrfs_log_ctx *ctx)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       int ret = 0;
+
+       /*
+        * Conflicting inodes are logged by the first call to btrfs_log_inode(),
+        * otherwise we could have unbounded recursion of btrfs_log_inode()
+        * calls. This check guarantees we can have only 1 level of recursion.
+        */
+       if (ctx->logging_conflict_inodes)
+               return 0;
+
+       ctx->logging_conflict_inodes = true;
+
+       /*
+        * New conflicting inodes may be found and added to the list while we
+        * are logging a conflicting inode, so keep iterating while the list is
+        * not empty.
+        */
+       while (!list_empty(&ctx->conflict_inodes)) {
+               struct btrfs_ino_list *curr;
+               struct inode *inode;
+               u64 ino;
+               u64 parent;
+
+               curr = list_first_entry(&ctx->conflict_inodes,
+                                       struct btrfs_ino_list, list);
+               ino = curr->ino;
+               parent = curr->parent;
+               list_del(&curr->list);
+               kfree(curr);
+
+               inode = btrfs_iget(fs_info->sb, ino, root);
+               /*
+                * If the other inode that had a conflicting dir entry was
                 * deleted in the current transaction, we need to log its parent
-                * directory.
+                * directory. See the comment at add_conflicting_inode().
                 */
                if (IS_ERR(inode)) {
                        ret = PTR_ERR(inode);
-                       if (ret == -ENOENT) {
-                               inode = btrfs_iget(fs_info->sb, parent, root);
-                               if (IS_ERR(inode)) {
-                                       ret = PTR_ERR(inode);
-                               } else {
-                                       ret = btrfs_log_inode(trans,
-                                                     BTRFS_I(inode),
-                                                     LOG_OTHER_INODE_ALL,
-                                                     ctx);
-                                       btrfs_add_delayed_iput(inode);
-                               }
+                       if (ret != -ENOENT)
+                               break;
+
+                       inode = btrfs_iget(fs_info->sb, parent, root);
+                       if (IS_ERR(inode)) {
+                               ret = PTR_ERR(inode);
+                               break;
                        }
+
+                       /*
+                        * Always log the directory, we cannot make this
+                        * conditional on need_log_inode() because the directory
+                        * might have been logged in LOG_INODE_EXISTS mode or
+                        * the dir index of the conflicting inode is not in a
+                        * dir index key range logged for the directory. So we
+                        * must make sure the deletion is recorded.
+                        */
+                       ret = btrfs_log_inode(trans, BTRFS_I(inode),
+                                             LOG_INODE_ALL, ctx);
+                       btrfs_add_delayed_iput(inode);
+                       if (ret)
+                               break;
                        continue;
                }
+
                /*
-                * If the inode was already logged skip it - otherwise we can
-                * hit an infinite loop. Example:
-                *
-                * From the commit root (previous transaction) we have the
-                * following inodes:
-                *
-                * inode 257 a directory
-                * inode 258 with references "zz" and "zz_link" on inode 257
-                * inode 259 with reference "a" on inode 257
-                *
-                * And in the current (uncommitted) transaction we have:
-                *
-                * inode 257 a directory, unchanged
-                * inode 258 with references "a" and "a2" on inode 257
-                * inode 259 with reference "zz_link" on inode 257
-                * inode 261 with reference "zz" on inode 257
+                * Here we can use need_log_inode() because we only need to log
+                * the inode in LOG_INODE_EXISTS mode and rename operations
+                * update the log, so that the log ends up with the new name and
+                * without the old name.
                 *
-                * When logging inode 261 the following infinite loop could
-                * happen if we don't skip already logged inodes:
-                *
-                * - we detect inode 258 as a conflicting inode, with inode 261
-                *   on reference "zz", and log it;
-                *
-                * - we detect inode 259 as a conflicting inode, with inode 258
-                *   on reference "a", and log it;
-                *
-                * - we detect inode 258 as a conflicting inode, with inode 259
-                *   on reference "zz_link", and log it - again! After this we
-                *   repeat the above steps forever.
+                * We did this check at add_conflicting_inode(), but we do it
+                * again here because some other task may have logged the inode
+                * since then, in which case we can avoid logging it again.
                 */
-               spin_lock(&BTRFS_I(inode)->lock);
-               /*
-                * Check the inode's logged_trans only instead of
-                * btrfs_inode_in_log(). This is because the last_log_commit of
-                * the inode is not updated when we only log that it exists (see
-                * btrfs_log_inode()).
-                */
-               if (BTRFS_I(inode)->logged_trans == trans->transid) {
-                       spin_unlock(&BTRFS_I(inode)->lock);
+               if (!need_log_inode(trans, BTRFS_I(inode))) {
                        btrfs_add_delayed_iput(inode);
                        continue;
                }
-               spin_unlock(&BTRFS_I(inode)->lock);
+
                /*
                 * We are safe logging the other inode without acquiring its
                 * lock as long as we log with the LOG_INODE_EXISTS mode. We
@@ -5511,67 +5747,16 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
                 * well because during a rename we pin the log and update the
                 * log with the new name before we unpin it.
                 */
-               ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx);
-               if (ret) {
-                       btrfs_add_delayed_iput(inode);
-                       continue;
-               }
-
-               key.objectid = ino;
-               key.type = BTRFS_INODE_REF_KEY;
-               key.offset = 0;
-               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0) {
-                       btrfs_add_delayed_iput(inode);
-                       continue;
-               }
-
-               while (true) {
-                       struct extent_buffer *leaf = path->nodes[0];
-                       int slot = path->slots[0];
-                       u64 other_ino = 0;
-                       u64 other_parent = 0;
-
-                       if (slot >= btrfs_header_nritems(leaf)) {
-                               ret = btrfs_next_leaf(root, path);
-                               if (ret < 0) {
-                                       break;
-                               } else if (ret > 0) {
-                                       ret = 0;
-                                       break;
-                               }
-                               continue;
-                       }
-
-                       btrfs_item_key_to_cpu(leaf, &key, slot);
-                       if (key.objectid != ino ||
-                           (key.type != BTRFS_INODE_REF_KEY &&
-                            key.type != BTRFS_INODE_EXTREF_KEY)) {
-                               ret = 0;
-                               break;
-                       }
-
-                       ret = btrfs_check_ref_name_override(leaf, slot, &key,
-                                       BTRFS_I(inode), &other_ino,
-                                       &other_parent);
-                       if (ret < 0)
-                               break;
-                       if (ret > 0) {
-                               ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
-                               if (!ino_elem) {
-                                       ret = -ENOMEM;
-                                       break;
-                               }
-                               ino_elem->ino = other_ino;
-                               ino_elem->parent = other_parent;
-                               list_add_tail(&ino_elem->list, &inode_list);
-                               ret = 0;
-                       }
-                       path->slots[0]++;
-               }
+               ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
                btrfs_add_delayed_iput(inode);
+               if (ret)
+                       break;
        }
 
+       ctx->logging_conflict_inodes = false;
+       if (ret)
+               free_conflicting_inodes(ctx);
+
        return ret;
 }
 
@@ -5582,7 +5767,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
                                   struct btrfs_path *path,
                                   struct btrfs_path *dst_path,
                                   const u64 logged_isize,
-                                  const bool recursive_logging,
                                   const int inode_only,
                                   struct btrfs_log_ctx *ctx,
                                   bool *need_log_inode_item)
@@ -5621,8 +5805,8 @@ again:
                        break;
                } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
                            min_key->type == BTRFS_INODE_EXTREF_KEY) &&
-                          inode->generation == trans->transid &&
-                          !recursive_logging) {
+                          (inode->generation == trans->transid ||
+                           ctx->logging_conflict_inodes)) {
                        u64 other_ino = 0;
                        u64 other_parent = 0;
 
@@ -5646,11 +5830,12 @@ again:
                                        return ret;
                                ins_nr = 0;
 
-                               ret = log_conflicting_inodes(trans, root, path,
-                                               ctx, other_ino, other_parent);
+                               btrfs_release_path(path);
+                               ret = add_conflicting_inode(trans, root, path,
+                                                           other_ino,
+                                                           other_parent, ctx);
                                if (ret)
                                        return ret;
-                               btrfs_release_path(path);
                                goto next_key;
                        }
                } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
@@ -5708,28 +5893,393 @@ next_key:
                }
 
                /*
-                * We may process many leaves full of items for our inode, so
-                * avoid monopolizing a cpu for too long by rescheduling while
-                * not holding locks on any tree.
+                * We may process many leaves full of items for our inode, so
+                * avoid monopolizing a cpu for too long by rescheduling while
+                * not holding locks on any tree.
+                */
+               cond_resched();
+       }
+       if (ins_nr) {
+               ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+                                ins_nr, inode_only, logged_isize);
+               if (ret)
+                       return ret;
+       }
+
+       if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+               /*
+                * Release the path because otherwise we might attempt to double
+                * lock the same leaf with btrfs_log_prealloc_extents() below.
+                */
+               btrfs_release_path(path);
+               ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+       }
+
+       return ret;
+}
+
+static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *log,
+                                     struct btrfs_path *path,
+                                     const struct btrfs_item_batch *batch,
+                                     const struct btrfs_delayed_item *first_item)
+{
+       const struct btrfs_delayed_item *curr = first_item;
+       int ret;
+
+       ret = btrfs_insert_empty_items(trans, log, path, batch);
+       if (ret)
+               return ret;
+
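+       /*
+        * Copy the data of each delayed item into the leaf slots that were
+        * just reserved by btrfs_insert_empty_items().
+        */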
+       for (int i = 0; i < batch->nr; i++) {
+               char *data_ptr;
+
+               data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
+               write_extent_buffer(path->nodes[0], &curr->data,
+                                   (unsigned long)data_ptr, curr->data_len);
+               curr = list_next_entry(curr, log_list);
+               path->slots[0]++;
+       }
+
+       btrfs_release_path(path);
+
+       return 0;
+}
+
+static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
+                                      struct btrfs_inode *inode,
+                                      struct btrfs_path *path,
+                                      const struct list_head *delayed_ins_list,
+                                      struct btrfs_log_ctx *ctx)
+{
+       /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
+       const int max_batch_size = 195;
+       const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
+       const u64 ino = btrfs_ino(inode);
+       struct btrfs_root *log = inode->root->log_root;
+       struct btrfs_item_batch batch = {
+               .nr = 0,
+               .total_data_size = 0,
+       };
+       const struct btrfs_delayed_item *first = NULL;
+       const struct btrfs_delayed_item *curr;
+       char *ins_data;
+       struct btrfs_key *ins_keys;
+       u32 *ins_sizes;
+       u64 curr_batch_size = 0;
+       int batch_idx = 0;
+       int ret;
+
+       /* We are adding dir index items to the log tree. */
+       lockdep_assert_held(&inode->log_mutex);
+
+       /*
+        * We collect delayed items before copying index keys from the subvolume
+        * to the log tree. However just after we collected them, they may have
+        * been flushed (all of them or just some of them), and therefore we
+        * could have copied them from the subvolume tree to the log tree.
+        * So find the first delayed item that was not yet logged (they are
+        * sorted by index number).
+        */
+       list_for_each_entry(curr, delayed_ins_list, log_list) {
+               if (curr->index > inode->last_dir_index_offset) {
+                       first = curr;
+                       break;
+               }
+       }
+
+       /* Empty list or all delayed items were already logged. */
+       if (!first)
+               return 0;
+
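+       /*
+        * Single allocation holding the array of item data sizes followed by
+        * the array of keys, both with max_batch_size elements.
+        */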
+       ins_data = kmalloc(max_batch_size * sizeof(u32) +
+                          max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
+       if (!ins_data)
+               return -ENOMEM;
+       ins_sizes = (u32 *)ins_data;
+       batch.data_sizes = ins_sizes;
+       ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
+       batch.keys = ins_keys;
+
+       curr = first;
+       while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
+               const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
+
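+               /*
+                * Submit the current batch if adding this item would exceed
+                * the leaf data size or fill the preallocated keys and sizes
+                * arrays.
+                */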
+               if (curr_batch_size + curr_size > leaf_data_size ||
+                   batch.nr == max_batch_size) {
+                       ret = insert_delayed_items_batch(trans, log, path,
+                                                        &batch, first);
+                       if (ret)
+                               goto out;
+                       batch_idx = 0;
+                       batch.nr = 0;
+                       batch.total_data_size = 0;
+                       curr_batch_size = 0;
+                       first = curr;
+               }
+
+               ins_sizes[batch_idx] = curr->data_len;
+               ins_keys[batch_idx].objectid = ino;
+               ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
+               ins_keys[batch_idx].offset = curr->index;
+               curr_batch_size += curr_size;
+               batch.total_data_size += curr->data_len;
+               batch.nr++;
+               batch_idx++;
+               curr = list_next_entry(curr, log_list);
+       }
+
+       ASSERT(batch.nr >= 1);
+       ret = insert_delayed_items_batch(trans, log, path, &batch, first);
+
+       curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
+                              log_list);
+       inode->last_dir_index_offset = curr->index;
+out:
+       kfree(ins_data);
+
+       return ret;
+}
+
+static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
+                                     struct btrfs_inode *inode,
+                                     struct btrfs_path *path,
+                                     const struct list_head *delayed_del_list,
+                                     struct btrfs_log_ctx *ctx)
+{
+       const u64 ino = btrfs_ino(inode);
+       const struct btrfs_delayed_item *curr;
+
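+       /*
+        * The directory was not logged before in the current transaction, so
+        * there are no dir index keys of it in the log tree to delete. We only
+        * need to insert dir log range items covering the deleted index numbers.
+        */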
+       curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+                               log_list);
+
+       while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+               u64 first_dir_index = curr->index;
+               u64 last_dir_index;
+               const struct btrfs_delayed_item *next;
+               int ret;
+
+               /*
+                * Find a range of consecutive dir index items to delete. This
+                * way we log a single dir range item spanning several contiguous
+                * dir index items instead of one range item per dir index item.
+                */
+               next = list_next_entry(curr, log_list);
+               while (!list_entry_is_head(next, delayed_del_list, log_list)) {
+                       if (next->index != curr->index + 1)
+                               break;
+                       curr = next;
+                       next = list_next_entry(next, log_list);
+               }
+
+               last_dir_index = curr->index;
+               ASSERT(last_dir_index >= first_dir_index);
+
+               ret = insert_dir_log_key(trans, inode->root->log_root, path,
+                                        ino, first_dir_index, last_dir_index);
+               if (ret)
+                       return ret;
+               curr = list_next_entry(curr, log_list);
+       }
+
+       return 0;
+}
+
+static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
+                                       struct btrfs_inode *inode,
+                                       struct btrfs_path *path,
+                                       struct btrfs_log_ctx *ctx,
+                                       const struct list_head *delayed_del_list,
+                                       const struct btrfs_delayed_item *first,
+                                       const struct btrfs_delayed_item **last_ret)
+{
+       const struct btrfs_delayed_item *next;
+       struct extent_buffer *leaf = path->nodes[0];
+       const int last_slot = btrfs_header_nritems(leaf) - 1;
+       int slot = path->slots[0] + 1;
+       const u64 ino = btrfs_ino(inode);
+
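+       /*
+        * Starting at path->slots[0], walk forward while the following leaf
+        * slots match the next delayed deletion items, so that all of them can
+        * be deleted with a single call to btrfs_del_items().
+        */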
+       next = list_next_entry(first, log_list);
+
+       while (slot < last_slot &&
+              !list_entry_is_head(next, delayed_del_list, log_list)) {
+               struct btrfs_key key;
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid != ino ||
+                   key.type != BTRFS_DIR_INDEX_KEY ||
+                   key.offset != next->index)
+                       break;
+
+               slot++;
+               *last_ret = next;
+               next = list_next_entry(next, log_list);
+       }
+
+       return btrfs_del_items(trans, inode->root->log_root, path,
+                              path->slots[0], slot - path->slots[0]);
+}
+
+static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
+                                            struct btrfs_inode *inode,
+                                            struct btrfs_path *path,
+                                            const struct list_head *delayed_del_list,
+                                            struct btrfs_log_ctx *ctx)
+{
+       struct btrfs_root *log = inode->root->log_root;
+       const struct btrfs_delayed_item *curr;
+       u64 last_range_start;
+       u64 last_range_end = 0;
+       struct btrfs_key key;
+
+       key.objectid = btrfs_ino(inode);
+       key.type = BTRFS_DIR_INDEX_KEY;
+       curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
+                               log_list);
+
+       while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
+               const struct btrfs_delayed_item *last = curr;
+               u64 first_dir_index = curr->index;
+               u64 last_dir_index;
+               bool deleted_items = false;
+               int ret;
+
+               key.offset = curr->index;
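+               /*
+                * Look up the dir index key in the log tree. If it exists, we
+                * delete it together with any following contiguous index keys
+                * that are also in the delayed deletion list.
+                */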
+               ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+               if (ret < 0) {
+                       return ret;
+               } else if (ret == 0) {
+                       ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+                                                          delayed_del_list, curr,
+                                                          &last);
+                       if (ret)
+                               return ret;
+                       deleted_items = true;
+               }
+
+               btrfs_release_path(path);
+
+               /*
+                * If we deleted items from the leaf, it means we have a range
+                * item logging their range, so no need to add one or update an
+                * existing one. Otherwise we have to log a dir range item.
                 */
-               cond_resched();
-       }
-       if (ins_nr) {
-               ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
-                                ins_nr, inode_only, logged_isize);
+               if (deleted_items)
+                       goto next_batch;
+
+               last_dir_index = last->index;
+               ASSERT(last_dir_index >= first_dir_index);
+               /*
+                * If this range starts right after where the previous one ends,
+                * then we want to reuse the previous range item and change its
+                * end offset to the end of this range. This is just to minimize
+                * leaf space usage, by avoiding adding a new range item.
+                */
+               if (last_range_end != 0 && first_dir_index == last_range_end + 1)
+                       first_dir_index = last_range_start;
+
+               ret = insert_dir_log_key(trans, log, path, key.objectid,
+                                        first_dir_index, last_dir_index);
                if (ret)
                        return ret;
+
+               last_range_start = first_dir_index;
+               last_range_end = last_dir_index;
+next_batch:
+               curr = list_next_entry(last, log_list);
        }
 
-       if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
-               /*
-                * Release the path because otherwise we might attempt to double
-                * lock the same leaf with btrfs_log_prealloc_extents() below.
-                */
-               btrfs_release_path(path);
-               ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+       return 0;
+}
+
+static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
+                                     struct btrfs_inode *inode,
+                                     struct btrfs_path *path,
+                                     const struct list_head *delayed_del_list,
+                                     struct btrfs_log_ctx *ctx)
+{
+       /*
+        * We are deleting dir index items from the log tree or adding range
+        * items to it.
+        */
+       lockdep_assert_held(&inode->log_mutex);
+
+       if (list_empty(delayed_del_list))
+               return 0;
+
+       if (ctx->logged_before)
+               return log_delayed_deletions_incremental(trans, inode, path,
+                                                        delayed_del_list, ctx);
+
+       return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
+                                         ctx);
+}
+
+/*
+ * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
+ * items instead of the subvolume tree.
+ */
+static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
+                                   struct btrfs_inode *inode,
+                                   const struct list_head *delayed_ins_list,
+                                   struct btrfs_log_ctx *ctx)
+{
+       const bool orig_log_new_dentries = ctx->log_new_dentries;
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       struct btrfs_delayed_item *item;
+       int ret = 0;
+
+       /*
+        * There is no need to hold the log mutex here, and not holding it also
+        * avoids potential deadlocks or the need for lockdep annotations due to
+        * nesting of delayed inode mutexes and log mutexes.
+        */
+       lockdep_assert_not_held(&inode->log_mutex);
+
+       ASSERT(!ctx->logging_new_delayed_dentries);
+       ctx->logging_new_delayed_dentries = true;
+
+       list_for_each_entry(item, delayed_ins_list, log_list) {
+               struct btrfs_dir_item *dir_item;
+               struct inode *di_inode;
+               struct btrfs_key key;
+               int log_mode = LOG_INODE_EXISTS;
+
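+               /*
+                * The delayed item's data is a dir item, get the key of the
+                * inode it points to so we can load and log that inode.
+                */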
+               dir_item = (struct btrfs_dir_item *)item->data;
+               btrfs_disk_key_to_cpu(&key, &dir_item->location);
+
+               if (key.type == BTRFS_ROOT_ITEM_KEY)
+                       continue;
+
+               di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
+               if (IS_ERR(di_inode)) {
+                       ret = PTR_ERR(di_inode);
+                       break;
+               }
+
+               if (!need_log_inode(trans, BTRFS_I(di_inode))) {
+                       btrfs_add_delayed_iput(di_inode);
+                       continue;
+               }
+
+               if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR)
+                       log_mode = LOG_INODE_ALL;
+
+               ctx->log_new_dentries = false;
+               ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+
+               if (!ret && ctx->log_new_dentries)
+                       ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+
+               btrfs_add_delayed_iput(di_inode);
+
+               if (ret)
+                       break;
        }
 
+       ctx->log_new_dentries = orig_log_new_dentries;
+       ctx->logging_new_delayed_dentries = false;
+
        return ret;
 }
 
@@ -5764,9 +6314,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        u64 logged_isize = 0;
        bool need_log_inode_item = true;
        bool xattrs_logged = false;
-       bool recursive_logging = false;
        bool inode_item_dropped = true;
-       const bool orig_logged_before = ctx->logged_before;
+       bool full_dir_logging = false;
+       LIST_HEAD(delayed_ins_list);
+       LIST_HEAD(delayed_del_list);
 
        path = btrfs_alloc_path();
        if (!path)
@@ -5794,27 +6345,46 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
 
+       if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
+               full_dir_logging = true;
+
        /*
-        * Only run delayed items if we are a directory. We want to make sure
-        * all directory indexes hit the fs/subvolume tree so we can find them
-        * and figure out which index ranges have to be logged.
+        * If we are logging a directory while we are logging dentries of the
+        * delayed items of some other inode, then we need to flush the delayed
+        * items of this directory and not log the delayed items directly. This
+        * is to prevent more than one level of recursion into btrfs_log_inode()
+        * by having something like this:
+        *
+        *     $ mkdir -p a/b/c/d/e/f/g/h/...
+        *     $ xfs_io -c "fsync" a
+        *
+        * Where all directories in the path did not exist before and are
+        * created in the current transaction.
+        * So in such a case we directly log the delayed items of the main
+        * directory ("a") without flushing them first, while for each of its
+        * subdirectories we flush their delayed items before logging them.
+        * This prevents a potential unbounded recursion like this:
+        *
+        * btrfs_log_inode()
+        *   log_new_delayed_dentries()
+        *      btrfs_log_inode()
+        *        log_new_delayed_dentries()
+        *          btrfs_log_inode()
+        *            log_new_delayed_dentries()
+        *              (...)
+        *
+        * We have thresholds for the maximum number of delayed items to have in
+        * memory, and once they are hit, the items are flushed asynchronously.
+        * However the limit is quite high, so let's prevent deep levels of
+        * recursion from happening by limiting the maximum depth to 1.
         */
-       if (S_ISDIR(inode->vfs_inode.i_mode)) {
+       if (full_dir_logging && ctx->logging_new_delayed_dentries) {
                ret = btrfs_commit_inode_delayed_items(trans, inode);
                if (ret)
                        goto out;
        }
 
-       if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
-               recursive_logging = true;
-               if (inode_only == LOG_OTHER_INODE)
-                       inode_only = LOG_INODE_EXISTS;
-               else
-                       inode_only = LOG_INODE_ALL;
-               mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
-       } else {
-               mutex_lock(&inode->log_mutex);
-       }
+       mutex_lock(&inode->log_mutex);
 
        /*
         * For symlinks, we must always log their content, which is stored in an
@@ -5846,9 +6416,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
         * to known the file was moved from A to B, so logging just A would
         * result in losing the file after a log replay.
         */
-       if (S_ISDIR(inode->vfs_inode.i_mode) &&
-           inode_only == LOG_INODE_ALL &&
-           inode->last_unlink_trans >= trans->transid) {
+       if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
                btrfs_set_log_full_commit(trans);
                ret = BTRFS_LOG_FORCE_COMMIT;
                goto out_unlock;
@@ -5859,14 +6427,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
         * copies of everything.
         */
        if (S_ISDIR(inode->vfs_inode.i_mode)) {
-               int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
-
                clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
-               if (inode_only == LOG_INODE_EXISTS)
-                       max_key_type = BTRFS_XATTR_ITEM_KEY;
                if (ctx->logged_before)
                        ret = drop_inode_items(trans, log, path, inode,
-                                              max_key_type);
+                                              BTRFS_XATTR_ITEM_KEY);
        } else {
                if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
                        /*
@@ -5922,9 +6486,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        if (ret)
                goto out_unlock;
 
+       /*
+        * If we are logging a directory in full mode, collect the delayed items
+        * before iterating the subvolume tree, so that we don't miss any new
+        * dir index items in case they get flushed while or right after we are
+        * iterating the subvolume tree.
+        */
+       if (full_dir_logging && !ctx->logging_new_delayed_dentries)
+               btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+                                           &delayed_del_list);
+
        ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
                                      path, dst_path, logged_isize,
-                                     recursive_logging, inode_only, ctx,
+                                     inode_only, ctx,
                                      &need_log_inode_item);
        if (ret)
                goto out_unlock;
@@ -5977,10 +6551,18 @@ log_extents:
                write_unlock(&em_tree->lock);
        }
 
-       if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
+       if (full_dir_logging) {
                ret = log_directory_changes(trans, inode, path, dst_path, ctx);
                if (ret)
                        goto out_unlock;
+               ret = log_delayed_insertion_items(trans, inode, path,
+                                                 &delayed_ins_list, ctx);
+               if (ret)
+                       goto out_unlock;
+               ret = log_delayed_deletion_items(trans, inode, path,
+                                                &delayed_del_list, ctx);
+               if (ret)
+                       goto out_unlock;
        }
 
        spin_lock(&inode->lock);
@@ -6033,208 +6615,20 @@ out:
        btrfs_free_path(path);
        btrfs_free_path(dst_path);
 
-       if (recursive_logging)
-               ctx->logged_before = orig_logged_before;
-
-       return ret;
-}
-
-/*
- * Check if we need to log an inode. This is used in contexts where while
- * logging an inode we need to log another inode (either that it exists or in
- * full mode). This is used instead of btrfs_inode_in_log() because the later
- * requires the inode to be in the log and have the log transaction committed,
- * while here we do not care if the log transaction was already committed - our
- * caller will commit the log later - and we want to avoid logging an inode
- * multiple times when multiple tasks have joined the same log transaction.
- */
-static bool need_log_inode(struct btrfs_trans_handle *trans,
-                          struct btrfs_inode *inode)
-{
-       /*
-        * If a directory was not modified, no dentries added or removed, we can
-        * and should avoid logging it.
-        */
-       if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
-               return false;
-
-       /*
-        * If this inode does not have new/updated/deleted xattrs since the last
-        * time it was logged and is flagged as logged in the current transaction,
-        * we can skip logging it. As for new/deleted names, those are updated in
-        * the log by link/unlink/rename operations.
-        * In case the inode was logged and then evicted and reloaded, its
-        * logged_trans will be 0, in which case we have to fully log it since
-        * logged_trans is a transient field, not persisted.
-        */
-       if (inode->logged_trans == trans->transid &&
-           !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
-               return false;
-
-       return true;
-}
-
-struct btrfs_dir_list {
-       u64 ino;
-       struct list_head list;
-};
-
-/*
- * Log the inodes of the new dentries of a directory. See log_dir_items() for
- * details about the why it is needed.
- * This is a recursive operation - if an existing dentry corresponds to a
- * directory, that directory's new entries are logged too (same behaviour as
- * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
- * the dentries point to we do not lock their i_mutex, otherwise lockdep
- * complains about the following circular lock dependency / possible deadlock:
- *
- *        CPU0                                        CPU1
- *        ----                                        ----
- * lock(&type->i_mutex_dir_key#3/2);
- *                                            lock(sb_internal#2);
- *                                            lock(&type->i_mutex_dir_key#3/2);
- * lock(&sb->s_type->i_mutex_key#14);
- *
- * Where sb_internal is the lock (a counter that works as a lock) acquired by
- * sb_start_intwrite() in btrfs_start_transaction().
- * Not locking i_mutex of the inodes is still safe because:
- *
- * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
- *    that while logging the inode new references (names) are added or removed
- *    from the inode, leaving the logged inode item with a link count that does
- *    not match the number of logged inode reference items. This is fine because
- *    at log replay time we compute the real number of links and correct the
- *    link count in the inode item (see replay_one_buffer() and
- *    link_to_fixup_dir());
- *
- * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
- *    while logging the inode's items new index items (key type
- *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
- *    has a size that doesn't match the sum of the lengths of all the logged
- *    names - this is ok, not a problem, because at log replay time we set the
- *    directory's i_size to the correct value (see replay_one_name() and
- *    do_overwrite_item()).
- */
-static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
-                               struct btrfs_inode *start_inode,
-                               struct btrfs_log_ctx *ctx)
-{
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_path *path;
-       LIST_HEAD(dir_list);
-       struct btrfs_dir_list *dir_elem;
-       int ret = 0;
-
-       /*
-        * If we are logging a new name, as part of a link or rename operation,
-        * don't bother logging new dentries, as we just want to log the names
-        * of an inode and that any new parents exist.
-        */
-       if (ctx->logging_new_name)
-               return 0;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-
-       dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
-       if (!dir_elem) {
-               btrfs_free_path(path);
-               return -ENOMEM;
-       }
-       dir_elem->ino = btrfs_ino(start_inode);
-       list_add_tail(&dir_elem->list, &dir_list);
-
-       while (!list_empty(&dir_list)) {
-               struct extent_buffer *leaf;
-               struct btrfs_key min_key;
-               int nritems;
-               int i;
-
-               dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
-                                           list);
-               if (ret)
-                       goto next_dir_inode;
-
-               min_key.objectid = dir_elem->ino;
-               min_key.type = BTRFS_DIR_INDEX_KEY;
-               min_key.offset = 0;
-again:
-               btrfs_release_path(path);
-               ret = btrfs_search_forward(root, &min_key, path, trans->transid);
-               if (ret < 0) {
-                       goto next_dir_inode;
-               } else if (ret > 0) {
-                       ret = 0;
-                       goto next_dir_inode;
-               }
-
-               leaf = path->nodes[0];
-               nritems = btrfs_header_nritems(leaf);
-               for (i = path->slots[0]; i < nritems; i++) {
-                       struct btrfs_dir_item *di;
-                       struct btrfs_key di_key;
-                       struct inode *di_inode;
-                       struct btrfs_dir_list *new_dir_elem;
-                       int log_mode = LOG_INODE_EXISTS;
-                       int type;
-
-                       btrfs_item_key_to_cpu(leaf, &min_key, i);
-                       if (min_key.objectid != dir_elem->ino ||
-                           min_key.type != BTRFS_DIR_INDEX_KEY)
-                               goto next_dir_inode;
-
-                       di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
-                       type = btrfs_dir_type(leaf, di);
-                       if (btrfs_dir_transid(leaf, di) < trans->transid)
-                               continue;
-                       btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
-                       if (di_key.type == BTRFS_ROOT_ITEM_KEY)
-                               continue;
-
-                       btrfs_release_path(path);
-                       di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
-                       if (IS_ERR(di_inode)) {
-                               ret = PTR_ERR(di_inode);
-                               goto next_dir_inode;
-                       }
+       if (ret)
+               free_conflicting_inodes(ctx);
+       else
+               ret = log_conflicting_inodes(trans, inode->root, ctx);
 
-                       if (!need_log_inode(trans, BTRFS_I(di_inode))) {
-                               btrfs_add_delayed_iput(di_inode);
-                               break;
-                       }
+       if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
+               if (!ret)
+                       ret = log_new_delayed_dentries(trans, inode,
+                                                      &delayed_ins_list, ctx);
 
-                       ctx->log_new_dentries = false;
-                       if (type == BTRFS_FT_DIR)
-                               log_mode = LOG_INODE_ALL;
-                       ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
-                                             log_mode, ctx);
-                       btrfs_add_delayed_iput(di_inode);
-                       if (ret)
-                               goto next_dir_inode;
-                       if (ctx->log_new_dentries) {
-                               new_dir_elem = kmalloc(sizeof(*new_dir_elem),
-                                                      GFP_NOFS);
-                               if (!new_dir_elem) {
-                                       ret = -ENOMEM;
-                                       goto next_dir_inode;
-                               }
-                               new_dir_elem->ino = di_key.objectid;
-                               list_add_tail(&new_dir_elem->list, &dir_list);
-                       }
-                       break;
-               }
-               if (min_key.offset < (u64)-1) {
-                       min_key.offset++;
-                       goto again;
-               }
-next_dir_inode:
-               list_del(&dir_elem->list);
-               kfree(dir_elem);
+               btrfs_log_put_delayed_items(inode, &delayed_ins_list,
+                                           &delayed_del_list);
        }
 
-       btrfs_free_path(path);
        return ret;
 }
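
The removed log_new_dir_dentries() walked directories breadth-first, using a plain list as a FIFO work queue: take the next directory, log the inodes its new dentries point to, and append any directory found along the way. A small userspace model of that traversal pattern, on a toy tree rather than btrfs items:

#include <stdio.h>
#include <stdlib.h>

struct dirent_node {                  /* toy directory tree node */
        const char *name;
        int nr_subdirs;
        const struct dirent_node *subdirs;
};

struct work_item {                    /* list used as a FIFO work queue */
        const struct dirent_node *dir;
        struct work_item *next;
};

static struct work_item *new_item(const struct dirent_node *dir)
{
        struct work_item *w = calloc(1, sizeof(*w));

        if (!w)
                exit(1);              /* error handling kept trivial */
        w->dir = dir;
        return w;
}

int main(void)
{
        static const struct dirent_node c[] = { { "a/c", 0, NULL } };
        static const struct dirent_node ab[] = { { "a", 1, c }, { "b", 0, NULL } };
        static const struct dirent_node root = { ".", 2, ab };
        struct work_item *head = new_item(&root);
        struct work_item **tail = &head->next;

        while (head) {
                struct work_item *cur = head;

                printf("logging directory %s\n", cur->dir->name);
                /* Any subdirectory found while logging joins the queue. */
                for (int i = 0; i < cur->dir->nr_subdirs; i++) {
                        *tail = new_item(&cur->dir->subdirs[i]);
                        tail = &(*tail)->next;
                }
                head = cur->next;
                free(cur);
        }
        return 0;
}
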
 
@@ -6346,7 +6740,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
                        ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
                                              LOG_INODE_ALL, ctx);
                        if (!ret && ctx->log_new_dentries)
-                               ret = log_new_dir_dentries(trans, root,
+                               ret = log_new_dir_dentries(trans,
                                                   BTRFS_I(dir_inode), ctx);
                        btrfs_add_delayed_iput(dir_inode);
                        if (ret)
@@ -6661,7 +7055,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                goto end_trans;
 
        if (log_dentries)
-               ret = log_new_dir_dentries(trans, root, inode, ctx);
+               ret = log_new_dir_dentries(trans, inode, ctx);
        else
                ret = 0;
 end_trans:
@@ -7088,6 +7482,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
         * inconsistent state after a rename operation.
         */
        btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+       ASSERT(list_empty(&ctx.conflict_inodes));
 out:
        /*
         * If an error happened mark the log for a full commit because it's not
index 57ab5f3..aed1e05 100644 (file)
@@ -20,6 +20,7 @@ struct btrfs_log_ctx {
        int log_transid;
        bool log_new_dentries;
        bool logging_new_name;
+       bool logging_new_delayed_dentries;
        /* Indicate if the inode being logged was logged before. */
        bool logged_before;
        /* Tracks the last logged dir item/index key offset. */
@@ -28,6 +29,9 @@ struct btrfs_log_ctx {
        struct list_head list;
        /* Only used for fast fsyncs. */
        struct list_head ordered_extents;
+       struct list_head conflict_inodes;
+       int num_conflict_inodes;
+       bool logging_conflict_inodes;
 };
 
 static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -37,10 +41,14 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
        ctx->log_transid = 0;
        ctx->log_new_dentries = false;
        ctx->logging_new_name = false;
+       ctx->logging_new_delayed_dentries = false;
        ctx->logged_before = false;
        ctx->inode = inode;
        INIT_LIST_HEAD(&ctx->list);
        INIT_LIST_HEAD(&ctx->ordered_extents);
+       INIT_LIST_HEAD(&ctx->conflict_inodes);
+       ctx->num_conflict_inodes = 0;
+       ctx->logging_conflict_inodes = false;
 }
 
 static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
index 90eb5c2..ee00e33 100644 (file)
@@ -659,8 +659,7 @@ rollback:
  *
  * Returns the size on success or a negative error code on failure.
  */
-static int btrfs_get_verity_descriptor(struct inode *inode, void *buf,
-                                      size_t buf_size)
+int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
 {
        u64 true_size;
        int ret = 0;
index f63ff91..94ba46d 100644 (file)
@@ -34,6 +34,8 @@
 #include "discard.h"
 #include "zoned.h"
 
+static struct bio_set btrfs_bioset;
+
 #define BTRFS_BLOCK_GROUP_STRIPE_MASK  (BTRFS_BLOCK_GROUP_RAID0 | \
                                         BTRFS_BLOCK_GROUP_RAID10 | \
                                         BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -247,10 +249,10 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans);
 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
-                            enum btrfs_map_op op,
-                            u64 logical, u64 *length,
+                            enum btrfs_map_op op, u64 logical, u64 *length,
                             struct btrfs_io_context **bioc_ret,
-                            int mirror_num, int need_raid_map);
+                            struct btrfs_io_stripe *smap,
+                            int *mirror_num_ret, int need_raid_map);
 
 /*
  * Device locking
@@ -2017,7 +2019,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
                struct page *page;
                int ret;
 
-               disk_super = btrfs_read_dev_one_super(bdev, copy_num);
+               disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
                if (IS_ERR(disk_super))
                        continue;
 
@@ -5595,7 +5597,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
        if (ret)
                goto out;
 
-       bg->chunk_item_inserted = 1;
+       set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);
 
        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
@@ -5896,7 +5898,6 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
                sizeof(u64) * (total_stripes),
                GFP_NOFS|__GFP_NOFAIL);
 
-       atomic_set(&bioc->error, 0);
        refcount_set(&bioc->refs, 1);
 
        bioc->fs_info = fs_info;
@@ -6092,7 +6093,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
        int ret = 0;
 
        ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
-                               logical, &length, &bioc, 0, 0);
+                               logical, &length, &bioc, NULL, NULL, 0);
        if (ret) {
                ASSERT(bioc == NULL);
                return ret;
@@ -6153,9 +6154,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
 
        cache = btrfs_lookup_block_group(fs_info, logical);
 
-       spin_lock(&cache->lock);
-       ret = cache->to_copy;
-       spin_unlock(&cache->lock);
+       ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
 
        btrfs_put_block_group(cache);
        return ret;
@@ -6351,11 +6350,19 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
        return 0;
 }
 
+static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map,
+                         u32 stripe_index, u64 stripe_offset, u64 stripe_nr)
+{
+       dst->dev = map->stripes[stripe_index].dev;
+       dst->physical = map->stripes[stripe_index].physical +
+                       stripe_offset + stripe_nr * map->stripe_len;
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
-                            enum btrfs_map_op op,
-                            u64 logical, u64 *length,
+                            enum btrfs_map_op op, u64 logical, u64 *length,
                             struct btrfs_io_context **bioc_ret,
-                            int mirror_num, int need_raid_map)
+                            struct btrfs_io_stripe *smap,
+                            int *mirror_num_ret, int need_raid_map)
 {
        struct extent_map *em;
        struct map_lookup *map;
@@ -6366,6 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        int data_stripes;
        int i;
        int ret = 0;
+       int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
        int num_stripes;
        int max_errors = 0;
        int tgtdev_indexes = 0;
@@ -6526,6 +6534,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                tgtdev_indexes = num_stripes;
        }
 
+       /*
+        * If this I/O maps to a single device, try to return the device and
+        * physical block information on the stack instead of allocating an
+        * I/O context structure.
+        */
+       if (smap && num_alloc_stripes == 1 &&
+           !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) &&
+           (!need_full_stripe(op) || !dev_replace_is_ongoing ||
+            !dev_replace->tgtdev)) {
+               if (patch_the_first_stripe_for_dev_replace) {
+                       smap->dev = dev_replace->tgtdev;
+                       smap->physical = physical_to_patch_in_first_stripe;
+                       *mirror_num_ret = map->num_stripes + 1;
+               } else {
+                       set_io_stripe(smap, map, stripe_index, stripe_offset,
+                                     stripe_nr);
+                       *mirror_num_ret = mirror_num;
+               }
+               *bioc_ret = NULL;
+               ret = 0;
+               goto out;
+       }
+
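
The block added above short-circuits the common single-stripe case: the device and physical offset are written into the caller-provided btrfs_io_stripe and no btrfs_io_context is allocated. A standalone sketch of that pattern, returning the result on the caller's stack when one entry suffices and heap-allocating a context only for the multi-stripe case (hypothetical types, not the btrfs API):

#include <stdio.h>
#include <stdlib.h>

struct stripe { int dev; unsigned long long physical; };

struct io_context {                   /* heap-allocated only for multi-stripe I/O */
        int num_stripes;
        struct stripe stripes[];
};

/*
 * Map a logical range: if a single stripe is enough, fill *smap and return
 * NULL through *ctx_ret; otherwise allocate a context holding all stripes.
 */
static int map_block(int nr_stripes, struct stripe *smap,
                     struct io_context **ctx_ret)
{
        if (nr_stripes == 1) {
                smap->dev = 0;
                smap->physical = 4096;
                *ctx_ret = NULL;
                return 0;
        }

        *ctx_ret = malloc(sizeof(**ctx_ret) + nr_stripes * sizeof(struct stripe));
        if (!*ctx_ret)
                return -1;
        (*ctx_ret)->num_stripes = nr_stripes;
        for (int i = 0; i < nr_stripes; i++)
                (*ctx_ret)->stripes[i] = (struct stripe){ i, 4096ULL * (i + 1) };
        return 0;
}

int main(void)
{
        struct stripe smap;
        struct io_context *ctx;

        if (map_block(1, &smap, &ctx) == 0 && !ctx)
                printf("fast path: dev %d physical %llu\n", smap.dev, smap.physical);

        if (map_block(2, &smap, &ctx) == 0 && ctx) {
                printf("slow path: %d stripes\n", ctx->num_stripes);
                free(ctx);
        }
        return 0;
}
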
        bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes);
        if (!bioc) {
                ret = -ENOMEM;
@@ -6533,9 +6564,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        }
 
        for (i = 0; i < num_stripes; i++) {
-               bioc->stripes[i].physical = map->stripes[stripe_index].physical +
-                       stripe_offset + stripe_nr * map->stripe_len;
-               bioc->stripes[i].dev = map->stripes[stripe_index].dev;
+               set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset,
+                             stripe_nr);
                stripe_index++;
        }
 
@@ -6603,7 +6633,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                      struct btrfs_io_context **bioc_ret, int mirror_num)
 {
        return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
-                                mirror_num, 0);
+                                NULL, &mirror_num, 0);
 }
 
 /* For Scrub/replace */
@@ -6611,14 +6641,77 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
                     u64 logical, u64 *length,
                     struct btrfs_io_context **bioc_ret)
 {
-       return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1);
+       return __btrfs_map_block(fs_info, op, logical, length, bioc_ret,
+                                NULL, NULL, 1);
 }
 
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc)
+/*
+ * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
+ * is already initialized by the block layer.
+ */
+static inline void btrfs_bio_init(struct btrfs_bio *bbio,
+                                 btrfs_bio_end_io_t end_io, void *private)
+{
+       memset(bbio, 0, offsetof(struct btrfs_bio, bio));
+       bbio->end_io = end_io;
+       bbio->private = private;
+}
+
+/*
+ * Allocate a btrfs_bio structure.  The btrfs_bio is the main I/O container for
+ * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
+ *
+ * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
+ * a mempool.
+ */
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+                           btrfs_bio_end_io_t end_io, void *private)
 {
-       if (bioc->orig_bio->bi_opf & REQ_META)
-               return bioc->fs_info->endio_meta_workers;
-       return bioc->fs_info->endio_workers;
+       struct bio *bio;
+
+       bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
+       btrfs_bio_init(btrfs_bio(bio), end_io, private);
+       return bio;
+}
+
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+                                   btrfs_bio_end_io_t end_io, void *private)
+{
+       struct bio *bio;
+       struct btrfs_bio *bbio;
+
+       ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
+
+       bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
+       bbio = btrfs_bio(bio);
+       btrfs_bio_init(bbio, end_io, private);
+
+       bio_trim(bio, offset >> 9, size >> 9);
+       bbio->iter = bio->bi_iter;
+       return bio;
+}
+
+static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+{
+       if (!dev || !dev->bdev)
+               return;
+       if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
+               return;
+
+       if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+       if (!(bio->bi_opf & REQ_RAHEAD))
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+       if (bio->bi_opf & REQ_PREFLUSH)
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
+}
+
+static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
+                                               struct bio *bio)
+{
+       if (bio->bi_opf & REQ_META)
+               return fs_info->endio_meta_workers;
+       return fs_info->endio_workers;
 }
 
 static void btrfs_end_bio_work(struct work_struct *work)
@@ -6626,103 +6719,101 @@ static void btrfs_end_bio_work(struct work_struct *work)
        struct btrfs_bio *bbio =
                container_of(work, struct btrfs_bio, end_io_work);
 
-       bio_endio(&bbio->bio);
+       bbio->end_io(bbio);
 }
 
-static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async)
+static void btrfs_simple_end_io(struct bio *bio)
 {
-       struct bio *orig_bio = bioc->orig_bio;
-       struct btrfs_bio *bbio = btrfs_bio(orig_bio);
+       struct btrfs_fs_info *fs_info = bio->bi_private;
+       struct btrfs_bio *bbio = btrfs_bio(bio);
 
-       bbio->mirror_num = bioc->mirror_num;
-       orig_bio->bi_private = bioc->private;
-       orig_bio->bi_end_io = bioc->end_io;
+       btrfs_bio_counter_dec(fs_info);
 
-       /*
-        * Only send an error to the higher layers if it is beyond the tolerance
-        * threshold.
-        */
-       if (atomic_read(&bioc->error) > bioc->max_errors)
-               orig_bio->bi_status = BLK_STS_IOERR;
-       else
-               orig_bio->bi_status = BLK_STS_OK;
+       if (bio->bi_status)
+               btrfs_log_dev_io_error(bio, bbio->device);
 
-       if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) {
+       if (bio_op(bio) == REQ_OP_READ) {
                INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
-               queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work);
+               queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
        } else {
-               bio_endio(orig_bio);
+               bbio->end_io(bbio);
        }
+}
+
+static void btrfs_raid56_end_io(struct bio *bio)
+{
+       struct btrfs_io_context *bioc = bio->bi_private;
+       struct btrfs_bio *bbio = btrfs_bio(bio);
+
+       btrfs_bio_counter_dec(bioc->fs_info);
+       bbio->mirror_num = bioc->mirror_num;
+       bbio->end_io(bbio);
 
        btrfs_put_bioc(bioc);
 }
 
-static void btrfs_end_bio(struct bio *bio)
+static void btrfs_orig_write_end_io(struct bio *bio)
 {
        struct btrfs_io_stripe *stripe = bio->bi_private;
        struct btrfs_io_context *bioc = stripe->bioc;
+       struct btrfs_bio *bbio = btrfs_bio(bio);
+
+       btrfs_bio_counter_dec(bioc->fs_info);
 
        if (bio->bi_status) {
                atomic_inc(&bioc->error);
-               if (bio->bi_status == BLK_STS_IOERR ||
-                   bio->bi_status == BLK_STS_TARGET) {
-                       if (btrfs_op(bio) == BTRFS_MAP_WRITE)
-                               btrfs_dev_stat_inc_and_print(stripe->dev,
-                                               BTRFS_DEV_STAT_WRITE_ERRS);
-                       else if (!(bio->bi_opf & REQ_RAHEAD))
-                               btrfs_dev_stat_inc_and_print(stripe->dev,
-                                               BTRFS_DEV_STAT_READ_ERRS);
-                       if (bio->bi_opf & REQ_PREFLUSH)
-                               btrfs_dev_stat_inc_and_print(stripe->dev,
-                                               BTRFS_DEV_STAT_FLUSH_ERRS);
-               }
+               btrfs_log_dev_io_error(bio, stripe->dev);
        }
 
-       if (bio != bioc->orig_bio)
-               bio_put(bio);
+       /*
+        * Only send an error to the higher layers if it is beyond the tolerance
+        * threshold.
+        */
+       if (atomic_read(&bioc->error) > bioc->max_errors)
+               bio->bi_status = BLK_STS_IOERR;
+       else
+               bio->bi_status = BLK_STS_OK;
 
-       btrfs_bio_counter_dec(bioc->fs_info);
-       if (atomic_dec_and_test(&bioc->stripes_pending))
-               btrfs_end_bioc(bioc, true);
+       bbio->end_io(bbio);
+       btrfs_put_bioc(bioc);
 }
 
-static void submit_stripe_bio(struct btrfs_io_context *bioc,
-                             struct bio *orig_bio, int dev_nr, bool clone)
+static void btrfs_clone_write_end_io(struct bio *bio)
 {
-       struct btrfs_fs_info *fs_info = bioc->fs_info;
-       struct btrfs_device *dev = bioc->stripes[dev_nr].dev;
-       u64 physical = bioc->stripes[dev_nr].physical;
-       struct bio *bio;
+       struct btrfs_io_stripe *stripe = bio->bi_private;
 
+       if (bio->bi_status) {
+               atomic_inc(&stripe->bioc->error);
+               btrfs_log_dev_io_error(bio, stripe->dev);
+       }
+
+       /* Pass on control to the original bio this one was cloned from */
+       bio_endio(stripe->bioc->orig_bio);
+       bio_put(bio);
+}
+
+static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
+{
        if (!dev || !dev->bdev ||
            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
-           (btrfs_op(orig_bio) == BTRFS_MAP_WRITE &&
+           (btrfs_op(bio) == BTRFS_MAP_WRITE &&
             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
-               atomic_inc(&bioc->error);
-               if (atomic_dec_and_test(&bioc->stripes_pending))
-                       btrfs_end_bioc(bioc, false);
+               bio_io_error(bio);
                return;
        }
 
-       if (clone) {
-               bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set);
-       } else {
-               bio = orig_bio;
-               bio_set_dev(bio, dev->bdev);
-               btrfs_bio(bio)->device = dev;
-       }
+       bio_set_dev(bio, dev->bdev);
 
-       bioc->stripes[dev_nr].bioc = bioc;
-       bio->bi_private = &bioc->stripes[dev_nr];
-       bio->bi_end_io = btrfs_end_bio;
-       bio->bi_iter.bi_sector = physical >> 9;
        /*
         * For zone append writing, bi_sector must point to the beginning of the
         * zone
         */
        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+               u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
                if (btrfs_dev_is_sequential(dev, physical)) {
-                       u64 zone_start = round_down(physical, fs_info->zone_size);
+                       u64 zone_start = round_down(physical,
+                                                   dev->fs_info->zone_size);
 
                        bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
                } else {
@@ -6730,50 +6821,53 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc,
                        bio->bi_opf |= REQ_OP_WRITE;
                }
        }
-       btrfs_debug_in_rcu(fs_info,
+       btrfs_debug_in_rcu(dev->fs_info,
        "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
                __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
                dev->devid, bio->bi_iter.bi_size);
 
-       btrfs_bio_counter_inc_noblocked(fs_info);
-
        btrfsic_check_bio(bio);
        submit_bio(bio);
 }
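
The zone-append branch above rewrites bi_sector to the start of the zone containing the target physical address, i.e. round_down(physical, zone_size). A tiny worked example of that arithmetic, with an assumed 256 MiB zone size:

#include <stdio.h>

#define SECTOR_SHIFT 9

/* Round down to a power-of-two boundary, as the kernel macro does. */
static unsigned long long round_down_pow2(unsigned long long x,
                                          unsigned long long align)
{
        return x & ~(align - 1);
}

int main(void)
{
        unsigned long long zone_size = 256ULL << 20;                  /* assumed zone size */
        unsigned long long physical = (256ULL << 20) + (5ULL << 20);  /* 5 MiB into zone 1 */
        unsigned long long zone_start = round_down_pow2(physical, zone_size);

        printf("physical sector %llu -> zone start sector %llu\n",
               physical >> SECTOR_SHIFT, zone_start >> SECTOR_SHIFT);
        return 0;
}
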
 
+static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
+{
+       struct bio *orig_bio = bioc->orig_bio, *bio;
+
+       ASSERT(bio_op(orig_bio) != REQ_OP_READ);
+
+       /* Reuse the bio embedded into the btrfs_bio for the last mirror */
+       if (dev_nr == bioc->num_stripes - 1) {
+               bio = orig_bio;
+               bio->bi_end_io = btrfs_orig_write_end_io;
+       } else {
+               bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
+               bio_inc_remaining(orig_bio);
+               bio->bi_end_io = btrfs_clone_write_end_io;
+       }
+
+       bio->bi_private = &bioc->stripes[dev_nr];
+       bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
+       bioc->stripes[dev_nr].bioc = bioc;
+       btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
+}
+
 void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
 {
        u64 logical = bio->bi_iter.bi_sector << 9;
        u64 length = bio->bi_iter.bi_size;
        u64 map_length = length;
-       int ret;
-       int dev_nr;
-       int total_devs;
        struct btrfs_io_context *bioc = NULL;
+       struct btrfs_io_stripe smap;
+       int ret;
 
        btrfs_bio_counter_inc_blocked(fs_info);
-       ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
-                               &map_length, &bioc, mirror_num, 1);
+       ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+                               &bioc, &smap, &mirror_num, 1);
        if (ret) {
                btrfs_bio_counter_dec(fs_info);
-               bio->bi_status = errno_to_blk_status(ret);
-               bio_endio(bio);
-               return;
-       }
-
-       total_devs = bioc->num_stripes;
-       bioc->orig_bio = bio;
-       bioc->private = bio->bi_private;
-       bioc->end_io = bio->bi_end_io;
-       atomic_set(&bioc->stripes_pending, total_devs);
-
-       if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
-           ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
-               if (btrfs_op(bio) == BTRFS_MAP_WRITE)
-                       raid56_parity_write(bio, bioc);
-               else
-                       raid56_parity_recover(bio, bioc, mirror_num, true);
+               btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
                return;
        }
 
@@ -6784,12 +6878,31 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror
                BUG();
        }
 
-       for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
-               const bool should_clone = (dev_nr < total_devs - 1);
+       if (!bioc) {
+               /* Single mirror read/write fast path */
+               btrfs_bio(bio)->mirror_num = mirror_num;
+               btrfs_bio(bio)->device = smap.dev;
+               bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
+               bio->bi_private = fs_info;
+               bio->bi_end_io = btrfs_simple_end_io;
+               btrfs_submit_dev_bio(smap.dev, bio);
+       } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               /* Parity RAID write or read recovery */
+               bio->bi_private = bioc;
+               bio->bi_end_io = btrfs_raid56_end_io;
+               if (bio_op(bio) == REQ_OP_READ)
+                       raid56_parity_recover(bio, bioc, mirror_num);
+               else
+                       raid56_parity_write(bio, bioc);
+       } else {
+               /* Write to multiple mirrors */
+               int total_devs = bioc->num_stripes;
+               int dev_nr;
 
-               submit_stripe_bio(bioc, bio, dev_nr, should_clone);
+               bioc->orig_bio = bio;
+               for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
+                       btrfs_submit_mirrored_bio(bioc, dev_nr);
        }
-       btrfs_bio_counter_dec(fs_info);
 }
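
In the mirrored-write branch the original bio is reused for the last stripe while the other stripes get clones, and bio_inc_remaining() keeps the original from completing until every clone has passed control back (see btrfs_clone_write_end_io above). A small userspace model of that one-logical-completion-over-many-sub-I/Os counting scheme, not real bio semantics:

#include <stdio.h>

struct logical_io {
        int remaining;                /* outstanding references, starts at 1 */
        int errors;
        void (*done)(struct logical_io *io);
};

static void endio(struct logical_io *io, int error)
{
        if (error)
                io->errors++;
        if (--io->remaining == 0)     /* last completion finishes the logical I/O */
                io->done(io);
}

static void submit_sub_io(struct logical_io *io)
{
        io->remaining++;              /* like bio_inc_remaining() on the original */
        /* ... hand the sub-I/O to a device; its completion calls endio() ... */
}

static void report(struct logical_io *io)
{
        printf("logical I/O done, %d sub-I/O error(s)\n", io->errors);
}

int main(void)
{
        struct logical_io io = { .remaining = 1, .done = report };

        submit_sub_io(&io);           /* mirror 0 (clone) */
        submit_sub_io(&io);           /* mirror 1 (clone) */

        endio(&io, 0);                /* mirror 0 completes */
        endio(&io, 1);                /* mirror 1 completes with an error */
        endio(&io, 0);                /* the submission path drops its own reference */
        return 0;
}
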
 
 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
@@ -8244,7 +8357,7 @@ static int relocating_repair_kthread(void *data)
        if (!cache)
                goto out;
 
-       if (!cache->relocating_repair)
+       if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
                goto out;
 
        ret = btrfs_may_alloc_data_chunk(fs_info, target);
@@ -8281,17 +8394,27 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
        if (!cache)
                return true;
 
-       spin_lock(&cache->lock);
-       if (cache->relocating_repair) {
-               spin_unlock(&cache->lock);
+       if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
                btrfs_put_block_group(cache);
                return true;
        }
-       cache->relocating_repair = 1;
-       spin_unlock(&cache->lock);
 
        kthread_run(relocating_repair_kthread, cache,
                    "btrfs-relocating-repair");
 
        return true;
 }
+
+int __init btrfs_bioset_init(void)
+{
+       if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
+                       offsetof(struct btrfs_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -ENOMEM;
+       return 0;
+}
+
+void __cold btrfs_bioset_exit(void)
+{
+       bioset_exit(&btrfs_bioset);
+}
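
btrfs_bioset_init() sizes the pool with offsetof(struct btrfs_bio, bio), so every bio allocated from it carries the btrfs-private fields directly in front of the embedded bio, and btrfs_bio() gets back to them with container_of(). A standalone illustration of that front-padding idiom, using generic structs rather than the kernel ones:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct bio {                          /* stand-in for the block layer object */
        unsigned long long sector;
};

struct wrapped_bio {                  /* private fields placed in front of the bio */
        int mirror_num;
        void *private;
        struct bio bio;               /* must stay the last member */
};

/* Allocate "front_pad" extra bytes in front of the bio, as a bioset would. */
static struct bio *alloc_bio_with_front_pad(size_t front_pad)
{
        char *mem = calloc(1, front_pad + sizeof(struct bio));

        return mem ? (struct bio *)(mem + front_pad) : NULL;
}

int main(void)
{
        size_t front_pad = offsetof(struct wrapped_bio, bio);
        struct bio *bio = alloc_bio_with_front_pad(front_pad);
        struct wrapped_bio *wbio;

        if (!bio)
                return 1;
        wbio = container_of(bio, struct wrapped_bio, bio);
        wbio->mirror_num = 1;
        bio->sector = 128;
        printf("mirror %d at sector %llu\n", wbio->mirror_num, bio->sector);
        free((char *)bio - front_pad);
        return 0;
}
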
index 5639961..599b9d5 100644 (file)
@@ -181,6 +181,31 @@ struct btrfs_device {
 };
 
 /*
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is; we just need a quick check whether
+ * the object exists in the rbtree.
+ */
+struct btrfs_swapfile_pin {
+       struct rb_node node;
+       void *ptr;
+       struct inode *inode;
+       /*
+        * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+        * points to a struct btrfs_device.
+        */
+       bool is_block_group;
+       /*
+        * Only used when 'is_block_group' is true and it is the number of
+        * extents used by a swapfile for this block group ('ptr' field).
+        */
+       int bg_extent_count;
+};
+
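
As the comment above says, swapfile pins are keyed purely on the (ptr, inode) pointer values, since only existence matters. A userspace sketch of such a comparator, suitable for a sorted tree or array of pins (toy types, not the kernel rbtree API):

#include <stdint.h>
#include <stdio.h>

struct pin {
        void *ptr;                    /* block group or device, compared by address only */
        void *inode;                  /* the swapfile's inode, also compared by address */
};

/* Order pins by (ptr, inode) using the addresses themselves as the key. */
static int pin_cmp(const struct pin *a, const struct pin *b)
{
        if (a->ptr != b->ptr)
                return (uintptr_t)a->ptr < (uintptr_t)b->ptr ? -1 : 1;
        if (a->inode != b->inode)
                return (uintptr_t)a->inode < (uintptr_t)b->inode ? -1 : 1;
        return 0;
}

int main(void)
{
        int bg, dev, inode1, inode2;  /* dummy objects, only their addresses matter */
        struct pin a = { &bg, &inode1 };
        struct pin b = { &bg, &inode2 };
        struct pin c = { &dev, &inode1 };

        printf("a vs b: %d, a vs c: %d, a vs a: %d\n",
               pin_cmp(&a, &b), pin_cmp(&a, &c), pin_cmp(&a, &a));
        return 0;
}
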
+/*
  * If we read those variants at the context of their own lock, we needn't
  * use the following helpers, reading them directly is safe.
  */
@@ -361,6 +386,8 @@ struct btrfs_fs_devices {
  */
 #define BTRFS_MAX_BIO_SECTORS                          (256)
 
+typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
+
 /*
  * Additional info to pass along bio.
  *
@@ -378,6 +405,10 @@ struct btrfs_bio {
        u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
        struct bvec_iter iter;
 
+       /* End I/O information supplied to btrfs_bio_alloc */
+       btrfs_bio_end_io_t end_io;
+       void *private;
+
        /* For read end I/O handling */
        struct work_struct end_io_work;
 
@@ -393,6 +424,20 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
        return container_of(bio, struct btrfs_bio, bio);
 }
 
+int __init btrfs_bioset_init(void);
+void __cold btrfs_bioset_exit(void);
+
+struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
+                           btrfs_bio_end_io_t end_io, void *private);
+struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
+                                   btrfs_bio_end_io_t end_io, void *private);
+
+static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
+{
+       bbio->bio.bi_status = status;
+       bbio->end_io(bbio);
+}
+
 static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
 {
        if (bbio->csum != bbio->csum_inline) {
@@ -451,12 +496,9 @@ struct btrfs_discard_stripe {
  */
 struct btrfs_io_context {
        refcount_t refs;
-       atomic_t stripes_pending;
        struct btrfs_fs_info *fs_info;
        u64 map_type; /* get from map_lookup->type */
-       bio_end_io_t *end_io;
        struct bio *orig_bio;
-       void *private;
        atomic_t error;
        int max_errors;
        int num_stripes;
@@ -714,4 +756,6 @@ const char *btrfs_bg_type_to_raid_name(u64 flags);
 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
 
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+
 #endif
index 73c6929..e2d073b 100644 (file)
@@ -652,80 +652,55 @@ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
        return 0;
 }
 
+static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_device *device;
+
+       list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+               if (device->bdev &&
+                   bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+                       btrfs_err(fs_info,
+                               "zoned: mode not enabled but zoned device found: %pg",
+                               device->bdev);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 {
-       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
-       u64 zoned_devices = 0;
-       u64 nr_devices = 0;
        u64 zone_size = 0;
        u64 max_zone_append_size = 0;
-       const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
-       int ret = 0;
+       int ret;
 
-       /* Count zoned devices */
-       list_for_each_entry(device, &fs_devices->devices, dev_list) {
-               enum blk_zoned_model model;
+       /*
+        * Host-Managed devices can't be used without the ZONED flag.  With the
+        * ZONED flag all devices can be used, using zone emulation if required.
+        */
+       if (!btrfs_fs_incompat(fs_info, ZONED))
+               return btrfs_check_for_zoned_device(fs_info);
+
+       list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+               struct btrfs_zoned_device_info *zone_info = device->zone_info;
 
                if (!device->bdev)
                        continue;
 
-               model = bdev_zoned_model(device->bdev);
-               /*
-                * A Host-Managed zoned device must be used as a zoned device.
-                * A Host-Aware zoned device and a non-zoned devices can be
-                * treated as a zoned device, if ZONED flag is enabled in the
-                * superblock.
-                */
-               if (model == BLK_ZONED_HM ||
-                   (model == BLK_ZONED_HA && incompat_zoned) ||
-                   (model == BLK_ZONED_NONE && incompat_zoned)) {
-                       struct btrfs_zoned_device_info *zone_info;
-
-                       zone_info = device->zone_info;
-                       zoned_devices++;
-                       if (!zone_size) {
-                               zone_size = zone_info->zone_size;
-                       } else if (zone_info->zone_size != zone_size) {
-                               btrfs_err(fs_info,
+               if (!zone_size) {
+                       zone_size = zone_info->zone_size;
+               } else if (zone_info->zone_size != zone_size) {
+                       btrfs_err(fs_info,
                "zoned: unequal block device zone sizes: have %llu found %llu",
-                                         device->zone_info->zone_size,
-                                         zone_size);
-                               ret = -EINVAL;
-                               goto out;
-                       }
-                       if (!max_zone_append_size ||
-                           (zone_info->max_zone_append_size &&
-                            zone_info->max_zone_append_size < max_zone_append_size))
-                               max_zone_append_size =
-                                       zone_info->max_zone_append_size;
+                                 zone_info->zone_size, zone_size);
+                       return -EINVAL;
                }
-               nr_devices++;
-       }
-
-       if (!zoned_devices && !incompat_zoned)
-               goto out;
-
-       if (!zoned_devices && incompat_zoned) {
-               /* No zoned block device found on ZONED filesystem */
-               btrfs_err(fs_info,
-                         "zoned: no zoned devices found on a zoned filesystem");
-               ret = -EINVAL;
-               goto out;
-       }
-
-       if (zoned_devices && !incompat_zoned) {
-               btrfs_err(fs_info,
-                         "zoned: mode not enabled but zoned device found");
-               ret = -EINVAL;
-               goto out;
-       }
-
-       if (zoned_devices != nr_devices) {
-               btrfs_err(fs_info,
-                         "zoned: cannot mix zoned and regular devices");
-               ret = -EINVAL;
-               goto out;
+               if (!max_zone_append_size ||
+                   (zone_info->max_zone_append_size &&
+                    zone_info->max_zone_append_size < max_zone_append_size))
+                       max_zone_append_size = zone_info->max_zone_append_size;
        }
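
The rewritten loop enforces two invariants across zoned devices: all of them must report the same zone size, and the filesystem-wide limit becomes the smallest non-zero max_zone_append_size. A standalone sketch of the same validation over a plain array, with made-up numbers:

#include <stdio.h>

struct zdev {
        unsigned long long zone_size;
        unsigned long long max_zone_append_size;  /* 0 means "no limit reported" */
};

static int check_zoned(const struct zdev *devs, int n,
                       unsigned long long *zone_size_out,
                       unsigned long long *max_append_out)
{
        unsigned long long zone_size = 0, max_append = 0;

        for (int i = 0; i < n; i++) {
                if (!zone_size) {
                        zone_size = devs[i].zone_size;
                } else if (devs[i].zone_size != zone_size) {
                        fprintf(stderr, "unequal zone sizes: %llu vs %llu\n",
                                devs[i].zone_size, zone_size);
                        return -1;
                }
                if (!max_append ||
                    (devs[i].max_zone_append_size &&
                     devs[i].max_zone_append_size < max_append))
                        max_append = devs[i].max_zone_append_size;
        }
        *zone_size_out = zone_size;
        *max_append_out = max_append;
        return 0;
}

int main(void)
{
        struct zdev devs[] = {
                { 256ULL << 20, 1 << 20 },
                { 256ULL << 20, 512 << 10 },
        };
        unsigned long long zs, mas;

        if (!check_zoned(devs, 2, &zs, &mas))
                printf("zone size %llu, max append %llu\n", zs, mas);
        return 0;
}
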
 
        /*
@@ -737,14 +712,12 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
                btrfs_err(fs_info,
                          "zoned: zone size %llu not aligned to stripe %u",
                          zone_size, BTRFS_STRIPE_LEN);
-               ret = -EINVAL;
-               goto out;
+               return -EINVAL;
        }
 
        if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
                btrfs_err(fs_info, "zoned: mixed block groups not supported");
-               ret = -EINVAL;
-               goto out;
+               return -EINVAL;
        }
 
        fs_info->zone_size = zone_size;
@@ -760,11 +733,10 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
         */
        ret = btrfs_check_mountopts_zoned(fs_info);
        if (ret)
-               goto out;
+               return ret;
 
        btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
-out:
-       return ret;
+       return 0;
 }
 
 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
@@ -1436,7 +1408,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                        goto out;
                } else if (map->num_stripes == num_conventional) {
                        cache->alloc_offset = last_alloc;
-                       cache->zone_is_active = 1;
+                       set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
                        goto out;
                }
        }
@@ -1452,7 +1424,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                }
                cache->alloc_offset = alloc_offsets[0];
                cache->zone_capacity = caps[0];
-               cache->zone_is_active = test_bit(0, active);
+               if (test_bit(0, active))
+                       set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
                break;
        case BTRFS_BLOCK_GROUP_DUP:
                if (map->type & BTRFS_BLOCK_GROUP_DATA) {
@@ -1486,7 +1459,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                                goto out;
                        }
                } else {
-                       cache->zone_is_active = test_bit(0, active);
+                       if (test_bit(0, active))
+                               set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+                                       &cache->runtime_flags);
                }
                cache->alloc_offset = alloc_offsets[0];
                cache->zone_capacity = min(caps[0], caps[1]);
@@ -1530,7 +1505,7 @@ out:
 
        if (!ret) {
                cache->meta_write_pointer = cache->alloc_offset + cache->start;
-               if (cache->zone_is_active) {
+               if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
                        btrfs_get_block_group(cache);
                        spin_lock(&fs_info->zone_active_bgs_lock);
                        list_add_tail(&cache->active_bg_list,
@@ -1563,7 +1538,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
        free = cache->zone_capacity - cache->alloc_offset;
 
        /* We only need ->free_space in ALLOC_SEQ block groups */
-       cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
        cache->free_space_ctl->free_space = free;
        cache->zone_unusable = unusable;
@@ -1871,7 +1845,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
 
        spin_lock(&space_info->lock);
        spin_lock(&block_group->lock);
-       if (block_group->zone_is_active) {
+       if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
                ret = true;
                goto out_unlock;
        }
@@ -1897,7 +1871,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
        }
 
        /* Successfully activated all the zones */
-       block_group->zone_is_active = 1;
+       set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
        space_info->active_total_bytes += block_group->length;
        spin_unlock(&block_group->lock);
        btrfs_try_granting_tickets(fs_info, space_info);
@@ -1960,7 +1934,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
        int i;
 
        spin_lock(&block_group->lock);
-       if (!block_group->zone_is_active) {
+       if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
                spin_unlock(&block_group->lock);
                return 0;
        }
@@ -2001,7 +1975,8 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
                 * Bail out if someone already deactivated the block group, or
                 * allocated space is left in the block group.
                 */
-               if (!block_group->zone_is_active) {
+               if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+                             &block_group->runtime_flags)) {
                        spin_unlock(&block_group->lock);
                        btrfs_dec_block_group_ro(block_group);
                        return 0;
@@ -2014,7 +1989,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
                }
        }
 
-       block_group->zone_is_active = 0;
+       clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
        block_group->alloc_offset = block_group->zone_capacity;
        block_group->free_space_ctl->free_space = 0;
        btrfs_clear_treelog_bg(block_group);
@@ -2222,13 +2197,14 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
        ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
 
        spin_lock(&block_group->lock);
-       if (!block_group->zoned_data_reloc_ongoing)
+       if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
                goto out;
 
        /* All relocation extents are written. */
        if (block_group->start + block_group->alloc_offset == logical + length) {
                /* Now, release this block group for further allocations. */
-               block_group->zoned_data_reloc_ongoing = 0;
+               clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+                         &block_group->runtime_flags);
        }
 
 out:
@@ -2300,7 +2276,9 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
                                            list) {
                                if (!spin_trylock(&bg->lock))
                                        continue;
-                               if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) {
+                               if (btrfs_zoned_bg_is_full(bg) ||
+                                   test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+                                            &bg->runtime_flags)) {
                                        spin_unlock(&bg->lock);
                                        continue;
                                }
index 629785c..dbe1ce5 100644 (file)
@@ -70,8 +70,6 @@ struct fsverity_info {
        const struct inode *inode;
 };
 
-/* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
-#define FS_VERITY_MAX_DESCRIPTOR_SIZE  16384
 
 #define FS_VERITY_MAX_SIGNATURE_SIZE   (FS_VERITY_MAX_DESCRIPTOR_SIZE - \
                                         sizeof(struct fsverity_descriptor))
index 7af030f..40f14e5 100644 (file)
@@ -22,6 +22,9 @@
  */
 #define FS_VERITY_MAX_DIGEST_SIZE      SHA512_DIGEST_SIZE
 
+/* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
+#define FS_VERITY_MAX_DESCRIPTOR_SIZE  16384
+
 /* Verity operations for filesystems */
 struct fsverity_operations {
 
index 73df80d..ed50e81 100644 (file)
@@ -84,7 +84,6 @@ struct raid56_bio_trace_info;
        EM( IO_TREE_FS_EXCLUDED_EXTENTS,  "EXCLUDED_EXTENTS")       \
        EM( IO_TREE_BTREE_INODE_IO,       "BTREE_INODE_IO")         \
        EM( IO_TREE_INODE_IO,             "INODE_IO")               \
-       EM( IO_TREE_INODE_IO_FAILURE,     "INODE_IO_FAILURE")       \
        EM( IO_TREE_RELOC_BLOCKS,         "RELOC_BLOCKS")           \
        EM( IO_TREE_TRANS_DIRTY_PAGES,    "TRANS_DIRTY_PAGES")      \
        EM( IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES")   \
@@ -154,7 +153,6 @@ FLUSH_STATES
        { EXTENT_NODATASUM,             "NODATASUM"},           \
        { EXTENT_CLEAR_META_RESV,       "CLEAR_META_RESV"},     \
        { EXTENT_NEED_WAIT,             "NEED_WAIT"},           \
-       { EXTENT_DAMAGED,               "DAMAGED"},             \
        { EXTENT_NORESERVE,             "NORESERVE"},           \
        { EXTENT_QGROUP_RESERVED,       "QGROUP_RESERVED"},     \
        { EXTENT_CLEAR_DATA_RESV,       "CLEAR_DATA_RESV"},     \
index 7ada84e..5655e89 100644 (file)
@@ -290,6 +290,12 @@ struct btrfs_ioctl_fs_info_args {
 #define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID  (1ULL << 1)
 #define BTRFS_FEATURE_COMPAT_RO_VERITY                 (1ULL << 2)
 
+/*
+ * Put all block group items into a dedicated block group tree, greatly
+ * reducing mount time for large filesystems due to better locality.
+ */
+#define BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE       (1ULL << 3)
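
The new read-only compat bit can be queried from user space with the long-standing BTRFS_IOC_GET_FEATURES ioctl. A minimal sketch; the local #define is only a fallback in case the installed uapi header predates this flag, and /mnt/btrfs stands in for a real mount point:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

#ifndef BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE
#define BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE (1ULL << 3)
#endif

int main(void)
{
        struct btrfs_ioctl_feature_flags flags;
        int fd = open("/mnt/btrfs", O_RDONLY);    /* placeholder mount point */

        if (fd < 0 || ioctl(fd, BTRFS_IOC_GET_FEATURES, &flags) < 0) {
                perror("BTRFS_IOC_GET_FEATURES");
                return 1;
        }
        printf("block-group-tree: %s\n",
               (flags.compat_ro_flags & BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) ?
               "enabled" : "not enabled");
        close(fd);
        return 0;
}
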
+
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF   (1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS    (1ULL << 2)
index 5f32a2a..1f7a38e 100644 (file)
@@ -965,6 +965,10 @@ static inline __u16 btrfs_qgroup_level(__u64 qgroupid)
  */
 #define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT  (1ULL << 2)
 
+#define BTRFS_QGROUP_STATUS_FLAGS_MASK (BTRFS_QGROUP_STATUS_FLAG_ON |          \
+                                        BTRFS_QGROUP_STATUS_FLAG_RESCAN |      \
+                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)
+
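
The mask gathers every status flag the kernel defines, so code reading the on-disk qgroup status item can spot unknown bits; roughly, with local stand-in defines:

#include <stdio.h>

#define QG_ON           (1ULL << 0)
#define QG_RESCAN       (1ULL << 1)
#define QG_INCONSISTENT (1ULL << 2)
#define QG_FLAGS_MASK   (QG_ON | QG_RESCAN | QG_INCONSISTENT)

int main(void)
{
        unsigned long long on_disk = QG_ON | (1ULL << 6);   /* made-up unknown bit */

        if (on_disk & ~QG_FLAGS_MASK)
                printf("unknown qgroup status flags: 0x%llx\n",
                       on_disk & ~QG_FLAGS_MASK);
        return 0;
}
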
 #define BTRFS_QGROUP_STATUS_VERSION        1
 
 struct btrfs_qgroup_status_item {
index 032a7bf..7e9d8d8 100644 (file)
@@ -1933,6 +1933,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
        wb_put(wb);
        return ret;
 }
+EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);
 
 /**
  * balance_dirty_pages_ratelimited - balance dirty memory state.