}
/*
- * helper function for fiemap, which doesn't want to see any holes.
- * This maps until we find something past 'last'
- */
-static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
- u64 offset, u64 last)
-{
- u64 sectorsize = btrfs_inode_sectorsize(inode);
- struct extent_map *em;
- u64 len;
-
- if (offset >= last)
- return NULL;
-
- while (1) {
- len = last - offset;
- if (len == 0)
- break;
- len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(inode, offset, len);
- if (IS_ERR(em))
- return em;
-
- /* if this isn't a hole return it */
- if (em->block_start != EXTENT_MAP_HOLE)
- return em;
-
- /* this is a hole, advance to the next extent */
- offset = extent_map_end(em);
- free_extent_map(em);
- if (offset >= last)
- break;
- }
- return NULL;
-}
-
-/*
* To cache previous fiemap extent
*
* Will be used for merging fiemap extent
{
int ret = 0;
+ /* Set at the end of extent_fiemap(). */
+ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0);
+
if (!cache->cached)
goto assign;
* So truly compressed (physical size smaller than logical size)
* extents won't get merged with each other
*
- * 3) Share same flags except FIEMAP_EXTENT_LAST
- * So regular extent won't get merged with prealloc extent
+ * 3) Share same flags
*/
if (cache->offset + cache->len == offset &&
cache->phys + cache->len == phys &&
- (cache->flags & ~FIEMAP_EXTENT_LAST) ==
- (flags & ~FIEMAP_EXTENT_LAST)) {
+ cache->flags == flags) {
cache->len += len;
- cache->flags |= flags;
- goto try_submit_last;
+ return 0;
}
/* Not mergeable, need to submit cached one */
cache->phys = phys;
cache->len = len;
cache->flags = flags;
-try_submit_last:
- if (cache->flags & FIEMAP_EXTENT_LAST) {
- ret = fiemap_fill_next_extent(fieinfo, cache->offset,
- cache->phys, cache->len, cache->flags);
- cache->cached = false;
- }
- return ret;
+
+ return 0;
}
/*
return ret;
}
-int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
+static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *path)
{
- int ret = 0;
- u64 off;
- u64 max = start + len;
- u32 flags = 0;
- u32 found_type;
- u64 last;
- u64 last_for_get_extent = 0;
- u64 disko = 0;
- u64 isize = i_size_read(&inode->vfs_inode);
- struct btrfs_key found_key;
- struct extent_map *em = NULL;
- struct extent_state *cached_state = NULL;
- struct btrfs_path *path;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
+
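+ /* First check if the next item is still in the current (cloned) leaf. */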
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0]))
+ return 0;
+
+ ret = btrfs_next_leaf(inode->root, path);
+ if (ret != 0)
+ return ret;
+
+ /*
+ * Don't bother with cloning if there are no more file extent items for
+ * our inode.
+ */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
+
+ /* See the comment at fiemap_search_slot() about why we clone. */
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Search for the first file extent item that starts at a given file offset or
+ * the one that starts immediately before that offset.
+ * Returns: 0 on success, < 0 on error, 1 if not found.
+ */
+static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path,
+ u64 file_offset)
+{
+ const u64 ino = btrfs_ino(inode);
struct btrfs_root *root = inode->root;
- struct fiemap_cache cache = { 0 };
- struct btrfs_backref_shared_cache *backref_cache;
- struct ulist *roots;
- struct ulist *tmp_ulist;
- int end = 0;
- u64 em_start = 0;
- u64 em_len = 0;
- u64 em_end = 0;
+ struct extent_buffer *clone;
+ struct btrfs_key key;
+ int slot;
+ int ret;
- backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
- path = btrfs_alloc_path();
- roots = ulist_alloc(GFP_KERNEL);
- tmp_ulist = ulist_alloc(GFP_KERNEL);
- if (!backref_cache || !path || !roots || !tmp_ulist) {
- ret = -ENOMEM;
- goto out_free_ulist;
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = file_offset;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
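+ /*
+ * If we didn't find an exact match, the item that starts before
+ * file_offset, if any, is at the previous slot.
+ */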
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret != 0)
+ return ret;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ return 1;
}
/*
- * We can't initialize that to 'start' as this could miss extents due
- * to extent item merging
+ * We clone the leaf and use it during fiemap. This is because while
+ * using the leaf we do expensive things like checking if an extent is
+ * shared, which can take a long time. In order to prevent blocking
+ * other tasks for too long, we use a clone of the leaf. We have locked
+ * the file range in the inode's io tree, so we know none of our file
+ * extent items can change. This way we avoid blocking other tasks that
+ * want to insert items for other inodes in the same leaf or b+tree
+ * rebalance operations (triggered for example when someone is trying
+ * to push items into this leaf when trying to insert an item in a
+ * neighbour leaf).
+ * We also need the private clone because holding a read lock on an
+ * extent buffer of the subvolume's b+tree will make lockdep unhappy
+ * when we call fiemap_fill_next_extent(), because that may cause a page
+ * fault when filling the user space buffer with fiemap data.
*/
- off = 0;
- start = round_down(start, btrfs_inode_sectorsize(inode));
- len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
+ clone = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!clone)
+ return -ENOMEM;
+
+ slot = path->slots[0];
+ btrfs_release_path(path);
+ path->nodes[0] = clone;
+ path->slots[0] = slot;
+
+ return 0;
+}
+
+/*
+ * Process a range which is a hole or a prealloc extent in the inode's subvolume
+ * btree. If @disk_bytenr is 0, we are dealing with a hole, otherwise a prealloc
+ * extent. The end offset (@end) is inclusive.
+ */
+static int fiemap_process_hole(struct btrfs_inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ struct fiemap_cache *cache,
+ struct btrfs_backref_shared_cache *backref_cache,
+ u64 disk_bytenr, u64 extent_offset,
+ u64 extent_gen,
+ struct ulist *roots, struct ulist *tmp_ulist,
+ u64 start, u64 end)
+{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ u64 cur_offset = start;
+ u64 last_delalloc_end = 0;
+ u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN;
+ bool checked_extent_shared = false;
+ int ret;
/*
- * lookup the last file extent. We're not using i_size here
- * because there might be preallocation past i_size
+ * There can be no delalloc past i_size, so don't waste time looking for
+ * it beyond i_size.
*/
- ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
- 0);
- if (ret < 0) {
- goto out_free_ulist;
- } else {
- WARN_ON(!ret);
- if (ret == 1)
- ret = 0;
- }
+ while (cur_offset < end && cur_offset < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ u64 prealloc_start;
+ u64 prealloc_len = 0;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ break;
- path->slots[0]--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
- found_type = found_key.type;
-
- /* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(inode) ||
- found_type != BTRFS_EXTENT_DATA_KEY) {
- /* have to trust i_size as the end */
- last = (u64)-1;
- last_for_get_extent = isize;
- } else {
/*
- * remember the start of the last extent. There are a
- * bunch of different factors that go into the length of the
- * extent, so its much less complex to remember where it started
+ * If this is a prealloc extent we have to report every section
+ * of it that has no delalloc.
*/
- last = found_key.offset;
- last_for_get_extent = last + 1;
+ if (disk_bytenr != 0) {
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = delalloc_start - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = delalloc_start - prealloc_start;
+ }
+ }
+
+ if (prealloc_len > 0) {
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+
+ checked_extent_shared = true;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ extent_offset += prealloc_len;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, cache, delalloc_start, 0,
+ delalloc_end + 1 - delalloc_start,
+ FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ret)
+ return ret;
+
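+ /*
+ * Move past the delalloc range we just reported and, for a
+ * prealloc extent, advance the offset into the extent so the
+ * next piece we emit maps to the right disk location.
+ */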
+ last_delalloc_end = delalloc_end;
+ cur_offset = delalloc_end + 1;
+ extent_offset += cur_offset - delalloc_start;
+ cond_resched();
+ }
+
+ /*
+ * Either we found no delalloc for the whole prealloc extent or we have
+ * a prealloc extent that spans i_size or starts at or after i_size.
+ */
+ if (disk_bytenr != 0 && last_delalloc_end < end) {
+ u64 prealloc_start;
+ u64 prealloc_len;
+
+ if (last_delalloc_end == 0) {
+ prealloc_start = start;
+ prealloc_len = end + 1 - start;
+ } else {
+ prealloc_start = last_delalloc_end + 1;
+ prealloc_len = end + 1 - prealloc_start;
+ }
+
+ if (!checked_extent_shared && fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(inode->root,
+ ino, disk_bytenr,
+ extent_gen, roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ prealloc_flags |= FIEMAP_EXTENT_SHARED;
+ }
+ ret = emit_fiemap_extent(fieinfo, cache, prealloc_start,
+ disk_bytenr + extent_offset,
+ prealloc_len, prealloc_flags);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int fiemap_find_last_extent_offset(struct btrfs_inode *inode,
+ struct btrfs_path *path,
+ u64 *last_extent_end_ret)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_root *root = inode->root;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ int ret;
+
+ /*
+ * Lookup the last file extent. We're not using i_size here because
+ * there might be preallocation past i_size.
+ */
+ ret = btrfs_lookup_file_extent(NULL, root, path, ino, (u64)-1, 0);
+ /* There can't be a file extent item at offset (u64)-1 */
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * For a non-existing key, btrfs_search_slot() always leaves us at a
+ * slot > 0, except if the btree is empty, which is impossible because
+ * at least it has the inode item for this inode and all the items for
+ * the root inode 256.
+ */
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* No file extent items in the subvolume tree. */
+ *last_extent_end_ret = 0;
+ return 0;
}
- btrfs_release_path(path);
/*
- * we might have some extents allocated but more delalloc past those
- * extents. so, we trust isize unless the start of the last extent is
- * beyond isize
+ * For an inline extent, the disk_bytenr is where the inline data starts,
+ * so first check if we have an inline extent item before checking if we
+ * have an implicit hole (disk_bytenr == 0).
*/
- if (last < isize) {
- last = (u64)-1;
- last_for_get_extent = isize;
+ ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
}
- lock_extent_bits(&inode->io_tree, start, start + len - 1,
- &cached_state);
+ /*
+ * Find the last file extent item that is not a hole (when NO_HOLES is
+ * not enabled). This should take at most 2 iterations in the worst
+ * case: we have one hole file extent item at slot 0 of a leaf and
+ * another hole file extent item as the last item in the previous leaf.
+ * This is because we merge file extent items that represent holes.
+ */
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ while (disk_bytenr == 0) {
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /* No file extent items that are not holes. */
+ *last_extent_end_ret = 0;
+ return 0;
+ }
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
+ }
- em = get_extent_skip_holes(inode, start, last_for_get_extent);
- if (!em)
- goto out;
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ *last_extent_end_ret = btrfs_file_extent_end(path);
+ return 0;
+}
+
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ const u64 ino = btrfs_ino(inode);
+ struct extent_state *cached_state = NULL;
+ struct btrfs_path *path;
+ struct btrfs_root *root = inode->root;
+ struct fiemap_cache cache = { 0 };
+ struct btrfs_backref_shared_cache *backref_cache;
+ struct ulist *roots;
+ struct ulist *tmp_ulist;
+ u64 last_extent_end;
+ u64 prev_extent_end;
+ u64 lockstart;
+ u64 lockend;
+ bool stopped = false;
+ int ret;
+
+ backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL);
+ path = btrfs_alloc_path();
+ roots = ulist_alloc(GFP_KERNEL);
+ tmp_ulist = ulist_alloc(GFP_KERNEL);
+ if (!backref_cache || !path || !roots || !tmp_ulist) {
+ ret = -ENOMEM;
goto out;
}
- while (!end) {
- u64 offset_in_extent = 0;
+ lockstart = round_down(start, btrfs_inode_sectorsize(inode));
+ lockend = round_up(start + len, btrfs_inode_sectorsize(inode));
+ prev_extent_end = lockstart;
- /* break if the extent we found is outside the range */
- if (em->start >= max || extent_map_end(em) < off)
- break;
+ lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
- /*
- * get_extent may return an extent that starts before our
- * requested range. We have to make sure the ranges
- * we return to fiemap always move forward and don't
- * overlap, so adjust the offsets here
- */
- em_start = max(em->start, off);
+ ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
+ if (ret < 0)
+ goto out_unlock;
+ btrfs_release_path(path);
+ path->reada = READA_FORWARD;
+ ret = fiemap_search_slot(inode, path, lockstart);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
/*
- * record the offset from the start of the extent
- * for adjusting the disk offset below. Only do this if the
- * extent isn't compressed since our in ram offset may be past
- * what we have actually allocated on disk.
+ * No file extent item found, but we may have delalloc between
+ * the current offset and i_size. So check for that.
*/
- if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- offset_in_extent = em_start - em->start;
- em_end = extent_map_end(em);
- em_len = em_end - em_start;
- flags = 0;
- if (em->block_start < EXTENT_MAP_LAST_BYTE)
- disko = em->block_start + offset_in_extent;
- else
- disko = 0;
+ ret = 0;
+ goto check_eof_delalloc;
+ }
+
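+ /* Iterate over the file extent items that overlap the requested range. */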
+ while (prev_extent_end < lockend) {
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *ei;
+ struct btrfs_key key;
+ u64 extent_end;
+ u64 extent_len;
+ u64 extent_offset = 0;
+ u64 extent_gen;
+ u64 disk_bytenr = 0;
+ u64 flags = 0;
+ int extent_type;
+ u8 compression;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ extent_end = btrfs_file_extent_end(path);
/*
- * bump off for our next call to get_extent
+ * The first iteration can leave us at an extent item that ends
+ * before our range's start. Move to the next item.
*/
- off = extent_map_end(em);
- if (off >= max)
- end = 1;
-
- if (em->block_start == EXTENT_MAP_INLINE) {
- flags |= (FIEMAP_EXTENT_DATA_INLINE |
- FIEMAP_EXTENT_NOT_ALIGNED);
- } else if (em->block_start == EXTENT_MAP_DELALLOC) {
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- } else if (fieinfo->fi_extents_max) {
- u64 extent_gen;
- u64 bytenr = em->block_start -
- (em->start - em->orig_start);
+ if (extent_end <= lockstart)
+ goto next_item;
- /*
- * If two extent maps are merged, then their generation
- * is set to the maximum between their generations.
- * Otherwise its generation matches the one we have in
- * corresponding file extent item. If we have a merged
- * extent map, don't use its generation to speedup the
- * sharedness check below.
- */
- if (test_bit(EXTENT_FLAG_MERGED, &em->flags))
- extent_gen = 0;
- else
- extent_gen = em->generation;
+ /* We have an implicit hole (NO_HOLES feature enabled). */
+ if (prev_extent_end < key.offset) {
+ const u64 range_end = min(key.offset, lockend) - 1;
- /*
- * As btrfs supports shared space, this information
- * can be exported to userspace tools via
- * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
- * then we're just getting a count and we can skip the
- * lookup stuff.
- */
- ret = btrfs_is_data_extent_shared(root, btrfs_ino(inode),
- bytenr, extent_gen,
- roots, tmp_ulist,
- backref_cache);
- if (ret < 0)
- goto out_free;
- if (ret)
- flags |= FIEMAP_EXTENT_SHARED;
- ret = 0;
- }
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- flags |= FIEMAP_EXTENT_ENCODED;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ prev_extent_end, range_end);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
+ }
- free_extent_map(em);
- em = NULL;
- if ((em_start >= last) || em_len == (u64)-1 ||
- (last == (u64)-1 && isize <= em_end)) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+ /* We've reached the end of the fiemap range, stop. */
+ if (key.offset >= lockend) {
+ stopped = true;
+ break;
+ }
}
- /* now scan forward to see if this is really the last extent. */
- em = get_extent_skip_holes(inode, off, last_for_get_extent);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
+ extent_len = extent_end - key.offset;
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+ compression = btrfs_file_extent_compression(leaf, ei);
+ extent_type = btrfs_file_extent_type(leaf, ei);
+ extent_gen = btrfs_file_extent_generation(leaf, ei);
+
+ if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
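+ /*
+ * Only use the file extent's offset field for uncompressed
+ * extents: for compressed ones the extent on disk holds the
+ * whole compressed data, so an offset into the uncompressed
+ * data doesn't translate to a disk offset.
+ */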
+ if (compression == BTRFS_COMPRESS_NONE)
+ extent_offset = btrfs_file_extent_offset(leaf, ei);
}
- if (!em) {
- flags |= FIEMAP_EXTENT_LAST;
- end = 1;
+
+ if (compression != BTRFS_COMPRESS_NONE)
+ flags |= FIEMAP_EXTENT_ENCODED;
+
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset, 0,
+ extent_len, flags);
+ } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache,
+ disk_bytenr, extent_offset,
+ extent_gen, roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else if (disk_bytenr == 0) {
+ /* We have an explicit hole. */
+ ret = fiemap_process_hole(inode, fieinfo, &cache,
+ backref_cache, 0, 0, 0,
+ roots, tmp_ulist,
+ key.offset, extent_end - 1);
+ } else {
+ /* We have a regular extent. */
+ if (fieinfo->fi_extents_max) {
+ ret = btrfs_is_data_extent_shared(root, ino,
+ disk_bytenr,
+ extent_gen,
+ roots,
+ tmp_ulist,
+ backref_cache);
+ if (ret < 0)
+ goto out_unlock;
+ else if (ret > 0)
+ flags |= FIEMAP_EXTENT_SHARED;
+ }
+
+ ret = emit_fiemap_extent(fieinfo, &cache, key.offset,
+ disk_bytenr + extent_offset,
+ extent_len, flags);
}
- ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
- em_len, flags);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto out_free;
+
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* fiemap_fill_next_extent() told us to stop. */
+ stopped = true;
+ break;
}
+ prev_extent_end = extent_end;
+next_item:
if (fatal_signal_pending(current)) {
ret = -EINTR;
- goto out_free;
+ goto out_unlock;
}
+
+ ret = fiemap_next_leaf_item(inode, path);
+ if (ret < 0) {
+ goto out_unlock;
+ } else if (ret > 0) {
+ /* No more file extent items for this inode. */
+ break;
+ }
+ cond_resched();
}
-out_free:
- if (!ret)
- ret = emit_last_fiemap_cache(fieinfo, &cache);
- free_extent_map(em);
-out:
- unlock_extent_cached(&inode->io_tree, start, start + len - 1,
- &cached_state);
-out_free_ulist:
+check_eof_delalloc:
+ /*
+ * Release (and free) the path before emitting any final entries to
+ * fiemap_fill_next_extent() to keep lockdep happy. This is because
+ * once we find no more file extent items exist, we may have a
+ * non-cloned leaf, and fiemap_fill_next_extent() can trigger page
+ * faults when copying data to the user space buffer.
+ */
+ btrfs_free_path(path);
+ path = NULL;
+
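+ /*
+ * The range past the last file extent item we processed is not
+ * covered by any extent item, so report it as a hole (it may
+ * still contain delalloc).
+ */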
+ if (!stopped && prev_extent_end < lockend) {
+ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache,
+ 0, 0, 0, roots, tmp_ulist,
+ prev_extent_end, lockend - 1);
+ if (ret < 0)
+ goto out_unlock;
+ prev_extent_end = lockend;
+ }
+
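+ /*
+ * If the cached extent covers the end of the last extent in the
+ * file, it may be the last one to report: flag it with
+ * FIEMAP_EXTENT_LAST unless there is still delalloc between it
+ * and i_size.
+ */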
+ if (cache.cached && cache.offset + cache.len >= last_extent_end) {
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+
+ if (prev_extent_end < i_size) {
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool delalloc;
+
+ delalloc = btrfs_find_delalloc_in_range(inode,
+ prev_extent_end,
+ i_size - 1,
+ &delalloc_start,
+ &delalloc_end);
+ if (!delalloc)
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ } else {
+ cache.flags |= FIEMAP_EXTENT_LAST;
+ }
+ }
+
+ ret = emit_last_fiemap_cache(fieinfo, &cache);
+
+out_unlock:
+ unlock_extent_cached(&inode->io_tree, lockstart, lockend, &cached_state);
+out:
kfree(backref_cache);
btrfs_free_path(path);
ulist_free(roots);