Merge tag 'hardening-v6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees...

[platform/kernel/linux-starfive.git] / fs / btrfs / send.c
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c

index f53e804..e65e6b6 100644 (file)
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -27,6 +27,11 @@
  #include "compression.h"
  #include "xattr.h"
  #include "print-tree.h"
+#include "accessors.h"
+#include "dir-item.h"
+#include "file-item.h"
+#include "ioctl.h"
+#include "verity.h"
  
  /*
   * Maximum number of references an extent can have in order for us to attempt to
@@ -34,7 +39,7 @@
   * avoid hitting limitations of the backreference walking code (taking a lot of
   * time and using too much memory for extents with large number of references).
   */
-#define SEND_MAX_EXTENT_REFS   64
+#define SEND_MAX_EXTENT_REFS   1024
  
  /*
   * A fs_path is a helper to dynamically build path names with unknown size.
@@ -71,13 +76,46 @@ struct clone_root {
         struct btrfs_root *root;
         u64 ino;
         u64 offset;
-
-       u64 found_refs;
+       u64 num_bytes;
+       bool found_ref;
  };
  
  #define SEND_CTX_MAX_NAME_CACHE_SIZE 128
  #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
  
+/*
+ * Limit the root_ids array of struct backref_cache_entry to 12 elements.
+ * This makes the size of a cache entry exactly 128 bytes on x86_64.
+ * The most common case is to have a single root for cloning, which corresponds
+ * to the send root. Having the user specify more than 11 clone roots is not
+ * common, and in such rare cases we simply don't use caching if the number of
+ * cloning roots that lead down to a leaf is more than 12.
+ */
+#define SEND_MAX_BACKREF_CACHE_ROOTS 12
+
+/*
+ * Max number of entries in the cache.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding
+ * maple tree's internal nodes, is 16K.
+ */
+#define SEND_MAX_BACKREF_CACHE_SIZE 128
+
+/*
+ * A backref cache entry maps a leaf to a list of IDs of roots from which the
+ * leaf is accessible and we can use for clone operations.
+ * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
+ * x86_64).
+ */
+struct backref_cache_entry {
+       /* List to link to the cache's lru list. */
+       struct list_head list;
+       /* The key for this entry in the cache. */
+       u64 key;
+       u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
+       /* Number of valid elements in the root_ids array. */
+       int num_roots;
+};
+
  struct send_ctx {
         struct file *send_filp;
         loff_t send_off;
@@ -246,6 +284,14 @@ struct send_ctx {
  
         struct rb_root rbtree_new_refs;
         struct rb_root rbtree_deleted_refs;
+
+       struct {
+               u64 last_reloc_trans;
+               struct list_head lru_list;
+               struct maple_tree entries;
+               /* Number of entries stored in the cache. */
+               int size;
+       } backref_cache;
  };
  
  struct pending_dir_move {
@@ -348,6 +394,7 @@ static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
         switch (sctx->proto) {
         case 1:  return cmd <= BTRFS_SEND_C_MAX_V1;
         case 2:  return cmd <= BTRFS_SEND_C_MAX_V2;
+       case 3:  return cmd <= BTRFS_SEND_C_MAX_V3;
         default: return false;
         }
  }
@@ -1093,7 +1140,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
                 data_len = btrfs_dir_data_len(eb, di);
                 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
  
-               if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
+               if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
                         if (name_len > XATTR_NAME_MAX) {
                                 ret = -ENAMETOOLONG;
                                 goto out;
@@ -1236,8 +1283,12 @@ struct backref_ctx {
         /* may be truncated in case it's the last extent in a file */
         u64 extent_len;
  
-       /* Just to check for bugs in backref resolving */
-       int found_itself;
+       /* The bytenr the file extent item we are processing refers to. */
+       u64 bytenr;
+       /* The owner (root id) of the data backref for the current extent. */
+       u64 backref_owner;
+       /* The offset of the data backref for the current extent. */
+       u64 backref_offset;
  };
  
  static int __clone_root_cmp_bsearch(const void *key, const void *elt)
@@ -1266,32 +1317,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
  
  /*
   * Called for every backref that is found for the current extent.
- * Results are collected in sctx->clone_roots->ino/offset/found_refs
+ * Results are collected in sctx->clone_roots->ino/offset.
   */
-static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
+static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
+                           void *ctx_)
  {
         struct backref_ctx *bctx = ctx_;
-       struct clone_root *found;
+       struct clone_root *clone_root;
  
         /* First check if the root is in the list of accepted clone sources */
-       found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
-                       bctx->sctx->clone_roots_cnt,
-                       sizeof(struct clone_root),
-                       __clone_root_cmp_bsearch);
-       if (!found)
+       clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
+                            bctx->sctx->clone_roots_cnt,
+                            sizeof(struct clone_root),
+                            __clone_root_cmp_bsearch);
+       if (!clone_root)
                 return 0;
  
-       if (found->root == bctx->sctx->send_root &&
+       /* This is our own reference, bail out as we can't clone from it. */
+       if (clone_root->root == bctx->sctx->send_root &&
             ino == bctx->cur_objectid &&
-           offset == bctx->cur_offset) {
-               bctx->found_itself = 1;
-       }
+           offset == bctx->cur_offset)
+               return 0;
  
         /*
          * Make sure we don't consider clones from send_root that are
          * behind the current inode/offset.
          */
-       if (found->root == bctx->sctx->send_root) {
+       if (clone_root->root == bctx->sctx->send_root) {
                 /*
                  * If the source inode was not yet processed we can't issue a
                  * clone operation, as the source extent does not exist yet at
@@ -1312,21 +1364,217 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
         }
  
         bctx->found++;
-       found->found_refs++;
-       if (ino < found->ino) {
-               found->ino = ino;
-               found->offset = offset;
-       } else if (found->ino == ino) {
+       clone_root->found_ref = true;
+
+       /*
+        * If the given backref refers to a file extent item with a larger
+        * number of bytes than what we found before, use the new one so that
+        * we clone more optimally and end up doing fewer writes and getting
+        * fewer exclusive, non-shared extents at the destination.
+        */
+       if (num_bytes > clone_root->num_bytes) {
+               clone_root->ino = ino;
+               clone_root->offset = offset;
+               clone_root->num_bytes = num_bytes;
+
+               /*
+                * Found a perfect candidate, so there's no need to continue
+                * backref walking.
+                */
+               if (num_bytes >= bctx->extent_len)
+                       return BTRFS_ITERATE_EXTENT_INODES_STOP;
+       }
+
+       return 0;
+}
+
+static void empty_backref_cache(struct send_ctx *sctx)
+{
+       struct backref_cache_entry *entry;
+       struct backref_cache_entry *tmp;
+
+       list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list)
+               kfree(entry);
+
+       INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+       mtree_destroy(&sctx->backref_cache.entries);
+       sctx->backref_cache.size = 0;
+}
+
+static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
+                                const u64 **root_ids_ret, int *root_count_ret)
+{
+       struct backref_ctx *bctx = ctx;
+       struct send_ctx *sctx = bctx->sctx;
+       struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+       const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
+       struct backref_cache_entry *entry;
+
+       if (sctx->backref_cache.size == 0)
+               return false;
+
+       /*
+        * If relocation happened since we first filled the cache, then we must
+        * empty the cache and can not use it, because even though we operate on
+        * read-only roots, their leaves and nodes may have been reallocated and
+        * now be used for different nodes/leaves of the same tree or some other
+        * tree.
+        *
+        * We are called from iterate_extent_inodes() while either holding a
+        * transaction handle or holding fs_info->commit_root_sem, so no need
+        * to take any lock here.
+        */
+       if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
+               empty_backref_cache(sctx);
+               return false;
+       }
+
+       entry = mtree_load(&sctx->backref_cache.entries, key);
+       if (!entry)
+               return false;
+
+       *root_ids_ret = entry->root_ids;
+       *root_count_ret = entry->num_roots;
+       list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
+
+       return true;
+}
+
+static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
+                               void *ctx)
+{
+       struct backref_ctx *bctx = ctx;
+       struct send_ctx *sctx = bctx->sctx;
+       struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
+       struct backref_cache_entry *new_entry;
+       struct ulist_iterator uiter;
+       struct ulist_node *node;
+       int ret;
+
+       /*
+        * We're called while holding a transaction handle or while holding
+        * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
+        * NOFS allocation.
+        */
+       new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
+       /* No worries, cache is optional. */
+       if (!new_entry)
+               return;
+
+       new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits;
+       new_entry->num_roots = 0;
+       ULIST_ITER_INIT(&uiter);
+       while ((node = ulist_next(root_ids, &uiter)) != NULL) {
+               const u64 root_id = node->val;
+               struct clone_root *root;
+
+               root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots,
+                              sctx->clone_roots_cnt, sizeof(struct clone_root),
+                              __clone_root_cmp_bsearch);
+               if (!root)
+                       continue;
+
+               /* Too many roots, just exit, no worries as caching is optional. */
+               if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
+                       kfree(new_entry);
+                       return;
+               }
+
+               new_entry->root_ids[new_entry->num_roots] = root_id;
+               new_entry->num_roots++;
+       }
+
+       /*
+        * We may have not added any roots to the new cache entry, which means
+        * none of the roots is part of the list of roots from which we are
+        * allowed to clone. Cache the new entry as it's still useful to avoid
+        * backref walking to determine which roots have a path to the leaf.
+        */
+
+       if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) {
+               struct backref_cache_entry *lru_entry;
+               struct backref_cache_entry *mt_entry;
+
+               lru_entry = list_first_entry(&sctx->backref_cache.lru_list,
+                                            struct backref_cache_entry, list);
+               mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key);
+               ASSERT(mt_entry == lru_entry);
+               list_del(&mt_entry->list);
+               kfree(mt_entry);
+               sctx->backref_cache.size--;
+       }
+
+       ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key,
+                          new_entry, GFP_NOFS);
+       ASSERT(ret == 0 || ret == -ENOMEM);
+       if (ret) {
+               /* Caching is optional, no worries. */
+               kfree(new_entry);
+               return;
+       }
+
+       list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list);
+
+       /*
+        * We are called from iterate_extent_inodes() while either holding a
+        * transaction handle or holding fs_info->commit_root_sem, so no need
+        * to take any lock here.
+        */
+       if (sctx->backref_cache.size == 0)
+               sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans;
+
+       sctx->backref_cache.size++;
+}
+
+static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
+                            const struct extent_buffer *leaf, void *ctx)
+{
+       const u64 refs = btrfs_extent_refs(leaf, ei);
+       const struct backref_ctx *bctx = ctx;
+       const struct send_ctx *sctx = bctx->sctx;
+
+       if (bytenr == bctx->bytenr) {
+               const u64 flags = btrfs_extent_flags(leaf, ei);
+
+               if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+                       return -EUCLEAN;
+
                 /*
-                * same extent found more then once in the same file.
+                * If we have only one reference and only the send root as a
+                * clone source - meaning no clone roots were given in the
+                * struct btrfs_ioctl_send_args passed to the send ioctl - then
+                * it's our reference and there's no point in doing backref
+                * walking which is expensive, so exit early.
                  */
-               if (found->offset > offset + bctx->extent_len)
-                       found->offset = offset;
+               if (refs == 1 && sctx->clone_roots_cnt == 1)
+                       return -ENOENT;
         }
  
+       /*
+        * Backreference walking (iterate_extent_inodes() below) is currently
+        * too expensive when an extent has a large number of references, both
+        * in time spent and used memory. So for now just fallback to write
+        * operations instead of clone operations when an extent has more than
+        * a certain amount of references.
+        */
+       if (refs > SEND_MAX_EXTENT_REFS)
+               return -ENOENT;
+
         return 0;
  }
  
+static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
+{
+       const struct backref_ctx *bctx = ctx;
+
+       if (ino == bctx->cur_objectid &&
+           root == bctx->backref_owner &&
+           offset == bctx->backref_offset)
+               return true;
+
+       return false;
+}
+
  /*
   * Given an inode, offset and extent item, it finds a good clone for a clone
   * instruction. Returns -ENOENT when none could be found. The function makes
@@ -1348,79 +1596,36 @@ static int find_extent_clone(struct send_ctx *sctx,
         u64 logical;
         u64 disk_byte;
         u64 num_bytes;
-       u64 extent_item_pos;
-       u64 flags = 0;
         struct btrfs_file_extent_item *fi;
         struct extent_buffer *eb = path->nodes[0];
-       struct backref_ctx backref_ctx = {0};
+       struct backref_ctx backref_ctx = { 0 };
+       struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
         struct clone_root *cur_clone_root;
-       struct btrfs_key found_key;
-       struct btrfs_path *tmp_path;
-       struct btrfs_extent_item *ei;
         int compressed;
         u32 i;
  
-       tmp_path = alloc_path_for_send();
-       if (!tmp_path)
-               return -ENOMEM;
+       /*
+        * With fallocate we can get prealloc extents beyond the inode's i_size,
+        * so we don't do anything here because clone operations can not clone
+        * to a range beyond i_size without increasing the i_size of the
+        * destination inode.
+        */
+       if (data_offset >= ino_size)
+               return 0;
  
-       /* We only use this path under the commit sem */
-       tmp_path->need_commit_sem = 0;
+       fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
+       extent_type = btrfs_file_extent_type(eb, fi);
+       if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+               return -ENOENT;
  
-       if (data_offset >= ino_size) {
-               /*
-                * There may be extents that lie behind the file's size.
-                * I at least had this in combination with snapshotting while
-                * writing large files.
-                */
-               ret = 0;
-               goto out;
-       }
+       disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+       if (disk_byte == 0)
+               return -ENOENT;
  
-       fi = btrfs_item_ptr(eb, path->slots[0],
-                       struct btrfs_file_extent_item);
-       extent_type = btrfs_file_extent_type(eb, fi);
-       if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-               ret = -ENOENT;
-               goto out;
-       }
         compressed = btrfs_file_extent_compression(eb, fi);
-
         num_bytes = btrfs_file_extent_num_bytes(eb, fi);
-       disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-       if (disk_byte == 0) {
-               ret = -ENOENT;
-               goto out;
-       }
         logical = disk_byte + btrfs_file_extent_offset(eb, fi);
  
-       down_read(&fs_info->commit_root_sem);
-       ret = extent_from_logical(fs_info, disk_byte, tmp_path,
-                                 &found_key, &flags);
-       up_read(&fs_info->commit_root_sem);
-
-       if (ret < 0)
-               goto out;
-       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-               ret = -EIO;
-               goto out;
-       }
-
-       ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
-                           struct btrfs_extent_item);
-       /*
-        * Backreference walking (iterate_extent_inodes() below) is currently
-        * too expensive when an extent has a large number of references, both
-        * in time spent and used memory. So for now just fallback to write
-        * operations instead of clone operations when an extent has more than
-        * a certain amount of references.
-        */
-       if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
-               ret = -ENOENT;
-               goto out;
-       }
-       btrfs_release_path(tmp_path);
-
         /*
          * Setup the clone roots.
          */
@@ -1428,37 +1633,59 @@ static int find_extent_clone(struct send_ctx *sctx,
                 cur_clone_root = sctx->clone_roots + i;
                 cur_clone_root->ino = (u64)-1;
                 cur_clone_root->offset = 0;
-               cur_clone_root->found_refs = 0;
+               cur_clone_root->num_bytes = 0;
+               cur_clone_root->found_ref = false;
         }
  
         backref_ctx.sctx = sctx;
-       backref_ctx.found = 0;
         backref_ctx.cur_objectid = ino;
         backref_ctx.cur_offset = data_offset;
-       backref_ctx.found_itself = 0;
-       backref_ctx.extent_len = num_bytes;
+       backref_ctx.bytenr = disk_byte;
+       /*
+        * Use the header owner and not the send root's id, because in case of a
+        * snapshot we can have shared subtrees.
+        */
+       backref_ctx.backref_owner = btrfs_header_owner(eb);
+       backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);
  
         /*
          * The last extent of a file may be too large due to page alignment.
          * We need to adjust extent_len in this case so that the checks in
-        * __iterate_backrefs work.
+        * iterate_backrefs() work.
          */
         if (data_offset + num_bytes >= ino_size)
                 backref_ctx.extent_len = ino_size - data_offset;
+       else
+               backref_ctx.extent_len = num_bytes;
  
         /*
          * Now collect all backrefs.
          */
+       backref_walk_ctx.bytenr = disk_byte;
         if (compressed == BTRFS_COMPRESS_NONE)
-               extent_item_pos = logical - found_key.objectid;
-       else
-               extent_item_pos = 0;
-       ret = iterate_extent_inodes(fs_info, found_key.objectid,
-                                   extent_item_pos, 1, __iterate_backrefs,
-                                   &backref_ctx, false);
+               backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
+       backref_walk_ctx.fs_info = fs_info;
+       backref_walk_ctx.cache_lookup = lookup_backref_cache;
+       backref_walk_ctx.cache_store = store_backref_cache;
+       backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
+       backref_walk_ctx.check_extent_item = check_extent_item;
+       backref_walk_ctx.user_ctx = &backref_ctx;
+
+       /*
+        * If we have a single clone root, then it's the send root and we can
+        * tell the backref walking code to skip our own backref and not resolve
+        * it, since we can not use it for cloning - the source and destination
+        * ranges can't overlap and in case the leaf is shared through a subtree
+        * due to snapshots, we can't use those other roots since they are not
+        * in the list of clone roots.
+        */
+       if (sctx->clone_roots_cnt == 1)
+               backref_walk_ctx.skip_data_ref = skip_self_data_ref;
  
+       ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
+                                   &backref_ctx);
         if (ret < 0)
-               goto out;
+               return ret;
  
         down_read(&fs_info->commit_root_sem);
         if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@@ -1475,37 +1702,42 @@ static int find_extent_clone(struct send_ctx *sctx,
                  * was already reallocated after the relocation.
                  */
                 up_read(&fs_info->commit_root_sem);
-               ret = -ENOENT;
-               goto out;
+               return -ENOENT;
         }
         up_read(&fs_info->commit_root_sem);
  
-       if (!backref_ctx.found_itself) {
-               /* found a bug in backref code? */
-               ret = -EIO;
-               btrfs_err(fs_info,
-                         "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
-                         ino, data_offset, disk_byte, found_key.objectid);
-               goto out;
-       }
-
         btrfs_debug(fs_info,
                     "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
                     data_offset, ino, num_bytes, logical);
  
-       if (!backref_ctx.found)
+       if (!backref_ctx.found) {
                 btrfs_debug(fs_info, "no clones found");
+               return -ENOENT;
+       }
  
         cur_clone_root = NULL;
         for (i = 0; i < sctx->clone_roots_cnt; i++) {
-               if (sctx->clone_roots[i].found_refs) {
-                       if (!cur_clone_root)
-                               cur_clone_root = sctx->clone_roots + i;
-                       else if (sctx->clone_roots[i].root == sctx->send_root)
-                               /* prefer clones from send_root over others */
-                               cur_clone_root = sctx->clone_roots + i;
-               }
+               struct clone_root *clone_root = &sctx->clone_roots[i];
+
+               if (!clone_root->found_ref)
+                       continue;
+
+               /*
+                * Choose the root from which we can clone more bytes, to
+                * minimize write operations and therefore have more extent
+                * sharing at the destination (the same as in the source).
+                */
+               if (!cur_clone_root ||
+                   clone_root->num_bytes > cur_clone_root->num_bytes) {
+                       cur_clone_root = clone_root;
  
+                       /*
+                        * We found an optimal clone candidate (any inode from
+                        * any root is fine), so we're done.
+                        */
+                       if (clone_root->num_bytes >= backref_ctx.extent_len)
+                               break;
+               }
         }
  
         if (cur_clone_root) {
@@ -1515,8 +1747,6 @@ static int find_extent_clone(struct send_ctx *sctx,
                 ret = -ENOENT;
         }
  
-out:
-       btrfs_free_path(tmp_path);
         return ret;
  }
  
@@ -1596,13 +1826,17 @@ static int gen_unique_name(struct send_ctx *sctx,
                 return -ENOMEM;
  
         while (1) {
+               struct fscrypt_str tmp_name;
+
                 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
                                 ino, gen, idx);
                 ASSERT(len < sizeof(tmp));
+               tmp_name.name = tmp;
+               tmp_name.len = strlen(tmp);
  
                 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
                                 path, BTRFS_FIRST_FREE_OBJECTID,
-                               tmp, strlen(tmp), 0);
+                               &tmp_name, 0);
                 btrfs_release_path(path);
                 if (IS_ERR(di)) {
                         ret = PTR_ERR(di);
@@ -1622,7 +1856,7 @@ static int gen_unique_name(struct send_ctx *sctx,
  
                 di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
                                 path, BTRFS_FIRST_FREE_OBJECTID,
-                               tmp, strlen(tmp), 0);
+                               &tmp_name, 0);
                 btrfs_release_path(path);
                 if (IS_ERR(di)) {
                         ret = PTR_ERR(di);
@@ -1752,13 +1986,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
         struct btrfs_dir_item *di;
         struct btrfs_key key;
         struct btrfs_path *path;
+       struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
  
         path = alloc_path_for_send();
         if (!path)
                 return -ENOMEM;
  
-       di = btrfs_lookup_dir_item(NULL, root, path,
-                       dir, name, name_len, 0);
+       di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
         if (IS_ERR_OR_NULL(di)) {
                 ret = di ? PTR_ERR(di) : -ENOENT;
                 goto out;
@@ -5702,6 +5936,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
                 u64 ext_len;
                 u64 clone_len;
                 u64 clone_data_offset;
+               bool crossed_src_i_size = false;
  
                 if (slot >= btrfs_header_nritems(leaf)) {
                         ret = btrfs_next_leaf(clone_root->root, path);
@@ -5759,8 +5994,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
                 if (key.offset >= clone_src_i_size)
                         break;
  
-               if (key.offset + ext_len > clone_src_i_size)
+               if (key.offset + ext_len > clone_src_i_size) {
                         ext_len = clone_src_i_size - key.offset;
+                       crossed_src_i_size = true;
+               }
  
                 clone_data_offset = btrfs_file_extent_offset(leaf, ei);
                 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5821,6 +6058,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
                                 ret = send_clone(sctx, offset, clone_len,
                                                  clone_root);
                         }
+               } else if (crossed_src_i_size && clone_len < len) {
+                       /*
+                        * If we are at i_size of the clone source inode and we
+                        * can not clone from it, terminate the loop. This is
+                        * to avoid sending two write operations, one with a
+                        * length matching clone_len and the final one after
+                        * this loop with a length of len - clone_len.
+                        *
+                        * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+                        * was passed to the send ioctl), this helps avoid
+                        * sending an encoded write for an offset that is not
+                        * sector size aligned, in case the i_size of the source
+                        * inode is not sector size aligned. That will make the
+                        * receiver fallback to decompression of the data and
+                        * writing it using regular buffered IO, therefore while
+                        * not incorrect, it's not optimal due to decompression
+                        * and possible re-compression at the receiver.
+                        */
+                       break;
                 } else {
                         ret = send_extent_data(sctx, dst_path, offset,
                                                clone_len);
@@ -6470,7 +6726,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
                 if (ret < 0)
                         goto out;
         }
-       if (sctx->cur_inode_needs_verity) {
+
+       if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
+           && sctx->cur_inode_needs_verity) {
                 ret = process_verity(sctx);
                 if (ret < 0)
                         goto out;
@@ -6666,17 +6924,19 @@ static int changed_inode(struct send_ctx *sctx,
                         /*
                          * First, process the inode as if it was deleted.
                          */
-                       sctx->cur_inode_gen = right_gen;
-                       sctx->cur_inode_new = false;
-                       sctx->cur_inode_deleted = true;
-                       sctx->cur_inode_size = btrfs_inode_size(
-                                       sctx->right_path->nodes[0], right_ii);
-                       sctx->cur_inode_mode = btrfs_inode_mode(
-                                       sctx->right_path->nodes[0], right_ii);
-                       ret = process_all_refs(sctx,
-                                       BTRFS_COMPARE_TREE_DELETED);
-                       if (ret < 0)
-                               goto out;
+                       if (old_nlinks > 0) {
+                               sctx->cur_inode_gen = right_gen;
+                               sctx->cur_inode_new = false;
+                               sctx->cur_inode_deleted = true;
+                               sctx->cur_inode_size = btrfs_inode_size(
+                                               sctx->right_path->nodes[0], right_ii);
+                               sctx->cur_inode_mode = btrfs_inode_mode(
+                                               sctx->right_path->nodes[0], right_ii);
+                               ret = process_all_refs(sctx,
+                                               BTRFS_COMPARE_TREE_DELETED);
+                               if (ret < 0)
+                                       goto out;
+                       }
  
                         /*
                          * Now process the inode as if it was new.
@@ -7837,6 +8097,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
         INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
         INIT_LIST_HEAD(&sctx->name_cache_list);
  
+       INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
+       mt_init(&sctx->backref_cache.entries);
+
         sctx->flags = arg->flags;
  
         if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
@@ -7875,7 +8138,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
         if (sctx->proto >= 2) {
                 u32 send_buf_num_pages;
  
-               sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
+               sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
                 sctx->send_buf = vmalloc(sctx->send_max_size);
                 if (!sctx->send_buf) {
                         ret = -ENOMEM;
@@ -8099,6 +8362,8 @@ out:
  
                 close_current_inode(sctx);
  
+               empty_backref_cache(sctx);
+
                 kfree(sctx);
         }