btrfs: fix race between quota disable and quota assign ioctls
[platform/kernel/linux-rpi.git] / fs / btrfs / send.c
index 72f9b86..9250a17 100644 (file)
@@ -24,6 +24,7 @@
 #include "transaction.h"
 #include "compression.h"
 #include "xattr.h"
+#include "print-tree.h"
 
 /*
  * Maximum number of references an extent can have in order for us to attempt to
@@ -96,6 +97,15 @@ struct send_ctx {
        struct btrfs_key *cmp_key;
 
        /*
+        * Keep track of the generation of the last transaction that was used
+        * for relocating a block group. This is periodically checked in order
+        * to detect if a relocation happened since the last check, so that we
+        * don't operate on stale extent buffers for nodes (level >= 1) or on
+        * stale disk_bytenr values of file extent items.
+        */
+       u64 last_reloc_trans;
+
+       /*
         * infos of the currently processed inode. In case of deleted inodes,
         * these are the values from the deleted inode.
         */
@@ -1415,6 +1425,26 @@ static int find_extent_clone(struct send_ctx *sctx,
        if (ret < 0)
                goto out;
 
+       down_read(&fs_info->commit_root_sem);
+       if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+               /*
+                * A transaction commit for a transaction in which block group
+                * relocation was done just happened.
+                * The disk_bytenr of the file extent item we processed is
+                * possibly stale, referring to the extent's location before
+                * relocation. So act as if we haven't found any clone sources
+                * and fallback to write commands, which will read the correct
+                * data from the new extent location. Otherwise we will fail
+                * below because we haven't found our own back reference or we
+                * could be getting incorrect sources in case the old extent
+                * was already reallocated after the relocation.
+                */
+               up_read(&fs_info->commit_root_sem);
+               ret = -ENOENT;
+               goto out;
+       }
+       up_read(&fs_info->commit_root_sem);
+
        if (!backref_ctx.found_itself) {
                /* found a bug in backref code? */
                ret = -EIO;
@@ -4978,6 +5008,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
                        lock_page(page);
                        if (!PageUptodate(page)) {
                                unlock_page(page);
+                               btrfs_err(fs_info,
+                       "send: IO error at offset %llu for inode %llu root %llu",
+                                       page_offset(page), sctx->cur_ino,
+                                       sctx->send_root->root_key.objectid);
                                put_page(page);
                                ret = -EIO;
                                break;
@@ -5364,6 +5398,7 @@ static int clone_range(struct send_ctx *sctx,
                u64 ext_len;
                u64 clone_len;
                u64 clone_data_offset;
+               bool crossed_src_i_size = false;
 
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(clone_root->root, path);
@@ -5420,8 +5455,10 @@ static int clone_range(struct send_ctx *sctx,
                if (key.offset >= clone_src_i_size)
                        break;
 
-               if (key.offset + ext_len > clone_src_i_size)
+               if (key.offset + ext_len > clone_src_i_size) {
                        ext_len = clone_src_i_size - key.offset;
+                       crossed_src_i_size = true;
+               }
 
                clone_data_offset = btrfs_file_extent_offset(leaf, ei);
                if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5481,6 +5518,25 @@ static int clone_range(struct send_ctx *sctx,
                                ret = send_clone(sctx, offset, clone_len,
                                                 clone_root);
                        }
+               } else if (crossed_src_i_size && clone_len < len) {
+                       /*
+                        * If we are at i_size of the clone source inode and we
+                        * can not clone from it, terminate the loop. This is
+                        * to avoid sending two write operations, one with a
+                        * length matching clone_len and the final one after
+                        * this loop with a length of len - clone_len.
+                        *
+                        * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+                        * was passed to the send ioctl), this helps avoid
+                        * sending an encoded write for an offset that is not
+                        * sector size aligned, in case the i_size of the source
+                        * inode is not sector size aligned. That will make the
+                        * receiver fallback to decompression of the data and
+                        * writing it using regular buffered IO, therefore while
+                        * not incorrect, it's not optimal due to decompression and
+                        * possible re-compression at the receiver.
+                        */
+                       break;
                } else {
                        ret = send_extent_data(sctx, offset, clone_len);
                }
@@ -6592,6 +6648,50 @@ static int changed_cb(struct btrfs_path *left_path,
 {
        int ret = 0;
 
+       /*
+        * We can not hold the commit root semaphore here. This is because in
+        * the case of sending and receiving to the same filesystem, using a
+        * pipe, could result in a deadlock:
+        *
+        * 1) The task running send blocks on the pipe because it's full;
+        *
+        * 2) The task running receive, which is the only consumer of the pipe,
+        *    is waiting for a transaction commit (for example due to a space
+        *    reservation when doing a write or triggering a transaction commit
+        *    when creating a subvolume);
+        *
+        * 3) The transaction is waiting to write lock the commit root semaphore,
+        *    but can not acquire it since it's being held at 1).
+        *
+        * Down this call chain we write to the pipe through kernel_write().
+        * The same type of problem can also happen when sending to a file that
+        * is stored in the same filesystem - when reserving space for a write
+        * into the file, we can trigger a transaction commit.
+        *
+        * Our caller has supplied us with clones of leaves from the send and
+        * parent roots, so we're safe here from a concurrent relocation and
+        * further reallocation of metadata extents while we are here. Below we
+        * also assert that the leaves are clones.
+        */
+       lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
+
+       /*
+        * We always have a send root, so left_path is never NULL. We will not
+        * have a leaf when we have reached the end of the send root but have
+        * not yet reached the end of the parent root.
+        */
+       if (left_path->nodes[0])
+               ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+                               &left_path->nodes[0]->bflags));
+       /*
+        * When doing a full send we don't have a parent root, so right_path is
+        * NULL. When doing an incremental send, we may have reached the end of
+        * the parent root already, so we don't have a leaf at right_path.
+        */
+       if (right_path && right_path->nodes[0])
+               ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
+                               &right_path->nodes[0]->bflags));
+
        if (result == BTRFS_COMPARE_TREE_SAME) {
                if (key->type == BTRFS_INODE_REF_KEY ||
                    key->type == BTRFS_INODE_EXTREF_KEY) {
@@ -6638,14 +6738,46 @@ out:
        return ret;
 }
 
+static int search_key_again(const struct send_ctx *sctx,
+                           struct btrfs_root *root,
+                           struct btrfs_path *path,
+                           const struct btrfs_key *key)
+{
+       int ret;
+
+       if (!path->need_commit_sem)
+               lockdep_assert_held_read(&root->fs_info->commit_root_sem);
+
+       /*
+        * Roots used for send operations are readonly and no one can add,
+        * update or remove keys from them, so we should be able to find our
+        * key again. The only exception is deduplication, which can operate on
+        * readonly roots and add, update or remove keys to/from them - but at
+        * the moment we don't allow it to run in parallel with send.
+        */
+       ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+       ASSERT(ret <= 0);
+       if (ret > 0) {
+               btrfs_print_tree(path->nodes[path->lowest_level], false);
+               btrfs_err(root->fs_info,
+"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
+                         key->objectid, key->type, key->offset,
+                         (root == sctx->parent_root ? "parent" : "send"),
+                         root->root_key.objectid, path->lowest_level,
+                         path->slots[path->lowest_level]);
+               return -EUCLEAN;
+       }
+
+       return ret;
+}
+
 static int full_send_tree(struct send_ctx *sctx)
 {
        int ret;
        struct btrfs_root *send_root = sctx->send_root;
        struct btrfs_key key;
+       struct btrfs_fs_info *fs_info = send_root->fs_info;
        struct btrfs_path *path;
-       struct extent_buffer *eb;
-       int slot;
 
        path = alloc_path_for_send();
        if (!path)
@@ -6656,6 +6788,10 @@ static int full_send_tree(struct send_ctx *sctx)
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
 
+       down_read(&fs_info->commit_root_sem);
+       sctx->last_reloc_trans = fs_info->last_reloc_trans;
+       up_read(&fs_info->commit_root_sem);
+
        ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
        if (ret < 0)
                goto out;
@@ -6663,15 +6799,35 @@ static int full_send_tree(struct send_ctx *sctx)
                goto out_finish;
 
        while (1) {
-               eb = path->nodes[0];
-               slot = path->slots[0];
-               btrfs_item_key_to_cpu(eb, &key, slot);
+               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 
                ret = changed_cb(path, NULL, &key,
                                 BTRFS_COMPARE_TREE_NEW, sctx);
                if (ret < 0)
                        goto out;
 
+               down_read(&fs_info->commit_root_sem);
+               if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+                       sctx->last_reloc_trans = fs_info->last_reloc_trans;
+                       up_read(&fs_info->commit_root_sem);
+                       /*
+                        * A transaction used for relocating a block group was
+                        * committed or is about to finish its commit. Release
+                        * our path (leaf) and restart the search, so that we
+                        * avoid operating on any file extent items that are
+                        * stale, with a disk_bytenr that reflects a pre
+                        * relocation value. This way we avoid as much as
+                        * possible to fallback to regular writes when checking
+                        * if we can clone file ranges.
+                        */
+                       btrfs_release_path(path);
+                       ret = search_key_again(sctx, send_root, path, &key);
+                       if (ret < 0)
+                               goto out;
+               } else {
+                       up_read(&fs_info->commit_root_sem);
+               }
+
                ret = btrfs_next_item(send_root, path);
                if (ret < 0)
                        goto out;
@@ -6689,6 +6845,20 @@ out:
        return ret;
 }
 
+static int replace_node_with_clone(struct btrfs_path *path, int level)
+{
+       struct extent_buffer *clone;
+
+       clone = btrfs_clone_extent_buffer(path->nodes[level]);
+       if (!clone)
+               return -ENOMEM;
+
+       free_extent_buffer(path->nodes[level]);
+       path->nodes[level] = clone;
+
+       return 0;
+}
+
 static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
 {
        struct extent_buffer *eb;
@@ -6698,6 +6868,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
        u64 reada_max;
        u64 reada_done = 0;
 
+       lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
+
        BUG_ON(*level == 0);
        eb = btrfs_read_node_slot(parent, slot);
        if (IS_ERR(eb))
@@ -6721,6 +6893,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen
        path->nodes[*level - 1] = eb;
        path->slots[*level - 1] = 0;
        (*level)--;
+
+       if (*level == 0)
+               return replace_node_with_clone(path, 0);
+
        return 0;
 }
 
@@ -6734,8 +6910,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
        path->slots[*level]++;
 
        while (path->slots[*level] >= nritems) {
-               if (*level == root_level)
+               if (*level == root_level) {
+                       path->slots[*level] = nritems - 1;
                        return -1;
+               }
 
                /* move upnext */
                path->slots[*level] = 0;
@@ -6767,14 +6945,20 @@ static int tree_advance(struct btrfs_path *path,
        } else {
                ret = tree_move_down(path, level, reada_min_gen);
        }
-       if (ret >= 0) {
-               if (*level == 0)
-                       btrfs_item_key_to_cpu(path->nodes[*level], key,
-                                       path->slots[*level]);
-               else
-                       btrfs_node_key_to_cpu(path->nodes[*level], key,
-                                       path->slots[*level]);
-       }
+
+       /*
+        * Even if we have reached the end of a tree, ret is -1, update the key
+        * anyway, so that in case we need to restart due to a block group
+        * relocation, we can assert that the last key of the root node still
+        * exists in the tree.
+        */
+       if (*level == 0)
+               btrfs_item_key_to_cpu(path->nodes[*level], key,
+                                     path->slots[*level]);
+       else
+               btrfs_node_key_to_cpu(path->nodes[*level], key,
+                                     path->slots[*level]);
+
        return ret;
 }
 
@@ -6804,6 +6988,97 @@ static int tree_compare_item(struct btrfs_path *left_path,
 }
 
 /*
+ * A transaction used for relocating a block group was committed or is about to
+ * finish its commit. Release our paths and restart the search, so that we are
+ * not using stale extent buffers:
+ *
+ * 1) For levels > 0, we are only holding references of extent buffers, without
+ *    any locks on them, which does not prevent them from having been relocated
+ *    and reallocated after the last time we released the commit root semaphore.
+ *    The exceptions are the root nodes, for which we always have a clone, see
+ *    the comment at btrfs_compare_trees();
+ *
+ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
+ *    we are safe from the concurrent relocation and reallocation. However they
+ *    can have file extent items with a pre relocation disk_bytenr value, so we
+ *    restart the search from the current commit roots and clone the new leaves so
+ *    that we get the post relocation disk_bytenr values. Not doing so, could
+ *    make us clone the wrong data in case there are new extents using the old
+ *    disk_bytenr that happen to be shared.
+ */
+static int restart_after_relocation(struct btrfs_path *left_path,
+                                   struct btrfs_path *right_path,
+                                   const struct btrfs_key *left_key,
+                                   const struct btrfs_key *right_key,
+                                   int left_level,
+                                   int right_level,
+                                   const struct send_ctx *sctx)
+{
+       int root_level;
+       int ret;
+
+       lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
+
+       btrfs_release_path(left_path);
+       btrfs_release_path(right_path);
+
+       /*
+        * Since keys can not be added or removed to/from our roots because they
+        * are readonly and we do not allow deduplication to run in parallel
+        * (which can add, remove or change keys), the layout of the trees should
+        * not change.
+        */
+       left_path->lowest_level = left_level;
+       ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
+       if (ret < 0)
+               return ret;
+
+       right_path->lowest_level = right_level;
+       ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * If the lowest level nodes are leaves, clone them so that they can be
+        * safely used by changed_cb() while not under the protection of the
+        * commit root semaphore, even if relocation and reallocation happens in
+        * parallel.
+        */
+       if (left_level == 0) {
+               ret = replace_node_with_clone(left_path, 0);
+               if (ret < 0)
+                       return ret;
+       }
+
+       if (right_level == 0) {
+               ret = replace_node_with_clone(right_path, 0);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /*
+        * Now clone the root nodes (unless they happen to be the leaves we have
+        * already cloned). This is to protect against concurrent snapshotting of
+        * the send and parent roots (see the comment at btrfs_compare_trees()).
+        */
+       root_level = btrfs_header_level(sctx->send_root->commit_root);
+       if (root_level > 0) {
+               ret = replace_node_with_clone(left_path, root_level);
+               if (ret < 0)
+                       return ret;
+       }
+
+       root_level = btrfs_header_level(sctx->parent_root->commit_root);
+       if (root_level > 0) {
+               ret = replace_node_with_clone(right_path, root_level);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
  * This function compares two trees and calls the provided callback for
  * every changed/new/deleted item it finds.
  * If shared tree blocks are encountered, whole subtrees are skipped, making
@@ -6831,10 +7106,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
        int right_root_level;
        int left_level;
        int right_level;
-       int left_end_reached;
-       int right_end_reached;
-       int advance_left;
-       int advance_right;
+       int left_end_reached = 0;
+       int right_end_reached = 0;
+       int advance_left = 0;
+       int advance_right = 0;
        u64 left_blockptr;
        u64 right_blockptr;
        u64 left_gen;
@@ -6902,12 +7177,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
        down_read(&fs_info->commit_root_sem);
        left_level = btrfs_header_level(left_root->commit_root);
        left_root_level = left_level;
+       /*
+        * We clone the root node of the send and parent roots to prevent races
+        * with snapshot creation of these roots. Snapshot creation COWs the
+        * root node of a tree, so after the transaction is committed the old
+        * extent can be reallocated while this send operation is still ongoing.
+        * So we clone them, under the commit root semaphore, to be race free.
+        */
        left_path->nodes[left_level] =
                        btrfs_clone_extent_buffer(left_root->commit_root);
        if (!left_path->nodes[left_level]) {
-               up_read(&fs_info->commit_root_sem);
                ret = -ENOMEM;
-               goto out;
+               goto out_unlock;
        }
 
        right_level = btrfs_header_level(right_root->commit_root);
@@ -6915,9 +7196,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
        right_path->nodes[right_level] =
                        btrfs_clone_extent_buffer(right_root->commit_root);
        if (!right_path->nodes[right_level]) {
-               up_read(&fs_info->commit_root_sem);
                ret = -ENOMEM;
-               goto out;
+               goto out_unlock;
        }
        /*
         * Our right root is the parent root, while the left root is the "send"
@@ -6927,7 +7207,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
         * will need to read them at some point.
         */
        reada_min_gen = btrfs_header_generation(right_root->commit_root);
-       up_read(&fs_info->commit_root_sem);
 
        if (left_level == 0)
                btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -6942,11 +7221,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
                btrfs_node_key_to_cpu(right_path->nodes[right_level],
                                &right_key, right_path->slots[right_level]);
 
-       left_end_reached = right_end_reached = 0;
-       advance_left = advance_right = 0;
+       sctx->last_reloc_trans = fs_info->last_reloc_trans;
 
        while (1) {
-               cond_resched();
+               if (need_resched() ||
+                   rwsem_is_contended(&fs_info->commit_root_sem)) {
+                       up_read(&fs_info->commit_root_sem);
+                       cond_resched();
+                       down_read(&fs_info->commit_root_sem);
+               }
+
+               if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
+                       ret = restart_after_relocation(left_path, right_path,
+                                                      &left_key, &right_key,
+                                                      left_level, right_level,
+                                                      sctx);
+                       if (ret < 0)
+                               goto out_unlock;
+                       sctx->last_reloc_trans = fs_info->last_reloc_trans;
+               }
+
                if (advance_left && !left_end_reached) {
                        ret = tree_advance(left_path, &left_level,
                                        left_root_level,
@@ -6955,7 +7249,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
                        if (ret == -1)
                                left_end_reached = ADVANCE;
                        else if (ret < 0)
-                               goto out;
+                               goto out_unlock;
                        advance_left = 0;
                }
                if (advance_right && !right_end_reached) {
@@ -6966,54 +7260,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
                        if (ret == -1)
                                right_end_reached = ADVANCE;
                        else if (ret < 0)
-                               goto out;
+                               goto out_unlock;
                        advance_right = 0;
                }
 
                if (left_end_reached && right_end_reached) {
                        ret = 0;
-                       goto out;
+                       goto out_unlock;
                } else if (left_end_reached) {
                        if (right_level == 0) {
+                               up_read(&fs_info->commit_root_sem);
                                ret = changed_cb(left_path, right_path,
                                                &right_key,
                                                BTRFS_COMPARE_TREE_DELETED,
                                                sctx);
                                if (ret < 0)
                                        goto out;
+                               down_read(&fs_info->commit_root_sem);
                        }
                        advance_right = ADVANCE;
                        continue;
                } else if (right_end_reached) {
                        if (left_level == 0) {
+                               up_read(&fs_info->commit_root_sem);
                                ret = changed_cb(left_path, right_path,
                                                &left_key,
                                                BTRFS_COMPARE_TREE_NEW,
                                                sctx);
                                if (ret < 0)
                                        goto out;
+                               down_read(&fs_info->commit_root_sem);
                        }
                        advance_left = ADVANCE;
                        continue;
                }
 
                if (left_level == 0 && right_level == 0) {
+                       up_read(&fs_info->commit_root_sem);
                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
                        if (cmp < 0) {
                                ret = changed_cb(left_path, right_path,
                                                &left_key,
                                                BTRFS_COMPARE_TREE_NEW,
                                                sctx);
-                               if (ret < 0)
-                                       goto out;
                                advance_left = ADVANCE;
                        } else if (cmp > 0) {
                                ret = changed_cb(left_path, right_path,
                                                &right_key,
                                                BTRFS_COMPARE_TREE_DELETED,
                                                sctx);
-                               if (ret < 0)
-                                       goto out;
                                advance_right = ADVANCE;
                        } else {
                                enum btrfs_compare_tree_result result;
@@ -7027,11 +7322,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
                                        result = BTRFS_COMPARE_TREE_SAME;
                                ret = changed_cb(left_path, right_path,
                                                 &left_key, result, sctx);
-                               if (ret < 0)
-                                       goto out;
                                advance_left = ADVANCE;
                                advance_right = ADVANCE;
                        }
+
+                       if (ret < 0)
+                               goto out;
+                       down_read(&fs_info->commit_root_sem);
                } else if (left_level == right_level) {
                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
                        if (cmp < 0) {
@@ -7071,6 +7368,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
                }
        }
 
+out_unlock:
+       up_read(&fs_info->commit_root_sem);
 out:
        btrfs_free_path(left_path);
        btrfs_free_path(right_path);
@@ -7409,21 +7708,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
        if (ret)
                goto out;
 
-       spin_lock(&fs_info->send_reloc_lock);
-       if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
-               spin_unlock(&fs_info->send_reloc_lock);
-               btrfs_warn_rl(fs_info,
-               "cannot run send because a relocation operation is in progress");
-               ret = -EAGAIN;
-               goto out;
-       }
-       fs_info->send_in_progress++;
-       spin_unlock(&fs_info->send_reloc_lock);
-
        ret = send_subvol(sctx);
-       spin_lock(&fs_info->send_reloc_lock);
-       fs_info->send_in_progress--;
-       spin_unlock(&fs_info->send_reloc_lock);
        if (ret < 0)
                goto out;