From fa1a0f42a0356846fb1acd1d53061d53413a4c45 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:19 +0900 Subject: [PATCH] btrfs: zoned: serialize log transaction on zoned filesystems This is the 2/3 patch to enable tree-log on zoned filesystems. Since we can start more than one log transactions per subvolume simultaneously, nodes from multiple transactions can be allocated interleaved. Such mixed allocation results in non-sequential writes at the time of a log transaction commit. The nodes of the global log root tree (fs_info->log_root_tree), also have the same problem with mixed allocation. Serializes log transactions by waiting for a committing transaction when someone tries to start a new transaction, to avoid the mixed allocation problem. We must also wait for running log transactions from another subvolume, but there is no easy way to detect which subvolume root is running a log transaction. So, this patch forbids starting a new log transaction when other subvolumes already allocated the global log root tree. Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c02eeea..4e72794 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); +static void wait_log_commit(struct btrfs_root *root, int transid); /* * tree logging is a special write ahead log used to make sure that @@ -140,7 +141,9 @@ static int start_log_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; + const bool zoned = btrfs_is_zoned(fs_info); int ret = 0; + bool created = false; /* * First check if the log root tree was already created. If not, create @@ -150,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, fs_info); - if (!ret) + if (!ret) { set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + created = true; + } } mutex_unlock(&tree_root->log_mutex); if (ret) @@ -160,12 +165,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + if (!root->log_start_pid) { clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; @@ -173,6 +186,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { + /* + * This means fs_info->log_root_tree was already created + * for some other FS trees. Do the full commit not to mix + * nodes from multiple log transactions to do sequential + * writing. + */ + if (zoned && !created) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -201,14 +225,22 @@ out: */ static int join_running_log_trans(struct btrfs_root *root) { + const bool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT; if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret; mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + ret = 0; + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } atomic_inc(&root->log_writers); } mutex_unlock(&root->log_mutex); -- 2.7.4