From 3939448a3adb1a157d4b01ca9082b6a19b6c4a34 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Fri, 20 Dec 2013 15:17:46 +0000 Subject: [PATCH] Btrfs: fix tree mod logging commit 5de865eebb8330eee19c37b31fb6f315a09d4273 upstream. While running the test btrfs/004 from xfstests in a loop, it failed about 1 time out of 20 runs in my desktop. The failure happened in the backref walking part of the test, and the test's error message was like this: btrfs/004 93s ... [failed, exit status 1] - output mismatch (see /home/fdmanana/git/hub/xfstests_2/results//btrfs/004.out.bad) --- tests/btrfs/004.out 2013-11-26 18:25:29.263333714 +0000 +++ /home/fdmanana/git/hub/xfstests_2/results//btrfs/004.out.bad 2013-12-10 15:25:10.327518516 +0000 @@ -1,3 +1,8 @@ QA output created by 004 *** test backref walking -*** done +unexpected output from + /home/fdmanana/git/hub/btrfs-progs/btrfs inspect-internal logical-resolve -P 141512704 /home/fdmanana/btrfs-tests/scratch_1 +expected inum: 405, expected address: 454656, file: /home/fdmanana/btrfs-tests/scratch_1/snap1/p0/d6/d3d/d156/fce, got: + ... (Run 'diff -u tests/btrfs/004.out /home/fdmanana/git/hub/xfstests_2/results//btrfs/004.out.bad' to see the entire diff) Ran: btrfs/004 Failures: btrfs/004 Failed 1 of 1 tests But immediately after the test finished, the btrfs inspect-internal command returned the expected output: $ btrfs inspect-internal logical-resolve -P 141512704 /home/fdmanana/btrfs-tests/scratch_1 inode 405 offset 454656 root 258 inode 405 offset 454656 root 5 It turned out this was because the btrfs_search_old_slot() calls performed during backref walking (backref.c:__resolve_indirect_ref) were not finding anything. The reason for this turned out to be that the tree mod logging code was not logging some node multi-step operations atomically, therefore btrfs_search_old_slot() callers iterated often over an incomplete tree that wasn't fully consistent with any tree state from the past. Besides missing items, this often (but not always) resulted in -EIO errors during old slot searches, reported in dmesg like this: [ 4299.933936] ------------[ cut here ]------------ [ 4299.933949] WARNING: CPU: 0 PID: 23190 at fs/btrfs/ctree.c:1343 btrfs_search_old_slot+0x57b/0xab0 [btrfs]() [ 4299.933950] Modules linked in: btrfs raid6_pq xor pci_stub vboxpci(O) vboxnetadp(O) vboxnetflt(O) vboxdrv(O) bnep rfcomm bluetooth parport_pc ppdev binfmt_misc joydev snd_hda_codec_h [ 4299.933977] CPU: 0 PID: 23190 Comm: btrfs Tainted: G W O 3.12.0-fdm-btrfs-next-16+ #70 [ 4299.933978] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./Z77 Pro4, BIOS P1.50 09/04/2012 [ 4299.933979] 000000000000053f ffff8806f3fd98f8 ffffffff8176d284 0000000000000007 [ 4299.933982] 0000000000000000 ffff8806f3fd9938 ffffffff8104a81c ffff880659c64b70 [ 4299.933984] ffff880659c643d0 ffff8806599233d8 ffff880701e2e938 0000160000000000 [ 4299.933987] Call Trace: [ 4299.933991] [] dump_stack+0x55/0x76 [ 4299.933994] [] warn_slowpath_common+0x8c/0xc0 [ 4299.933997] [] warn_slowpath_null+0x1a/0x20 [ 4299.934003] [] btrfs_search_old_slot+0x57b/0xab0 [btrfs] [ 4299.934005] [] ? _raw_read_unlock+0x2b/0x50 [ 4299.934010] [] ? __tree_mod_log_search+0x81/0xc0 [btrfs] [ 4299.934019] [] __resolve_indirect_refs+0x130/0x5f0 [btrfs] [ 4299.934027] [] ? free_extent_buffer+0x61/0xc0 [btrfs] [ 4299.934034] [] find_parent_nodes+0x1fc/0xe40 [btrfs] [ 4299.934042] [] ? defrag_lookup_extent+0xe0/0xe0 [btrfs] [ 4299.934048] [] ? defrag_lookup_extent+0xe0/0xe0 [btrfs] [ 4299.934056] [] iterate_extent_inodes+0xe0/0x250 [btrfs] [ 4299.934058] [] ? _raw_spin_unlock+0x2b/0x50 [ 4299.934065] [] iterate_inodes_from_logical+0x92/0xb0 [btrfs] [ 4299.934071] [] ? defrag_lookup_extent+0xe0/0xe0 [btrfs] [ 4299.934078] [] btrfs_ioctl+0xf65/0x1f60 [btrfs] [ 4299.934080] [] ? handle_mm_fault+0x278/0xb00 [ 4299.934083] [] ? up_read+0x23/0x40 [ 4299.934085] [] ? __do_page_fault+0x20c/0x5a0 [ 4299.934088] [] do_vfs_ioctl+0x96/0x570 [ 4299.934090] [] ? error_sti+0x5/0x6 [ 4299.934093] [] ? trace_hardirqs_off_caller+0x28/0xd0 [ 4299.934096] [] ? retint_swapgs+0xe/0x13 [ 4299.934098] [] SyS_ioctl+0x91/0xb0 [ 4299.934100] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 4299.934102] [] system_call_fastpath+0x16/0x1b [ 4299.934102] [] system_call_fastpath+0x16/0x1b [ 4299.934104] ---[ end trace 48f0cfc902491414 ]--- [ 4299.934378] btrfs bad fsid on block 0 These tree mod log operations that must be performed atomically, tree_mod_log_free_eb, tree_mod_log_eb_copy, tree_mod_log_insert_root and tree_mod_log_insert_move, used to be performed atomically before the following commit: c8cc6341653721b54760480b0d0d9b5f09b46741 (Btrfs: stop using GFP_ATOMIC for the tree mod log allocations) That change removed the atomicity of such operations. This patch restores the atomicity while still not doing the GFP_ATOMIC allocations of tree_mod_elem structures, so it has to do the allocations using GFP_NOFS before acquiring the mod log lock. This issue has been experienced by several users recently, such as for example: http://www.spinics.net/lists/linux-btrfs/msg28574.html After running the btrfs/004 test for 679 consecutive iterations with this patch applied, I didn't ran into the issue anymore. Cc: stable@vger.kernel.org Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Jiri Slaby --- fs/btrfs/ctree.c | 385 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 296 insertions(+), 89 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index adc990a..c1123ec 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -39,7 +39,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, +static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); @@ -475,6 +475,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * the index is the shifted logical of the *new* root node for root replace * operations, or the shifted logical of the affected block for all other * operations. + * + * Note: must be called with write lock (tree_mod_log_write_lock). */ static noinline int __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) @@ -483,24 +485,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; - int ret = 0; BUG_ON(!tm); - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - tree_mod_log_write_unlock(fs_info); - /* - * Ok we no longer care about logging modifications, free up tm - * and return 0. Any callers shouldn't be using tm after - * calling tree_mod_log_insert, but if they do we can just - * change this to return a special error code to let the callers - * do their own thing. - */ - kfree(tm); - return 0; - } - spin_lock(&fs_info->tree_mod_seq_lock); tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); @@ -518,18 +505,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) new = &((*new)->rb_left); else if (cur->seq > tm->seq) new = &((*new)->rb_right); - else { - ret = -EEXIST; - kfree(tm); - goto out; - } + else + return -EEXIST; } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return 0; } /* @@ -545,19 +527,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 1; if (eb && btrfs_header_level(eb) == 0) return 1; + + tree_mod_log_write_lock(fs_info); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + return 1; + } + return 0; } -static inline int -__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ +static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + smp_mb(); + if (list_empty(&(fs_info)->tree_mod_seq_list)) + return 0; + if (eb && btrfs_header_level(eb) == 0) + return 0; + + return 1; +} + +static struct tree_mod_elem * +alloc_tree_mod_elem(struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { struct tree_mod_elem *tm; tm = kzalloc(sizeof(*tm), flags); if (!tm) - return -ENOMEM; + return NULL; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -567,8 +568,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, tm->op = op; tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); - return __tree_mod_log_insert(fs_info, tm); + return tm; } static noinline int @@ -576,10 +578,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int slot, enum mod_log_op op, gfp_t flags) { - if (tree_mod_dont_log(fs_info, eb)) + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(fs_info, eb)) { + kfree(tm); return 0; + } + + ret = __tree_mod_log_insert(fs_info, tm); + tree_mod_log_write_unlock(fs_info); + if (ret) + kfree(tm); - return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); + return ret; } static noinline int @@ -587,53 +606,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items, gfp_t flags) { - struct tree_mod_elem *tm; - int ret; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, eb)) + if (!tree_mod_need_log(fs_info, eb)) return 0; + tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->index = eb->start >> PAGE_CACHE_SHIFT; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, eb)) + goto free_tms; + locked = 1; + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the * buffer, i.e. dst_slot < src_slot. */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) + goto free_tms; } - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + ret = __tree_mod_log_insert(fs_info, tm); + if (ret) + goto free_tms; + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); - tm->index = eb->start >> PAGE_CACHE_SHIFT; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; + return 0; +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + kfree(tm); - return __tree_mod_log_insert(fs_info, tm); + return ret; } -static inline void -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static inline int +__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) { - int i; - u32 nritems; + int i, j; int ret; - if (btrfs_header_level(eb) == 0) - return; - - nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert_key(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - BUG_ON(ret < 0); + ret = __tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } } + + return 0; } static noinline int @@ -642,17 +703,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *new_root, gfp_t flags, int log_removal) { - struct tree_mod_elem *tm; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; - if (tree_mod_dont_log(fs_info, NULL)) + if (!tree_mod_need_log(fs_info, NULL)) return 0; - if (log_removal) - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + flags); + if (!tm_list) { + ret = -ENOMEM; + goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return -ENOMEM; + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -660,7 +742,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = __tree_mod_log_insert(fs_info, tm); + + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; } static struct tree_mod_elem * @@ -729,31 +834,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) return __tree_mod_log_search(fs_info, start, min_seq, 0); } -static noinline void +static noinline int tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) { - int ret; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; + int locked = 0; - if (tree_mod_dont_log(fs_info, NULL)) - return; + if (!tree_mod_need_log(fs_info, NULL)) + return 0; if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return; + return 0; + tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert_key(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - BUG_ON(ret < 0); - ret = __tree_mod_log_insert_key(fs_info, dst, - i + dst_offset, - MOD_LOG_KEY_ADD, - GFP_NOFS); - BUG_ON(ret < 0); + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = 1; + + for (i = 0; i < nr_items; i++) { + ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); + if (ret) + goto free_tms; + ret = __tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); } + if (locked) + tree_mod_log_write_unlock(fs_info); + kfree(tm_list); + + return ret; } static inline void @@ -778,12 +927,52 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, BUG_ON(ret < 0); } -static noinline void +static noinline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (btrfs_header_level(eb) == 0) + return 0; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + if (tree_mod_dont_log(fs_info, eb)) - return; - __tree_mod_log_free_eb(fs_info, eb); + goto free_tms; + + ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); + tree_mod_log_write_unlock(fs_info); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; } static noinline void @@ -1041,8 +1230,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - if (last_ref) - tree_mod_log_free_eb(root->fs_info, buf); + if (last_ref) { + ret = tree_mod_log_free_eb(root->fs_info, buf); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } + } btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -3023,8 +3217,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3094,8 +3292,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, (dst_nritems) * sizeof(struct btrfs_key_ptr)); - tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + src_nritems - push_items, push_items); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3296,7 +3498,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, + mid, c_nritems - mid); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + return ret; + } copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), -- 2.7.4