In commit
351cbf6e4410e7 ("btrfs: use nofs allocations for running delayed
items") we wrapped all btree updates when running delayed items with
memalloc_nofs_save() and memalloc_nofs_restore(), due to a lock inversion
detected by lockdep involving reclaim and the mutex of delayed nodes.
The problem is because the ref verify tool does some memory allocations
with GFP_KERNEL, which can trigger reclaim and reclaim can trigger inode
eviction, which requires locking the mutex of an inode's delayed node.
On the other hand the ref verify tool is called when allocating metadata
extents as part of operations that modify a btree, which is a problem when
running delayed nodes, where we do btree updates while holding the mutex
of a delayed node. This is what caused the lockdep warning.
Instead of wrapping every btree update when running delayed nodes, change
the ref verify tool to never do GFP_KERNEL allocations, because:
1) We get less repeated code, which at the moment does not even have a
comment mentioning why we need to setup the NOFS context, which is a
recommended good practice as mentioned at
Documentation/core-api/gfp_mask-from-fs-io.rst
2) The ref verify tool is something meant only for debugging and not
something that should be enabled on non-debug / non-development
kernels;
3) We may have yet more places outside delayed-inode.c where we have
similar problem: doing btree updates while holding some lock and
then having the GFP_KERNEL memory allocations, from the ref verify
tool, trigger reclaim and trying again to acquire the same lock
through the reclaim path.
Or we could get more such cases in the future, therefore this change
prevents getting into similar cases when using the ref verify tool.
Curiously most of the memory allocations done by the ref verify tool
were already using GFP_NOFS, except a few ones for no apparent reason.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
#include <linux/slab.h>
#include <linux/iversion.h>
-#include <linux/sched/mm.h>
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info);
int total_size;
int nitems;
- unsigned int nofs_flag;
char *ins_data = NULL;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
}
}
- nofs_flag = memalloc_nofs_save();
ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes,
nitems);
- memalloc_nofs_restore(nofs_flag);
if (ret)
goto out;
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
- unsigned int nofs_flag;
int ret = 0;
do_again:
if (!curr)
goto delete_fail;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
- memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto delete_fail;
else if (ret > 0) {
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- unsigned int nofs_flag;
int mod;
int ret;
else
mod = 1;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
- memalloc_nofs_restore(nofs_flag);
if (ret > 0)
ret = -ENOENT;
if (ret < 0)
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
- nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto err_out;
ASSERT(ret);
struct block_entry *be = NULL, *exist;
struct root_entry *re = NULL;
- re = kzalloc(sizeof(struct root_entry), GFP_KERNEL);
- be = kzalloc(sizeof(struct block_entry), GFP_KERNEL);
+ re = kzalloc(sizeof(struct root_entry), GFP_NOFS);
+ be = kzalloc(sizeof(struct block_entry), GFP_NOFS);
if (!be || !re) {
kfree(re);
kfree(be);
struct root_entry *re;
struct ref_entry *ref = NULL, *exist;
- ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
struct block_entry *be;
struct ref_entry *ref;
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, 0);
u64 offset = btrfs_extent_data_ref_offset(leaf, dref);
u32 num_refs = btrfs_extent_data_ref_count(leaf, dref);
- ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL);
+ ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS);
if (!ref)
return -ENOMEM;
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);