btrfs: do not create raid sysfs entries under any locks
authorJosef Bacik <josef@toxicpanda.com>
Tue, 1 Sep 2020 21:40:38 +0000 (17:40 -0400)
committerDavid Sterba <dsterba@suse.com>
Wed, 7 Oct 2020 10:13:19 +0000 (12:13 +0200)
While running xfstests btrfs/177 I got the following lockdep splat

  ======================================================
  WARNING: possible circular locking dependency detected
  5.9.0-rc3+ #5 Not tainted
  ------------------------------------------------------
  kswapd0/100 is trying to acquire lock:
  ffff97066aa56760 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330

  but task is already holding lock:
  ffffffff9fd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #3 (fs_reclaim){+.+.}-{0:0}:
 fs_reclaim_acquire+0x65/0x80
 slab_pre_alloc_hook.constprop.0+0x20/0x200
 kmem_cache_alloc+0x37/0x270
 alloc_inode+0x82/0xb0
 iget_locked+0x10d/0x2c0
 kernfs_get_inode+0x1b/0x130
 kernfs_get_tree+0x136/0x240
 sysfs_get_tree+0x16/0x40
 vfs_get_tree+0x28/0xc0
 path_mount+0x434/0xc00
 __x64_sys_mount+0xe3/0x120
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #2 (kernfs_mutex){+.+.}-{3:3}:
 __mutex_lock+0x7e/0x7e0
 kernfs_add_one+0x23/0x150
 kernfs_create_dir_ns+0x7a/0xb0
 sysfs_create_dir_ns+0x60/0xb0
 kobject_add_internal+0xc0/0x2c0
 kobject_add+0x6e/0x90
 btrfs_sysfs_add_block_group_type+0x102/0x160
 btrfs_make_block_group+0x167/0x230
 btrfs_alloc_chunk+0x54f/0xb80
 btrfs_chunk_alloc+0x18e/0x3a0
 find_free_extent+0xdf6/0x1210
 btrfs_reserve_extent+0xb3/0x1b0
 btrfs_alloc_tree_block+0xb0/0x310
 alloc_tree_block_no_bg_flush+0x4a/0x60
 __btrfs_cow_block+0x11a/0x530
 btrfs_cow_block+0x104/0x220
 btrfs_search_slot+0x52e/0x9d0
 btrfs_insert_empty_items+0x64/0xb0
 btrfs_new_inode+0x225/0x730
 btrfs_create+0xab/0x1f0
 lookup_open.isra.0+0x52d/0x690
 path_openat+0x2a7/0x9e0
 do_filp_open+0x75/0x100
 do_sys_openat2+0x7b/0x130
 __x64_sys_openat+0x46/0x70
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}:
 __mutex_lock+0x7e/0x7e0
 btrfs_chunk_alloc+0x125/0x3a0
 find_free_extent+0xdf6/0x1210
 btrfs_reserve_extent+0xb3/0x1b0
 btrfs_alloc_tree_block+0xb0/0x310
 alloc_tree_block_no_bg_flush+0x4a/0x60
 __btrfs_cow_block+0x11a/0x530
 btrfs_cow_block+0x104/0x220
 btrfs_search_slot+0x52e/0x9d0
 btrfs_lookup_inode+0x2a/0x8f
 __btrfs_update_delayed_inode+0x80/0x240
 btrfs_commit_inode_delayed_inode+0x119/0x120
 btrfs_evict_inode+0x357/0x500
 evict+0xcf/0x1f0
 do_unlinkat+0x1a9/0x2b0
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #0 (&delayed_node->mutex){+.+.}-{3:3}:
 __lock_acquire+0x119c/0x1fc0
 lock_acquire+0xa7/0x3d0
 __mutex_lock+0x7e/0x7e0
 __btrfs_release_delayed_node.part.0+0x3f/0x330
 btrfs_evict_inode+0x24c/0x500
 evict+0xcf/0x1f0
 dispose_list+0x48/0x70
 prune_icache_sb+0x44/0x50
 super_cache_scan+0x161/0x1e0
 do_shrink_slab+0x178/0x3c0
 shrink_slab+0x17c/0x290
 shrink_node+0x2b2/0x6d0
 balance_pgdat+0x30a/0x670
 kswapd+0x213/0x4c0
 kthread+0x138/0x160
 ret_from_fork+0x1f/0x30

  other info that might help us debug this:

  Chain exists of:
    &delayed_node->mutex --> kernfs_mutex --> fs_reclaim

   Possible unsafe locking scenario:

 CPU0                    CPU1
 ----                    ----
    lock(fs_reclaim);
 lock(kernfs_mutex);
 lock(fs_reclaim);
    lock(&delayed_node->mutex);

   *** DEADLOCK ***

  3 locks held by kswapd0/100:
   #0: ffffffff9fd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
   #1: ffffffff9fd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290
   #2: ffff9706629780e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0

  stack backtrace:
  CPU: 1 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #5
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
  Call Trace:
   dump_stack+0x8b/0xb8
   check_noncircular+0x12d/0x150
   __lock_acquire+0x119c/0x1fc0
   lock_acquire+0xa7/0x3d0
   ? __btrfs_release_delayed_node.part.0+0x3f/0x330
   __mutex_lock+0x7e/0x7e0
   ? __btrfs_release_delayed_node.part.0+0x3f/0x330
   ? __btrfs_release_delayed_node.part.0+0x3f/0x330
   ? lock_acquire+0xa7/0x3d0
   ? find_held_lock+0x2b/0x80
   __btrfs_release_delayed_node.part.0+0x3f/0x330
   btrfs_evict_inode+0x24c/0x500
   evict+0xcf/0x1f0
   dispose_list+0x48/0x70
   prune_icache_sb+0x44/0x50
   super_cache_scan+0x161/0x1e0
   do_shrink_slab+0x178/0x3c0
   shrink_slab+0x17c/0x290
   shrink_node+0x2b2/0x6d0
   balance_pgdat+0x30a/0x670
   kswapd+0x213/0x4c0
   ? _raw_spin_unlock_irqrestore+0x41/0x50
   ? add_wait_queue_exclusive+0x70/0x70
   ? balance_pgdat+0x670/0x670
   kthread+0x138/0x160
   ? kthread_create_worker_on_cpu+0x40/0x40
   ret_from_fork+0x1f/0x30

This happens because when we link in a block group with a new raid index
type we'll create the corresponding sysfs entries for it.  This is
problematic because while restriping we're holding the chunk_mutex, and
while mounting we're holding the tree locks.

Fixing this isn't pretty, we move the call to the sysfs stuff into the
btrfs_create_pending_block_groups() work, where we're not holding any
locks.  This creates a slight race where other threads could see that
there's no sysfs kobj for that raid type, and race to create the
sysfs dir.  Fix this by wrapping the creation in space_info->lock, so we
only get one thread calling kobject_add() for the new directory.  We
don't worry about the lock on cleanup as it only gets deleted on
unmount.

On mount it's more straightforward, we loop through the space_infos
already, just check every raid index in each space_info and added the
sysfs entries for the corresponding block groups.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/block-group.c
fs/btrfs/sysfs.c

index a3b2720..c0f1d68 100644 (file)
@@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache)
 {
        struct btrfs_space_info *space_info = cache->space_info;
        int index = btrfs_bg_flags_to_raid_index(cache->flags);
-       bool first = false;
 
        down_write(&space_info->groups_sem);
-       if (list_empty(&space_info->block_groups[index]))
-               first = true;
        list_add_tail(&cache->list, &space_info->block_groups[index]);
        up_write(&space_info->groups_sem);
-
-       if (first)
-               btrfs_sysfs_add_block_group_type(cache);
 }
 
 static struct btrfs_block_group *btrfs_create_block_group_cache(
@@ -2032,6 +2026,17 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
        }
 
        list_for_each_entry(space_info, &info->space_info, list) {
+               int i;
+
+               for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+                       if (list_empty(&space_info->block_groups[i]))
+                               continue;
+                       cache = list_first_entry(&space_info->block_groups[i],
+                                                struct btrfs_block_group,
+                                                list);
+                       btrfs_sysfs_add_block_group_type(cache);
+               }
+
                if (!(btrfs_get_alloc_profile(info, space_info->flags) &
                      (BTRFS_BLOCK_GROUP_RAID10 |
                       BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2091,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
                return;
 
        while (!list_empty(&trans->new_bgs)) {
+               int index;
+
                block_group = list_first_entry(&trans->new_bgs,
                                               struct btrfs_block_group,
                                               bg_list);
                if (ret)
                        goto next;
 
+               index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
                ret = insert_block_group_item(trans, block_group);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
@@ -2105,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
                if (ret)
                        btrfs_abort_transaction(trans, ret);
                add_block_group_free_space(trans, block_group);
+
+               /*
+                * If we restriped during balance, we may have added a new raid
+                * type, so now add the sysfs entries when it is safe to do so.
+                * We don't have to worry about locking here as it's handled in
+                * btrfs_sysfs_add_block_group_type.
+                */
+               if (block_group->space_info->block_group_kobjs[index] == NULL)
+                       btrfs_sysfs_add_block_group_type(block_group);
+
                /* Already aborted the transaction if it failed. */
 next:
                btrfs_delayed_refs_rsv_release(fs_info, 1);
index 13354de..279d926 100644 (file)
@@ -1137,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
 
        rkobj->flags = cache->flags;
        kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+       /*
+        * We call this either on mount, or if we've created a block group for a
+        * new index type while running (i.e. when restriping).  The running
+        * case is tricky because we could race with other threads, so we need
+        * to have this check to make sure we didn't already init the kobject.
+        *
+        * We don't have to protect on the free side because it only happens on
+        * unmount.
+        */
+       spin_lock(&space_info->lock);
+       if (space_info->block_group_kobjs[index]) {
+               spin_unlock(&space_info->lock);
+               kobject_put(&rkobj->kobj);
+               return;
+       } else {
+               space_info->block_group_kobjs[index] = &rkobj->kobj;
+       }
+       spin_unlock(&space_info->lock);
+
        ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
                          btrfs_bg_type_to_raid_name(rkobj->flags));
        memalloc_nofs_restore(nofs_flag);
        if (ret) {
+               spin_lock(&space_info->lock);
+               space_info->block_group_kobjs[index] = NULL;
+               spin_unlock(&space_info->lock);
                kobject_put(&rkobj->kobj);
                btrfs_warn(fs_info,
                        "failed to add kobject for block cache, ignoring");
                return;
        }
-
-       space_info->block_group_kobjs[index] = &rkobj->kobj;
 }
 
 /*