dm thin: Fix ABBA deadlock between shrink_slab and dm_pool_abort_metadata
authorZhihao Cheng <chengzhihao1@huawei.com>
Wed, 30 Nov 2022 13:31:34 +0000 (21:31 +0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 12 Jan 2023 10:58:52 +0000 (11:58 +0100)
commit 8111964f1b8524c4bb56b02cd9c7a37725ea21fd upstream.

Following concurrent processes:

          P1(drop cache)                P2(kworker)
drop_caches_sysctl_handler
 drop_slab
  shrink_slab
   down_read(&shrinker_rwsem)  - LOCK A
   do_shrink_slab
    super_cache_scan
     prune_icache_sb
      dispose_list
       evict
        ext4_evict_inode
 ext4_clear_inode
  ext4_discard_preallocations
   ext4_mb_load_buddy_gfp
    ext4_mb_init_cache
     ext4_read_block_bitmap_nowait
      ext4_read_bh_nowait
       submit_bh
        dm_submit_bio
                 do_worker
  process_deferred_bios
   commit
    metadata_operation_failed
     dm_pool_abort_metadata
      down_write(&pmd->root_lock) - LOCK B
                      __destroy_persistent_data_objects
       dm_block_manager_destroy
        dm_bufio_client_destroy
         unregister_shrinker
  down_write(&shrinker_rwsem)
 thin_map                            |
  dm_thin_find_block                 ↓
   down_read(&pmd->root_lock) --> ABBA deadlock

, which triggers hung task:

[   76.974820] INFO: task kworker/u4:3:63 blocked for more than 15 seconds.
[   76.976019]       Not tainted 6.1.0-rc4-00011-g8f17dd350364-dirty #910
[   76.978521] task:kworker/u4:3    state:D stack:0     pid:63    ppid:2
[   76.978534] Workqueue: dm-thin do_worker
[   76.978552] Call Trace:
[   76.978564]  __schedule+0x6ba/0x10f0
[   76.978582]  schedule+0x9d/0x1e0
[   76.978588]  rwsem_down_write_slowpath+0x587/0xdf0
[   76.978600]  down_write+0xec/0x110
[   76.978607]  unregister_shrinker+0x2c/0xf0
[   76.978616]  dm_bufio_client_destroy+0x116/0x3d0
[   76.978625]  dm_block_manager_destroy+0x19/0x40
[   76.978629]  __destroy_persistent_data_objects+0x5e/0x70
[   76.978636]  dm_pool_abort_metadata+0x8e/0x100
[   76.978643]  metadata_operation_failed+0x86/0x110
[   76.978649]  commit+0x6a/0x230
[   76.978655]  do_worker+0xc6e/0xd90
[   76.978702]  process_one_work+0x269/0x630
[   76.978714]  worker_thread+0x266/0x630
[   76.978730]  kthread+0x151/0x1b0
[   76.978772] INFO: task test.sh:2646 blocked for more than 15 seconds.
[   76.979756]       Not tainted 6.1.0-rc4-00011-g8f17dd350364-dirty #910
[   76.982111] task:test.sh         state:D stack:0     pid:2646  ppid:2459
[   76.982128] Call Trace:
[   76.982139]  __schedule+0x6ba/0x10f0
[   76.982155]  schedule+0x9d/0x1e0
[   76.982159]  rwsem_down_read_slowpath+0x4f4/0x910
[   76.982173]  down_read+0x84/0x170
[   76.982177]  dm_thin_find_block+0x4c/0xd0
[   76.982183]  thin_map+0x201/0x3d0
[   76.982188]  __map_bio+0x5b/0x350
[   76.982195]  dm_submit_bio+0x2b6/0x930
[   76.982202]  __submit_bio+0x123/0x2d0
[   76.982209]  submit_bio_noacct_nocheck+0x101/0x3e0
[   76.982222]  submit_bio_noacct+0x389/0x770
[   76.982227]  submit_bio+0x50/0xc0
[   76.982232]  submit_bh_wbc+0x15e/0x230
[   76.982238]  submit_bh+0x14/0x20
[   76.982241]  ext4_read_bh_nowait+0xc5/0x130
[   76.982247]  ext4_read_block_bitmap_nowait+0x340/0xc60
[   76.982254]  ext4_mb_init_cache+0x1ce/0xdc0
[   76.982259]  ext4_mb_load_buddy_gfp+0x987/0xfa0
[   76.982263]  ext4_discard_preallocations+0x45d/0x830
[   76.982274]  ext4_clear_inode+0x48/0xf0
[   76.982280]  ext4_evict_inode+0xcf/0xc70
[   76.982285]  evict+0x119/0x2b0
[   76.982290]  dispose_list+0x43/0xa0
[   76.982294]  prune_icache_sb+0x64/0x90
[   76.982298]  super_cache_scan+0x155/0x210
[   76.982303]  do_shrink_slab+0x19e/0x4e0
[   76.982310]  shrink_slab+0x2bd/0x450
[   76.982317]  drop_slab+0xcc/0x1a0
[   76.982323]  drop_caches_sysctl_handler+0xb7/0xe0
[   76.982327]  proc_sys_call_handler+0x1bc/0x300
[   76.982331]  proc_sys_write+0x17/0x20
[   76.982334]  vfs_write+0x3d3/0x570
[   76.982342]  ksys_write+0x73/0x160
[   76.982347]  __x64_sys_write+0x1e/0x30
[   76.982352]  do_syscall_64+0x35/0x80
[   76.982357]  entry_SYSCALL_64_after_hwframe+0x63/0xcd

Function metadata_operation_failed() is called when operations failed
on dm pool metadata, dm pool will destroy and recreate metadata. So,
shrinker will be unregistered and registered, which could down write
shrinker_rwsem under pmd_write_lock.

Fix it by allocating dm_block_manager before locking pmd->root_lock
and destroying old dm_block_manager after unlocking pmd->root_lock,
then old dm_block_manager is replaced with new dm_block_manager under
pmd->root_lock. So, shrinker register/unregister could be done without
holding pmd->root_lock.

Fetch a reproducer in [Link].

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216676
Cc: stable@vger.kernel.org #v5.2+
Fixes: e49e582965b3 ("dm thin: add read only and fail io modes")
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/md/dm-thin-metadata.c

index 0ada995..6e2612f 100644 (file)
@@ -776,13 +776,15 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
        return r;
 }
 
-static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
+static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
+                                             bool destroy_bm)
 {
        dm_sm_destroy(pmd->data_sm);
        dm_sm_destroy(pmd->metadata_sm);
        dm_tm_destroy(pmd->nb_tm);
        dm_tm_destroy(pmd->tm);
-       dm_block_manager_destroy(pmd->bm);
+       if (destroy_bm)
+               dm_block_manager_destroy(pmd->bm);
 }
 
 static int __begin_transaction(struct dm_pool_metadata *pmd)
@@ -989,7 +991,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
        }
        pmd_write_unlock(pmd);
        if (!pmd->fail_io)
-               __destroy_persistent_data_objects(pmd);
+               __destroy_persistent_data_objects(pmd, true);
 
        kfree(pmd);
        return 0;
@@ -1888,19 +1890,52 @@ static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
 int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
 {
        int r = -EINVAL;
+       struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
+
+       /* fail_io is double-checked with pmd->root_lock held below */
+       if (unlikely(pmd->fail_io))
+               return r;
+
+       /*
+        * Replacement block manager (new_bm) is created and old_bm destroyed outside of
+        * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
+        * shrinker associated with the block manager's bufio client vs pmd root_lock).
+        * - must take shrinker_rwsem without holding pmd->root_lock
+        */
+       new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+                                        THIN_MAX_CONCURRENT_LOCKS);
 
        pmd_write_lock(pmd);
-       if (pmd->fail_io)
+       if (pmd->fail_io) {
+               pmd_write_unlock(pmd);
                goto out;
+       }
 
        __set_abort_with_changes_flags(pmd);
-       __destroy_persistent_data_objects(pmd);
-       r = __create_persistent_data_objects(pmd, false);
+       __destroy_persistent_data_objects(pmd, false);
+       old_bm = pmd->bm;
+       if (IS_ERR(new_bm)) {
+               DMERR("could not create block manager during abort");
+               pmd->bm = NULL;
+               r = PTR_ERR(new_bm);
+               goto out_unlock;
+       }
+
+       pmd->bm = new_bm;
+       r = __open_or_format_metadata(pmd, false);
+       if (r) {
+               pmd->bm = NULL;
+               goto out_unlock;
+       }
+       new_bm = NULL;
+out_unlock:
        if (r)
                pmd->fail_io = true;
-
-out:
        pmd_write_unlock(pmd);
+       dm_block_manager_destroy(old_bm);
+out:
+       if (new_bm && !IS_ERR(new_bm))
+               dm_block_manager_destroy(new_bm);
 
        return r;
 }