btrfs: fix deadlock between chunk allocation and chunk btree modifications
[platform/kernel/linux-rpi.git] / fs / btrfs / volumes.c
index 2ec3b8a..fa68efd 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/semaphore.h>
 #include <linux/uuid.h>
 #include <linux/list_sort.h>
+#include <linux/namei.h>
 #include "misc.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -529,15 +530,48 @@ error:
        return ret;
 }
 
-static bool device_path_matched(const char *path, struct btrfs_device *device)
+/*
+ * Check if @path resolves to the same block device (dev_t) as @device->name.
+ *
+ * Returns:
+ *   true  If it is the same device.
+ *   false If it is not the same device or on error.
+ */
+static bool device_matched(const struct btrfs_device *device, const char *path)
 {
-       int found;
+       char *device_name;
+       dev_t dev_old;
+       dev_t dev_new;
+       int ret;
+
+       /*
+        * If we are looking for a device with the matching dev_t, then skip
+        * device without a name (a missing device).
+        */
+       if (!device->name)
+               return false;
+
+       device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+       if (!device_name)
+               return false;
 
        rcu_read_lock();
-       found = strcmp(rcu_str_deref(device->name), path);
+       scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
        rcu_read_unlock();
 
-       return found == 0;
+       ret = lookup_bdev(device_name, &dev_old);
+       kfree(device_name);
+       if (ret)
+               return false;
+
+       ret = lookup_bdev(path, &dev_new);
+       if (ret)
+               return false;
+
+       if (dev_old == dev_new)
+               return true;
+
+       return false;
 }
 
 /*
@@ -570,9 +604,7 @@ static int btrfs_free_stale_devices(const char *path,
                                         &fs_devices->devices, dev_list) {
                        if (skip_device && skip_device == device)
                                continue;
-                       if (path && !device->name)
-                               continue;
-                       if (path && !device_path_matched(path, device))
+                       if (path && !device_matched(device, path))
                                continue;
                        if (fs_devices->opened) {
                                /* for an already deleted device return 0 */
@@ -1091,7 +1123,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
        list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
                __btrfs_free_extra_devids(seed_dev, &latest_dev);
 
-       fs_devices->latest_bdev = latest_dev->bdev;
+       fs_devices->latest_dev = latest_dev;
 
        mutex_unlock(&uuid_mutex);
 }
@@ -1122,8 +1154,10 @@ static void btrfs_close_one_device(struct btrfs_device *device)
        if (device->devid == BTRFS_DEV_REPLACE_DEVID)
                clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 
-       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+               clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                fs_devices->missing_devices--;
+       }
 
        btrfs_close_bdev(device);
        if (device->bdev) {
@@ -1222,7 +1256,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                return -EINVAL;
 
        fs_devices->opened = 1;
-       fs_devices->latest_bdev = latest_dev->bdev;
+       fs_devices->latest_dev = latest_dev;
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
        fs_devices->read_policy = BTRFS_READ_POLICY_PID;
@@ -1363,8 +1397,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
 
        bytenr_orig = btrfs_sb_offset(0);
        ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
-       if (ret)
-               return ERR_PTR(ret);
+       if (ret) {
+               device = ERR_PTR(ret);
+               goto error_bdev_put;
+       }
 
        disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
        if (IS_ERR(disk_super)) {
@@ -1843,8 +1879,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
 
+       btrfs_reserve_chunk_metadata(trans, true);
        ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
                                      &key, sizeof(*dev_item));
+       btrfs_trans_release_chunk_metadata(trans);
        if (ret)
                goto out;
 
@@ -1882,18 +1920,22 @@ out:
 /*
  * Function to update ctime/mtime for a given device path.
  * Mainly used for ctime/mtime based probe like libblkid.
+ *
+ * We don't care about errors here, this is just to be kind to userspace.
  */
-static void update_dev_time(struct block_device *bdev)
+static void update_dev_time(const char *device_path)
 {
-       struct inode *inode = bdev->bd_inode;
+       struct path path;
        struct timespec64 now;
+       int ret;
 
-       /* Shouldn't happen but just in case. */
-       if (!inode)
+       ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
+       if (ret)
                return;
 
-       now = current_time(inode);
-       generic_update_time(inode, &now, S_MTIME | S_CTIME);
+       now = current_time(d_inode(path.dentry));
+       inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
+       path_put(&path);
 }
 
 static int btrfs_rm_dev_item(struct btrfs_device *device)
@@ -1917,7 +1959,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
 
+       btrfs_reserve_chunk_metadata(trans, false);
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       btrfs_trans_release_chunk_metadata(trans);
        if (ret) {
                if (ret > 0)
                        ret = -ENOENT;
@@ -1986,7 +2030,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
 }
 
 /*
- * Helper function to check if the given device is part of s_bdev / latest_bdev
+ * Helper function to check if the given device is part of s_bdev / latest_dev
  * and replace it with the provided or the next active device, in the context
  * where this function called, there should be always be another device (or
  * this_dev) which is active.
@@ -2005,8 +2049,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
                        (fs_info->sb->s_bdev == device->bdev))
                fs_info->sb->s_bdev = next_device->bdev;
 
-       if (fs_info->fs_devices->latest_bdev == device->bdev)
-               fs_info->fs_devices->latest_bdev = next_device->bdev;
+       if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
+               fs_info->fs_devices->latest_dev = next_device;
 }
 
 /*
@@ -2069,7 +2113,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
 
        /* Update ctime/mtime for device path for libblkid */
-       update_dev_time(bdev);
+       update_dev_time(device_path);
 }
 
 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
@@ -2081,8 +2125,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        u64 num_devices;
        int ret = 0;
 
-       mutex_lock(&uuid_mutex);
-
+       /*
+        * The device list in fs_devices is accessed without locks (neither
+        * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+        * filesystem and another device rm cannot run.
+        */
        num_devices = btrfs_num_devices(fs_info);
 
        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
@@ -2126,11 +2173,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
                mutex_unlock(&fs_info->chunk_mutex);
        }
 
-       mutex_unlock(&uuid_mutex);
        ret = btrfs_shrink_device(device, 0);
        if (!ret)
                btrfs_reada_remove_dev(device);
-       mutex_lock(&uuid_mutex);
        if (ret)
                goto error_undo;
 
@@ -2217,7 +2262,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
        }
 
 out:
-       mutex_unlock(&uuid_mutex);
        return ret;
 
 error_undo:
@@ -2305,13 +2349,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
 
        mutex_unlock(&fs_devices->device_list_mutex);
 
-       /*
-        * The update_dev_time() with in btrfs_scratch_superblocks()
-        * may lead to a call to btrfs_show_devname() which will try
-        * to hold device_list_mutex. And here this device
-        * is already out of device list, so we don't have to hold
-        * the device_list_mutex lock.
-        */
        btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
                                  tgtdev->name->str);
 
@@ -2480,7 +2517,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
        key.type = BTRFS_DEV_ITEM_KEY;
 
        while (1) {
+               btrfs_reserve_chunk_metadata(trans, false);
                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+               btrfs_trans_release_chunk_metadata(trans);
                if (ret < 0)
                        goto error;
 
@@ -2594,7 +2633,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
        device->fs_info = fs_info;
        device->bdev = bdev;
 
-       ret = btrfs_get_dev_zone_info(device);
+       ret = btrfs_get_dev_zone_info(device, false);
        if (ret)
                goto error_free_device;
 
@@ -2627,6 +2666,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
                        btrfs_abort_transaction(trans, ret);
                        goto error_trans;
                }
+               btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
+                                               device);
        }
 
        device->fs_devices = fs_devices;
@@ -2733,7 +2774,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
        btrfs_forget_devices(device_path);
 
        /* Update ctime/mtime for blkid or udev */
-       update_dev_time(bdev);
+       update_dev_time(device_path);
 
        return ret;
 
@@ -2826,6 +2867,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
        struct btrfs_super_block *super_copy = fs_info->super_copy;
        u64 old_total;
        u64 diff;
+       int ret;
 
        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
                return -EACCES;
@@ -2854,7 +2896,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
                              &trans->transaction->dev_update_list);
        mutex_unlock(&fs_info->chunk_mutex);
 
-       return btrfs_update_device(trans, device);
+       btrfs_reserve_chunk_metadata(trans, false);
+       ret = btrfs_update_device(trans, device);
+       btrfs_trans_release_chunk_metadata(trans);
+
+       return ret;
 }
 
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
@@ -3096,7 +3142,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
                const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
                struct btrfs_block_group *sys_bg;
 
-               sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+               sys_bg = btrfs_create_chunk(trans, sys_flags);
                if (IS_ERR(sys_bg)) {
                        ret = PTR_ERR(sys_bg);
                        btrfs_abort_transaction(trans, ret);
@@ -4354,10 +4400,12 @@ static int balance_kthread(void *data)
        struct btrfs_fs_info *fs_info = data;
        int ret = 0;
 
+       sb_start_write(fs_info->sb);
        mutex_lock(&fs_info->balance_mutex);
        if (fs_info->balance_ctl)
                ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
        mutex_unlock(&fs_info->balance_mutex);
+       sb_end_write(fs_info->sb);
 
        return ret;
 }
@@ -4889,8 +4937,10 @@ again:
                        round_down(old_total - diff, fs_info->sectorsize));
        mutex_unlock(&fs_info->chunk_mutex);
 
+       btrfs_reserve_chunk_metadata(trans, false);
        /* Now btrfs_update_device() will change the on-disk size. */
        ret = btrfs_update_device(trans, device);
+       btrfs_trans_release_chunk_metadata(trans);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                btrfs_end_transaction(trans);
@@ -4973,7 +5023,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
 }
 
 /*
- * Structure used internally for __btrfs_alloc_chunk() function.
+ * Structure used internally for btrfs_create_chunk() function.
  * Wraps needed parameters.
  */
 struct alloc_chunk_ctl {
@@ -5377,7 +5427,7 @@ error_del_extent:
        return block_group;
 }
 
-struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
                                            u64 type)
 {
        struct btrfs_fs_info *info = trans->fs_info;
@@ -5578,12 +5628,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
         */
 
        alloc_profile = btrfs_metadata_alloc_profile(fs_info);
-       meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       meta_bg = btrfs_create_chunk(trans, alloc_profile);
        if (IS_ERR(meta_bg))
                return PTR_ERR(meta_bg);
 
        alloc_profile = btrfs_system_alloc_profile(fs_info);
-       sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       sys_bg = btrfs_create_chunk(trans, alloc_profile);
        if (IS_ERR(sys_bg))
                return PTR_ERR(sys_bg);
 
@@ -7482,6 +7532,19 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
        fs_info->fs_devices->total_rw_bytes = 0;
 
        /*
+        * Lockdep complains about possible circular locking dependency between
+        * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
+        * used for freeze protection of a fs (struct super_block.s_writers),
+        * which we take when starting a transaction, and extent buffers of the
+        * chunk tree if we call read_one_dev() while holding a lock on an
+        * extent buffer of the chunk tree. Since we are mounting the filesystem
+        * and at this point there can't be any concurrent task modifying the
+        * chunk tree, to keep it simple, just skip locking on the chunk tree.
+        */
+       ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
+       path->skip_locking = 1;
+
+       /*
         * Read all device items, and then all the chunk items. All
         * device items are found before any chunk item (their object id
         * is smaller than the lowest possible object id for a chunk
@@ -7506,10 +7569,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
                                goto error;
                        break;
                }
-               /*
-                * The nodes on level 1 are not locked but we don't need to do
-                * that during mount time as nothing else can access the tree
-                */
                node = path->nodes[1];
                if (node) {
                        if (last_ra_node != node->start) {
@@ -7537,7 +7596,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
                         * requirement for chunk allocation, see the comment on
                         * top of btrfs_chunk_alloc() for details.
                         */
-                       ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
                        ret = read_one_chunk(&found_key, leaf, chunk);
                        if (ret)
@@ -7551,12 +7609,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
         * do another round of validation checks.
         */
        if (total_dev != fs_info->fs_devices->total_devices) {
-               btrfs_err(fs_info,
-          "super_num_devices %llu mismatch with num_devices %llu found here",
+               btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
                          btrfs_super_num_devices(fs_info->super_copy),
                          total_dev);
-               ret = -EINVAL;
-               goto error;
+               fs_info->fs_devices->total_devices = total_dev;
+               btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
        }
        if (btrfs_super_total_bytes(fs_info->super_copy) <
            fs_info->fs_devices->total_rw_bytes) {
@@ -8173,10 +8231,12 @@ static int relocating_repair_kthread(void *data)
        target = cache->start;
        btrfs_put_block_group(cache);
 
+       sb_start_write(fs_info->sb);
        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
                btrfs_info(fs_info,
                           "zoned: skip relocating block group %llu to repair: EBUSY",
                           target);
+               sb_end_write(fs_info->sb);
                return -EBUSY;
        }
 
@@ -8204,6 +8264,7 @@ out:
                btrfs_put_block_group(cache);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        btrfs_exclop_finish(fs_info);
+       sb_end_write(fs_info->sb);
 
        return ret;
 }