btrfs: fix readahead hang and use-after-free after removing a device

[platform/kernel/linux-starfive.git] / fs / btrfs / reada.c
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c

index 243a2e4..d9a166e 100644 (file)
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -421,6 +421,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                 if (!dev->bdev)
                         continue;
  
+               if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
+                       continue;
+
                 if (dev_replace_is_ongoing &&
                     dev == fs_info->dev_replace.tgtdev) {
                         /*
@@ -445,6 +448,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
                 }
                 have_zone = 1;
         }
+       if (!have_zone)
+               radix_tree_delete(&fs_info->reada_tree, index);
         spin_unlock(&fs_info->reada_lock);
         up_read(&fs_info->dev_replace.rwsem);
  
@@ -767,31 +772,39 @@ static void reada_start_machine_worker(struct btrfs_work *work)
         kfree(rmw);
  }
  
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+/* Try to start up to 10k READA requests for a group of devices */
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
  {
-       struct btrfs_device *device;
-       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
         u64 enqueued;
         u64 total = 0;
-       int i;
+       struct btrfs_device *device;
  
-again:
         do {
                 enqueued = 0;
-               mutex_lock(&fs_devices->device_list_mutex);
                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
                         if (atomic_read(&device->reada_in_flight) <
                             MAX_IN_FLIGHT)
                                 enqueued += reada_start_machine_dev(device);
                 }
-               mutex_unlock(&fs_devices->device_list_mutex);
                 total += enqueued;
         } while (enqueued && total < 10000);
-       if (fs_devices->seed) {
-               fs_devices = fs_devices->seed;
-               goto again;
-       }
  
+       return total;
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+       int i;
+       u64 enqueued = 0;
+
+       mutex_lock(&fs_devices->device_list_mutex);
+
+       enqueued += reada_start_for_fsdevs(fs_devices);
+       list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+               enqueued += reada_start_for_fsdevs(seed_devs);
+
+       mutex_unlock(&fs_devices->device_list_mutex);
         if (enqueued == 0)
                 return;
  
@@ -1012,3 +1025,45 @@ void btrfs_reada_detach(void *handle)
  
         kref_put(&rc->refcnt, reada_control_release);
  }
+
+/*
+ * Before removing a device (device replace or device remove ioctls), call this
+ * function to wait for all existing readahead requests on the device and to
+ * make sure no one queues more readahead requests for the device.
+ *
+ * Must be called without holding either the device list mutex or the device
+ * replace semaphore, otherwise it will deadlock.
+ */
+void btrfs_reada_remove_dev(struct btrfs_device *dev)
+{
+       struct btrfs_fs_info *fs_info = dev->fs_info;
+
+       /* Serialize with readahead extent creation at reada_find_extent(). */
+       spin_lock(&fs_info->reada_lock);
+       set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+       spin_unlock(&fs_info->reada_lock);
+
+       /*
+        * There might be readahead requests added to the radix trees which
+        * were not yet added to the readahead work queue. We need to start
+        * them and wait for their completion, otherwise we can end up with
+        * use-after-free problems when dropping the last reference on the
+        * readahead extents and their zones, as they need to access the
+        * device structure.
+        */
+       reada_start_machine(fs_info);
+       btrfs_flush_workqueue(fs_info->readahead_workers);
+}
+
+/*
+ * If an error happens while removing a device (device replace or device remove
+ * ioctls) after btrfs_reada_remove_dev() was called, call this to undo what
+ * that function did. This is safe to call even if btrfs_reada_remove_dev() was
+ * not called before.
+ */
+void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
+{
+       spin_lock(&dev->fs_info->reada_lock);
+       clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
+       spin_unlock(&dev->fs_info->reada_lock);
+}