1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/mm.h>
8 #include <linux/bio.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/ratelimit.h>
12 #include <linux/kthread.h>
13 #include <linux/raid/pq.h>
14 #include <linux/semaphore.h>
15 #include <linux/uuid.h>
16 #include <linux/list_sort.h>
17 #include <linux/namei.h>
18 #include "misc.h"
19 #include "ctree.h"
20 #include "extent_map.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "async-thread.h"
27 #include "check-integrity.h"
28 #include "rcu-string.h"
29 #include "dev-replace.h"
30 #include "sysfs.h"
31 #include "tree-checker.h"
32 #include "space-info.h"
33 #include "block-group.h"
34 #include "discard.h"
35 #include "zoned.h"
36
37 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
38         [BTRFS_RAID_RAID10] = {
39                 .sub_stripes    = 2,
40                 .dev_stripes    = 1,
41                 .devs_max       = 0,    /* 0 == as many as possible */
42                 .devs_min       = 2,
43                 .tolerated_failures = 1,
44                 .devs_increment = 2,
45                 .ncopies        = 2,
46                 .nparity        = 0,
47                 .raid_name      = "raid10",
48                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
49                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
50         },
51         [BTRFS_RAID_RAID1] = {
52                 .sub_stripes    = 1,
53                 .dev_stripes    = 1,
54                 .devs_max       = 2,
55                 .devs_min       = 2,
56                 .tolerated_failures = 1,
57                 .devs_increment = 2,
58                 .ncopies        = 2,
59                 .nparity        = 0,
60                 .raid_name      = "raid1",
61                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
62                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
63         },
64         [BTRFS_RAID_RAID1C3] = {
65                 .sub_stripes    = 1,
66                 .dev_stripes    = 1,
67                 .devs_max       = 3,
68                 .devs_min       = 3,
69                 .tolerated_failures = 2,
70                 .devs_increment = 3,
71                 .ncopies        = 3,
72                 .nparity        = 0,
73                 .raid_name      = "raid1c3",
74                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
75                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
76         },
77         [BTRFS_RAID_RAID1C4] = {
78                 .sub_stripes    = 1,
79                 .dev_stripes    = 1,
80                 .devs_max       = 4,
81                 .devs_min       = 4,
82                 .tolerated_failures = 3,
83                 .devs_increment = 4,
84                 .ncopies        = 4,
85                 .nparity        = 0,
86                 .raid_name      = "raid1c4",
87                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
88                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
89         },
90         [BTRFS_RAID_DUP] = {
91                 .sub_stripes    = 1,
92                 .dev_stripes    = 2,
93                 .devs_max       = 1,
94                 .devs_min       = 1,
95                 .tolerated_failures = 0,
96                 .devs_increment = 1,
97                 .ncopies        = 2,
98                 .nparity        = 0,
99                 .raid_name      = "dup",
100                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
101                 .mindev_error   = 0,
102         },
103         [BTRFS_RAID_RAID0] = {
104                 .sub_stripes    = 1,
105                 .dev_stripes    = 1,
106                 .devs_max       = 0,
107                 .devs_min       = 1,
108                 .tolerated_failures = 0,
109                 .devs_increment = 1,
110                 .ncopies        = 1,
111                 .nparity        = 0,
112                 .raid_name      = "raid0",
113                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
114                 .mindev_error   = 0,
115         },
116         [BTRFS_RAID_SINGLE] = {
117                 .sub_stripes    = 1,
118                 .dev_stripes    = 1,
119                 .devs_max       = 1,
120                 .devs_min       = 1,
121                 .tolerated_failures = 0,
122                 .devs_increment = 1,
123                 .ncopies        = 1,
124                 .nparity        = 0,
125                 .raid_name      = "single",
126                 .bg_flag        = 0,
127                 .mindev_error   = 0,
128         },
129         [BTRFS_RAID_RAID5] = {
130                 .sub_stripes    = 1,
131                 .dev_stripes    = 1,
132                 .devs_max       = 0,
133                 .devs_min       = 2,
134                 .tolerated_failures = 1,
135                 .devs_increment = 1,
136                 .ncopies        = 1,
137                 .nparity        = 1,
138                 .raid_name      = "raid5",
139                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
140                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
141         },
142         [BTRFS_RAID_RAID6] = {
143                 .sub_stripes    = 1,
144                 .dev_stripes    = 1,
145                 .devs_max       = 0,
146                 .devs_min       = 3,
147                 .tolerated_failures = 2,
148                 .devs_increment = 1,
149                 .ncopies        = 1,
150                 .nparity        = 2,
151                 .raid_name      = "raid6",
152                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
153                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
154         },
155 };
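/*
 * For illustration (an editorial sketch, not kernel code): the attributes
 * above are sufficient to derive how much usable data a chunk holds. A
 * hypothetical helper could look like:
 *
 *	static u64 usable_bytes(enum btrfs_raid_types type, int num_stripes,
 *				u64 stripe_len)
 *	{
 *		const struct btrfs_raid_attr *attr = &btrfs_raid_array[type];
 *
 *		return (num_stripes - attr->nparity) * stripe_len / attr->ncopies;
 *	}
 *
 * E.g. a RAID6 chunk with 6 stripes of 64K holds (6 - 2) * 64K of data,
 * while a RAID10 chunk with 4 stripes holds 4 * 64K / 2.
 */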
156
157 /*
158  * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
159  * can be used as an index to access btrfs_raid_array[].
160  */
161 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
162 {
163         if (flags & BTRFS_BLOCK_GROUP_RAID10)
164                 return BTRFS_RAID_RAID10;
165         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
166                 return BTRFS_RAID_RAID1;
167         else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
168                 return BTRFS_RAID_RAID1C3;
169         else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
170                 return BTRFS_RAID_RAID1C4;
171         else if (flags & BTRFS_BLOCK_GROUP_DUP)
172                 return BTRFS_RAID_DUP;
173         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
174                 return BTRFS_RAID_RAID0;
175         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
176                 return BTRFS_RAID_RAID5;
177         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
178                 return BTRFS_RAID_RAID6;
179
180         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
181 }
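/*
 * Note the priority implied by the if-chain above: should more than one
 * profile bit ever be set, RAID10 wins over RAID1, and so on down to
 * SINGLE, which is simply the absence of any profile bit.
 */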
182
183 const char *btrfs_bg_type_to_raid_name(u64 flags)
184 {
185         const int index = btrfs_bg_flags_to_raid_index(flags);
186
187         if (index >= BTRFS_NR_RAID_TYPES)
188                 return NULL;
189
190         return btrfs_raid_array[index].raid_name;
191 }
192
193 /*
194  * Fill @buf with a textual description of @bg_flags, writing no more than
195  * @size_buf bytes including the terminating null byte.
196  */
197 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
198 {
199         int i;
200         int ret;
201         char *bp = buf;
202         u64 flags = bg_flags;
203         u32 size_bp = size_buf;
204
205         if (!flags) {
206                 strcpy(bp, "NONE");
207                 return;
208         }
209
210 #define DESCRIBE_FLAG(flag, desc)                                               \
211         do {                                                            \
212                 if (flags & (flag)) {                                   \
213                         ret = snprintf(bp, size_bp, "%s|", (desc));     \
214                         if (ret < 0 || ret >= size_bp)                  \
215                                 goto out_overflow;                      \
216                         size_bp -= ret;                                 \
217                         bp += ret;                                      \
218                         flags &= ~(flag);                               \
219                 }                                                       \
220         } while (0)
221
222         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
223         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
224         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
225
226         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
227         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
228                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
229                               btrfs_raid_array[i].raid_name);
230 #undef DESCRIBE_FLAG
231
232         if (flags) {
233                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
234                 size_bp -= ret;
235         }
236
237         if (size_bp < size_buf)
238                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
239
240         /*
241          * The text is trimmed; it's up to the caller to provide a
242          * sufficiently large buffer.
243          */
244 out_overflow:;
245 }
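/*
 * Usage sketch (illustrative only): a caller passes a stack buffer, e.g.
 *
 *	char buf[128];
 *
 *	btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
 *				    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
 *
 * after which buf contains "data|raid1". Unknown residual bits are printed
 * as a hex value.
 */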
246
247 static int init_first_rw_device(struct btrfs_trans_handle *trans);
248 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
249 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
250 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
251 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
252                              enum btrfs_map_op op,
253                              u64 logical, u64 *length,
254                              struct btrfs_bio **bbio_ret,
255                              int mirror_num, int need_raid_map);
256
257 /*
258  * Device locking
259  * ==============
260  *
261  * There are several mutexes that protect manipulation of devices and low-level
262  * structures like chunks but not block groups, extents or files
263  *
264  * uuid_mutex (global lock)
265  * ------------------------
266  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
267  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
268  * device) or requested by the device= mount option
269  *
270  * the mutex can be very coarse and can cover long-running operations
271  *
272  * protects: updates to fs_devices counters like missing devices, rw devices,
273  * seeding, structure cloning, opening/closing devices at mount/umount time
274  *
275  * global::fs_devs - add, remove, updates to the global list
276  *
277  * does not protect: manipulation of the fs_devices::devices list in general,
278  * but in the mount context it could be used to exclude list modifications by,
279  * e.g., the scan ioctl
280  *
281  * btrfs_device::name - renames (write side), read is RCU
282  *
283  * fs_devices::device_list_mutex (per-fs, with RCU)
284  * ------------------------------------------------
285  * protects updates to fs_devices::devices, ie. adding and deleting
286  *
287  * simple list traversal with read-only actions can be done with RCU protection
288  *
289  * may be used to exclude some operations from running concurrently without any
290  * modifications to the list (see write_all_supers)
291  *
292  * Is not required at mount and close times, because our device list is
293  * protected by the uuid_mutex at that point.
294  *
295  * balance_mutex
296  * -------------
297  * protects balance structures (status, state) and context accessed from
298  * several places (internally, ioctl)
299  *
300  * chunk_mutex
301  * -----------
302  * protects chunks, adding or removing during allocation, trim or when a new
303  * device is added/removed. Additionally it also protects post_commit_list of
304  * individual devices, since they can be added to the transaction's
305  * post_commit_list only with chunk_mutex held.
306  *
307  * cleaner_mutex
308  * -------------
309  * a big lock that is held by the cleaner thread and prevents running subvolume
310  * cleaning together with relocation or delayed iputs
311  *
312  *
313  * Lock nesting
314  * ============
315  *
316  * uuid_mutex
317  *   device_list_mutex
318  *     chunk_mutex
319  *   balance_mutex
320  *
321  *
322  * Exclusive operations
323  * ====================
324  *
325  * Maintains the exclusivity of the following operations that apply to the
326  * whole filesystem and cannot run in parallel.
327  *
328  * - Balance (*)
329  * - Device add
330  * - Device remove
331  * - Device replace (*)
332  * - Resize
333  *
334  * The device operations (as above) can be in one of the following states:
335  *
336  * - Running state
337  * - Paused state
338  * - Completed state
339  *
340  * Only device operations marked with (*) can go into the Paused state for the
341  * following reasons:
342  *
343  * - ioctl (only Balance can be Paused through ioctl)
344  * - filesystem remounted as read-only
345  * - filesystem unmounted and mounted as read-only
346  * - system power-cycle and filesystem mounted as read-only
347  * - filesystem or device errors leading to forced read-only
348  *
349  * The status of an exclusive operation is set and cleared atomically.
350  * During the course of Paused state, fs_info::exclusive_operation remains set.
351  * A device operation in Paused or Running state can be canceled or resumed
352  * either by ioctl (Balance only) or when remounted as read-write.
353  * The exclusive status is cleared when the device operation is canceled or
354  * completed.
355  */
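/*
 * For illustration, a lock acquisition respecting the nesting documented
 * above would look like (sketch, not a real call site):
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 */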
356
357 DEFINE_MUTEX(uuid_mutex);
358 static LIST_HEAD(fs_uuids);
359 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
360 {
361         return &fs_uuids;
362 }
363
364 /*
365  * alloc_fs_devices - allocate struct btrfs_fs_devices
366  * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
367  * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
368  *
369  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
370  * The returned struct is not linked onto any lists and can be destroyed with
371  * kfree() right away.
372  */
373 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
374                                                  const u8 *metadata_fsid)
375 {
376         struct btrfs_fs_devices *fs_devs;
377
378         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
379         if (!fs_devs)
380                 return ERR_PTR(-ENOMEM);
381
382         mutex_init(&fs_devs->device_list_mutex);
383
384         INIT_LIST_HEAD(&fs_devs->devices);
385         INIT_LIST_HEAD(&fs_devs->alloc_list);
386         INIT_LIST_HEAD(&fs_devs->fs_list);
387         INIT_LIST_HEAD(&fs_devs->seed_list);
388         if (fsid)
389                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
390
391         if (metadata_fsid)
392                 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
393         else if (fsid)
394                 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
395
396         return fs_devs;
397 }
398
399 void btrfs_free_device(struct btrfs_device *device)
400 {
401         WARN_ON(!list_empty(&device->post_commit_list));
402         rcu_string_free(device->name);
403         extent_io_tree_release(&device->alloc_state);
404         bio_put(device->flush_bio);
405         btrfs_destroy_dev_zone_info(device);
406         kfree(device);
407 }
408
409 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
410 {
411         struct btrfs_device *device;
412         WARN_ON(fs_devices->opened);
413         while (!list_empty(&fs_devices->devices)) {
414                 device = list_entry(fs_devices->devices.next,
415                                     struct btrfs_device, dev_list);
416                 list_del(&device->dev_list);
417                 btrfs_free_device(device);
418         }
419         kfree(fs_devices);
420 }
421
422 void __exit btrfs_cleanup_fs_uuids(void)
423 {
424         struct btrfs_fs_devices *fs_devices;
425
426         while (!list_empty(&fs_uuids)) {
427                 fs_devices = list_entry(fs_uuids.next,
428                                         struct btrfs_fs_devices, fs_list);
429                 list_del(&fs_devices->fs_list);
430                 free_fs_devices(fs_devices);
431         }
432 }
433
434 static noinline struct btrfs_fs_devices *find_fsid(
435                 const u8 *fsid, const u8 *metadata_fsid)
436 {
437         struct btrfs_fs_devices *fs_devices;
438
439         ASSERT(fsid);
440
441         /* Handle non-split brain cases */
442         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
443                 if (metadata_fsid) {
444                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
445                             && memcmp(metadata_fsid, fs_devices->metadata_uuid,
446                                       BTRFS_FSID_SIZE) == 0)
447                                 return fs_devices;
448                 } else {
449                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
450                                 return fs_devices;
451                 }
452         }
453         return NULL;
454 }
455
456 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
457                                 struct btrfs_super_block *disk_super)
458 {
459
460         struct btrfs_fs_devices *fs_devices;
461
462         /*
463          * Handle a scanned device that has completed its fsid change but
464          * belongs to a fs_devices that was created by first scanning a
465          * device which didn't have its fsid/metadata_uuid changed at all
466          * and had the CHANGING_FSID_V2 flag set.
467          */
468         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
469                 if (fs_devices->fsid_change &&
470                     memcmp(disk_super->metadata_uuid, fs_devices->fsid,
471                            BTRFS_FSID_SIZE) == 0 &&
472                     memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
473                            BTRFS_FSID_SIZE) == 0) {
474                         return fs_devices;
475                 }
476         }
477         /*
478          * Handle a scanned device that has completed its fsid change but
479          * belongs to a fs_devices that was created by a device that has an
480          * outdated pair of fsid/metadata_uuid and the CHANGING_FSID_V2
481          * flag set.
482          */
483         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
484                 if (fs_devices->fsid_change &&
485                     memcmp(fs_devices->metadata_uuid,
486                            fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
487                     memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
488                            BTRFS_FSID_SIZE) == 0) {
489                         return fs_devices;
490                 }
491         }
492
493         return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
494 }
495
496
497 static int
498 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
499                       int flush, struct block_device **bdev,
500                       struct btrfs_super_block **disk_super)
501 {
502         int ret;
503
504         *bdev = blkdev_get_by_path(device_path, flags, holder);
505
506         if (IS_ERR(*bdev)) {
507                 ret = PTR_ERR(*bdev);
508                 goto error;
509         }
510
511         if (flush)
512                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
513         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
514         if (ret) {
515                 blkdev_put(*bdev, flags);
516                 goto error;
517         }
518         invalidate_bdev(*bdev);
519         *disk_super = btrfs_read_dev_super(*bdev);
520         if (IS_ERR(*disk_super)) {
521                 ret = PTR_ERR(*disk_super);
522                 blkdev_put(*bdev, flags);
523                 goto error;
524         }
525
526         return 0;
527
528 error:
529         *bdev = NULL;
530         return ret;
531 }
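/*
 * Note on the helper above: on any failure it drops the block device
 * reference it took and resets *bdev to NULL, so callers only need to
 * release *bdev and *disk_super after a successful return (see
 * btrfs_open_one_device() below for a typical caller).
 */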
532
533 /*
534  * Check if the device at @path matches the given struct btrfs_device.
535  *
536  * Returns:
537  *   true  If it is the same device.
538  *   false If it is not the same device or on error.
539  */
540 static bool device_matched(const struct btrfs_device *device, const char *path)
541 {
542         char *device_name;
543         dev_t dev_old;
544         dev_t dev_new;
545         int ret;
546
547         /*
548          * If we are looking for a device with the matching dev_t, then skip
549          * devices without a name (i.e. missing devices).
550          */
551         if (!device->name)
552                 return false;
553
554         device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
555         if (!device_name)
556                 return false;
557
558         rcu_read_lock();
559         scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
560         rcu_read_unlock();
561
562         ret = lookup_bdev(device_name, &dev_old);
563         kfree(device_name);
564         if (ret)
565                 return false;
566
567         ret = lookup_bdev(path, &dev_new);
568         if (ret)
569                 return false;
570
571         if (dev_old == dev_new)
572                 return true;
573
574         return false;
575 }
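/*
 * Note that the comparison above is by dev_t rather than by string, so two
 * different path spellings (e.g. a device node and a symlink to it) that
 * resolve to the same block device are treated as a match.
 */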
576
577 /*
578  *  Search and remove all stale devices (devices which are not mounted).
579  *  When both inputs are NULL, it will search and release all stale devices.
580  *  path:       Optional. When provided it will release all unmounted devices
581  *              matching this path only.
582  *  skip_dev:   Optional. Will skip this device when searching for stale
583  *              devices.
584  *  Return:     0 for success or if @path is NULL.
585  *              -EBUSY if @path is a mounted device.
586  *              -ENOENT if @path does not match any device in the list.
587  */
588 static int btrfs_free_stale_devices(const char *path,
589                                      struct btrfs_device *skip_device)
590 {
591         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
592         struct btrfs_device *device, *tmp_device;
593         int ret = 0;
594
595         lockdep_assert_held(&uuid_mutex);
596
597         if (path)
598                 ret = -ENOENT;
599
600         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
601
602                 mutex_lock(&fs_devices->device_list_mutex);
603                 list_for_each_entry_safe(device, tmp_device,
604                                          &fs_devices->devices, dev_list) {
605                         if (skip_device && skip_device == device)
606                                 continue;
607                         if (path && !device_matched(device, path))
608                                 continue;
609                         if (fs_devices->opened) {
610                                 /* for an already deleted device return 0 */
611                                 if (path && ret != 0)
612                                         ret = -EBUSY;
613                                 break;
614                         }
615
616                         /* delete the stale device */
617                         fs_devices->num_devices--;
618                         list_del(&device->dev_list);
619                         btrfs_free_device(device);
620
621                         ret = 0;
622                 }
623                 mutex_unlock(&fs_devices->device_list_mutex);
624
625                 if (fs_devices->num_devices == 0) {
626                         btrfs_sysfs_remove_fsid(fs_devices);
627                         list_del(&fs_devices->fs_list);
628                         free_fs_devices(fs_devices);
629                 }
630         }
631
632         return ret;
633 }
634
635 /*
636  * This is only used on mount, and we are protected from competing things
637  * messing with our fs_devices by the uuid_mutex, thus we do not need the
638  * fs_devices->device_list_mutex here.
639  */
640 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
641                         struct btrfs_device *device, fmode_t flags,
642                         void *holder)
643 {
644         struct request_queue *q;
645         struct block_device *bdev;
646         struct btrfs_super_block *disk_super;
647         u64 devid;
648         int ret;
649
650         if (device->bdev)
651                 return -EINVAL;
652         if (!device->name)
653                 return -EINVAL;
654
655         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
656                                     &bdev, &disk_super);
657         if (ret)
658                 return ret;
659
660         devid = btrfs_stack_device_id(&disk_super->dev_item);
661         if (devid != device->devid)
662                 goto error_free_page;
663
664         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
665                 goto error_free_page;
666
667         device->generation = btrfs_super_generation(disk_super);
668
669         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
670                 if (btrfs_super_incompat_flags(disk_super) &
671                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
672                         pr_err(
673                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
674                         goto error_free_page;
675                 }
676
677                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
678                 fs_devices->seeding = true;
679         } else {
680                 if (bdev_read_only(bdev))
681                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
682                 else
683                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
684         }
685
686         q = bdev_get_queue(bdev);
687         if (!blk_queue_nonrot(q))
688                 fs_devices->rotating = true;
689
690         device->bdev = bdev;
691         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
692         device->mode = flags;
693
694         fs_devices->open_devices++;
695         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
696             device->devid != BTRFS_DEV_REPLACE_DEVID) {
697                 fs_devices->rw_devices++;
698                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
699         }
700         btrfs_release_disk_super(disk_super);
701
702         return 0;
703
704 error_free_page:
705         btrfs_release_disk_super(disk_super);
706         blkdev_put(bdev, flags);
707
708         return -EINVAL;
709 }
710
711 /*
712  * Handle a scanned device having its CHANGING_FSID_V2 flag set while the
713  * fs_devices was created with a disk that has already completed its fsid
714  * change. Such a disk can belong to an fs which has its FSID changed or to
715  * one which doesn't. Handle both cases here.
716  */
717 static struct btrfs_fs_devices *find_fsid_inprogress(
718                                         struct btrfs_super_block *disk_super)
719 {
720         struct btrfs_fs_devices *fs_devices;
721
722         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
723                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
724                            BTRFS_FSID_SIZE) != 0 &&
725                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
726                            BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
727                         return fs_devices;
728                 }
729         }
730
731         return find_fsid(disk_super->fsid, NULL);
732 }
733
734
735 static struct btrfs_fs_devices *find_fsid_changed(
736                                         struct btrfs_super_block *disk_super)
737 {
738         struct btrfs_fs_devices *fs_devices;
739
740         /*
741          * Handles the case where the scanned device is part of an fs that had
742          * multiple successful changes of FSID but the currently scanned device
743          * didn't observe it, meaning its fsid will be different than theirs.
744          * We need to handle two subcases:
745          *  1 - The fs still continues to have different METADATA/FSID uuids.
746          *  2 - The fs is switched back to its original FSID (METADATA/FSID
747          *  are equal).
748          */
749         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
750                 /* Changed UUIDs */
751                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
752                            BTRFS_FSID_SIZE) != 0 &&
753                     memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
754                            BTRFS_FSID_SIZE) == 0 &&
755                     memcmp(fs_devices->fsid, disk_super->fsid,
756                            BTRFS_FSID_SIZE) != 0)
757                         return fs_devices;
758
759                 /* Unchanged UUIDs */
760                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
761                            BTRFS_FSID_SIZE) == 0 &&
762                     memcmp(fs_devices->fsid, disk_super->metadata_uuid,
763                            BTRFS_FSID_SIZE) == 0)
764                         return fs_devices;
765         }
766
767         return NULL;
768 }
769
770 static struct btrfs_fs_devices *find_fsid_reverted_metadata(
771                                 struct btrfs_super_block *disk_super)
772 {
773         struct btrfs_fs_devices *fs_devices;
774
775         /*
776          * Handle the case where the scanned device is part of an fs whose last
777          * metadata UUID change reverted it to the original FSID. At the same
778          * time fs_devices was first created by another constituent device
779          * which didn't fully observe the operation. This results in a
780          * btrfs_fs_devices created with metadata/fsid different AND
781          * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
782          * fs_devices equal to the FSID of the disk.
783          */
784         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
785                 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
786                            BTRFS_FSID_SIZE) != 0 &&
787                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
788                            BTRFS_FSID_SIZE) == 0 &&
789                     fs_devices->fsid_change)
790                         return fs_devices;
791         }
792
793         return NULL;
794 }
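/*
 * Summary of the find_fsid_*() helpers above, as dispatched from
 * device_list_add() below based on the scanned super block's flags:
 *
 *	CHANGING_FSID_V2 set, METADATA_UUID unset -> find_fsid_inprogress()
 *	CHANGING_FSID_V2 set, METADATA_UUID set   -> find_fsid_changed()
 *	METADATA_UUID set only                    -> find_fsid_with_metadata_uuid()
 *	neither flag set                          -> find_fsid_reverted_metadata(),
 *	                                             falling back to find_fsid()
 */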
795 /*
796  * Add a new device to the list of registered devices
797  *
798  * Returns:
799  * the device pointer which was just added or updated when successful
800  * an error pointer when failed
801  */
802 static noinline struct btrfs_device *device_list_add(const char *path,
803                            struct btrfs_super_block *disk_super,
804                            bool *new_device_added)
805 {
806         struct btrfs_device *device;
807         struct btrfs_fs_devices *fs_devices = NULL;
808         struct rcu_string *name;
809         u64 found_transid = btrfs_super_generation(disk_super);
810         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
811         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
812                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
813         bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
814                                         BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
815
816         if (fsid_change_in_progress) {
817                 if (!has_metadata_uuid)
818                         fs_devices = find_fsid_inprogress(disk_super);
819                 else
820                         fs_devices = find_fsid_changed(disk_super);
821         } else if (has_metadata_uuid) {
822                 fs_devices = find_fsid_with_metadata_uuid(disk_super);
823         } else {
824                 fs_devices = find_fsid_reverted_metadata(disk_super);
825                 if (!fs_devices)
826                         fs_devices = find_fsid(disk_super->fsid, NULL);
827         }
828
829
830         if (!fs_devices) {
831                 if (has_metadata_uuid)
832                         fs_devices = alloc_fs_devices(disk_super->fsid,
833                                                       disk_super->metadata_uuid);
834                 else
835                         fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
836
837                 if (IS_ERR(fs_devices))
838                         return ERR_CAST(fs_devices);
839
840                 fs_devices->fsid_change = fsid_change_in_progress;
841
842                 mutex_lock(&fs_devices->device_list_mutex);
843                 list_add(&fs_devices->fs_list, &fs_uuids);
844
845                 device = NULL;
846         } else {
847                 mutex_lock(&fs_devices->device_list_mutex);
848                 device = btrfs_find_device(fs_devices, devid,
849                                 disk_super->dev_item.uuid, NULL);
850
851                 /*
852                  * If this disk has been pulled into an fs devices created by
853                  * a device which had the CHANGING_FSID_V2 flag then replace the
854                  * metadata_uuid/fsid values of the fs_devices.
855                  */
856                 if (fs_devices->fsid_change &&
857                     found_transid > fs_devices->latest_generation) {
858                         memcpy(fs_devices->fsid, disk_super->fsid,
859                                         BTRFS_FSID_SIZE);
860
861                         if (has_metadata_uuid)
862                                 memcpy(fs_devices->metadata_uuid,
863                                        disk_super->metadata_uuid,
864                                        BTRFS_FSID_SIZE);
865                         else
866                                 memcpy(fs_devices->metadata_uuid,
867                                        disk_super->fsid, BTRFS_FSID_SIZE);
868
869                         fs_devices->fsid_change = false;
870                 }
871         }
872
873         if (!device) {
874                 if (fs_devices->opened) {
875                         mutex_unlock(&fs_devices->device_list_mutex);
876                         return ERR_PTR(-EBUSY);
877                 }
878
879                 device = btrfs_alloc_device(NULL, &devid,
880                                             disk_super->dev_item.uuid);
881                 if (IS_ERR(device)) {
882                         mutex_unlock(&fs_devices->device_list_mutex);
883                         /* we can safely leave the fs_devices entry around */
884                         return device;
885                 }
886
887                 name = rcu_string_strdup(path, GFP_NOFS);
888                 if (!name) {
889                         btrfs_free_device(device);
890                         mutex_unlock(&fs_devices->device_list_mutex);
891                         return ERR_PTR(-ENOMEM);
892                 }
893                 rcu_assign_pointer(device->name, name);
894
895                 list_add_rcu(&device->dev_list, &fs_devices->devices);
896                 fs_devices->num_devices++;
897
898                 device->fs_devices = fs_devices;
899                 *new_device_added = true;
900
901                 if (disk_super->label[0])
902                         pr_info(
903         "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
904                                 disk_super->label, devid, found_transid, path,
905                                 current->comm, task_pid_nr(current));
906                 else
907                         pr_info(
908         "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
909                                 disk_super->fsid, devid, found_transid, path,
910                                 current->comm, task_pid_nr(current));
911
912         } else if (!device->name || strcmp(device->name->str, path)) {
913                 /*
914                  * When the FS is already mounted.
915                  * 1. If you are here and if the device->name is NULL that
916                  *    means this device was missing at the time of FS mount.
917                  * 2. If you are here and if the device->name is different
918                  *    from 'path' that means either
919                  *      a. The same device disappeared and reappeared with a
920                  *         different name, or
921                  *      b. The missing-disk-which-was-replaced has
922                  *         reappeared now.
923                  *
924                  * We must allow 1 and 2a above, but 2b would be spurious
925                  * and unintentional.
926                  *
927                  * Further, in case of 1 and 2a above, the disk at 'path'
928                  * would have missed some transactions when it was away, and
929                  * in case of 2a the stale bdev has to be updated as well.
930                  * 2b must not be allowed at any time.
931                  */
932
933                 /*
934                  * For now, we do allow updating the btrfs_fs_device through
935                  * the btrfs dev scan CLI after the FS has been mounted.  We're
936                  * still tracking a problem where systems fail to mount by
937                  * subvolume id when we reject replacement on a mounted FS.
938                  */
939                 if (!fs_devices->opened && found_transid < device->generation) {
940                         /*
941                          * That is, if the FS is _not_ mounted and if you
942                          * are here, that means there is more than one
943                          * disk with the same uuid and devid. We keep the
944                          * one with the larger generation number or the
945                          * last-in if the generations are equal.
946                          */
947                         mutex_unlock(&fs_devices->device_list_mutex);
948                         return ERR_PTR(-EEXIST);
949                 }
950
951                 /*
952                  * We are going to replace the device path for a given devid;
953                  * make sure it's the same device if the device is mounted.
954                  */
955                 if (device->bdev) {
956                         int error;
957                         dev_t path_dev;
958
959                         error = lookup_bdev(path, &path_dev);
960                         if (error) {
961                                 mutex_unlock(&fs_devices->device_list_mutex);
962                                 return ERR_PTR(error);
963                         }
964
965                         if (device->bdev->bd_dev != path_dev) {
966                                 mutex_unlock(&fs_devices->device_list_mutex);
967                                 /*
968                                  * device->fs_info may not be reliable here, so
969                                  * pass in a NULL instead. This avoids a
970                                  * possible use-after-free when the fs_info and
971                                  * fs_info->sb are already torn down.
972                                  */
973                                 btrfs_warn_in_rcu(NULL,
974         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
975                                                   path, devid, found_transid,
976                                                   current->comm,
977                                                   task_pid_nr(current));
978                                 return ERR_PTR(-EEXIST);
979                         }
980                         btrfs_info_in_rcu(device->fs_info,
981         "devid %llu device path %s changed to %s scanned by %s (%d)",
982                                           devid, rcu_str_deref(device->name),
983                                           path, current->comm,
984                                           task_pid_nr(current));
985                 }
986
987                 name = rcu_string_strdup(path, GFP_NOFS);
988                 if (!name) {
989                         mutex_unlock(&fs_devices->device_list_mutex);
990                         return ERR_PTR(-ENOMEM);
991                 }
992                 rcu_string_free(device->name);
993                 rcu_assign_pointer(device->name, name);
994                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
995                         fs_devices->missing_devices--;
996                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
997                 }
998         }
999
1000         /*
1001          * Unmount does not free the btrfs_device struct but zeroes the
1002          * generation along with most of the other members. So just update
1003          * it back. We need it to pick the disk with the largest generation
1004          * (as above).
1005          */
1006         if (!fs_devices->opened) {
1007                 device->generation = found_transid;
1008                 fs_devices->latest_generation = max_t(u64, found_transid,
1009                                                 fs_devices->latest_generation);
1010         }
1011
1012         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
1013
1014         mutex_unlock(&fs_devices->device_list_mutex);
1015         return device;
1016 }
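/*
 * device_list_add() is the core of the scanning path: btrfs_scan_one_device()
 * below reads the super block off the disk and feeds it here, which is also
 * where a userspace "btrfs device scan" ultimately ends up for each probed
 * block device.
 */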
1017
1018 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
1019 {
1020         struct btrfs_fs_devices *fs_devices;
1021         struct btrfs_device *device;
1022         struct btrfs_device *orig_dev;
1023         int ret = 0;
1024
1025         lockdep_assert_held(&uuid_mutex);
1026
1027         fs_devices = alloc_fs_devices(orig->fsid, NULL);
1028         if (IS_ERR(fs_devices))
1029                 return fs_devices;
1030
1031         fs_devices->total_devices = orig->total_devices;
1032
1033         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1034                 struct rcu_string *name;
1035
1036                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
1037                                             orig_dev->uuid);
1038                 if (IS_ERR(device)) {
1039                         ret = PTR_ERR(device);
1040                         goto error;
1041                 }
1042
1043                 /*
1044                  * This is ok to do without the RCU read lock held because we
1045                  * hold the uuid_mutex, so nothing we touch here can disappear.
1046                  */
1047                 if (orig_dev->name) {
1048                         name = rcu_string_strdup(orig_dev->name->str,
1049                                         GFP_KERNEL);
1050                         if (!name) {
1051                                 btrfs_free_device(device);
1052                                 ret = -ENOMEM;
1053                                 goto error;
1054                         }
1055                         rcu_assign_pointer(device->name, name);
1056                 }
1057
1058                 list_add(&device->dev_list, &fs_devices->devices);
1059                 device->fs_devices = fs_devices;
1060                 fs_devices->num_devices++;
1061         }
1062         return fs_devices;
1063 error:
1064         free_fs_devices(fs_devices);
1065         return ERR_PTR(ret);
1066 }
1067
1068 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1069                                       struct btrfs_device **latest_dev)
1070 {
1071         struct btrfs_device *device, *next;
1072
1073         /* This is the initialized path; it is safe to release the devices. */
1074         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1075                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1076                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1077                                       &device->dev_state) &&
1078                             !test_bit(BTRFS_DEV_STATE_MISSING,
1079                                       &device->dev_state) &&
1080                             (!*latest_dev ||
1081                              device->generation > (*latest_dev)->generation)) {
1082                                 *latest_dev = device;
1083                         }
1084                         continue;
1085                 }
1086
1087                 /*
1088                  * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
1089                  * in btrfs_init_dev_replace(), so just continue.
1090                  */
1091                 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1092                         continue;
1093
1094                 if (device->bdev) {
1095                         blkdev_put(device->bdev, device->mode);
1096                         device->bdev = NULL;
1097                         fs_devices->open_devices--;
1098                 }
1099                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1100                         list_del_init(&device->dev_alloc_list);
1101                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1102                         fs_devices->rw_devices--;
1103                 }
1104                 list_del_init(&device->dev_list);
1105                 fs_devices->num_devices--;
1106                 btrfs_free_device(device);
1107         }
1108
1109 }
1110
1111 /*
1112  * After we have read the system tree and know the devids belonging to this
1113  * filesystem, remove any device which does not belong there.
1114  */
1115 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
1116 {
1117         struct btrfs_device *latest_dev = NULL;
1118         struct btrfs_fs_devices *seed_dev;
1119
1120         mutex_lock(&uuid_mutex);
1121         __btrfs_free_extra_devids(fs_devices, &latest_dev);
1122
1123         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1124                 __btrfs_free_extra_devids(seed_dev, &latest_dev);
1125
1126         fs_devices->latest_dev = latest_dev;
1127
1128         mutex_unlock(&uuid_mutex);
1129 }
1130
1131 static void btrfs_close_bdev(struct btrfs_device *device)
1132 {
1133         if (!device->bdev)
1134                 return;
1135
1136         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1137                 sync_blockdev(device->bdev);
1138                 invalidate_bdev(device->bdev);
1139         }
1140
1141         blkdev_put(device->bdev, device->mode);
1142 }
1143
1144 static void btrfs_close_one_device(struct btrfs_device *device)
1145 {
1146         struct btrfs_fs_devices *fs_devices = device->fs_devices;
1147
1148         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1149             device->devid != BTRFS_DEV_REPLACE_DEVID) {
1150                 list_del_init(&device->dev_alloc_list);
1151                 fs_devices->rw_devices--;
1152         }
1153
1154         if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1155                 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1156
1157         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1158                 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1159                 fs_devices->missing_devices--;
1160         }
1161
1162         btrfs_close_bdev(device);
1163         if (device->bdev) {
1164                 fs_devices->open_devices--;
1165                 device->bdev = NULL;
1166         }
1167         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1168         btrfs_destroy_dev_zone_info(device);
1169
1170         device->fs_info = NULL;
1171         atomic_set(&device->dev_stats_ccnt, 0);
1172         extent_io_tree_release(&device->alloc_state);
1173
1174         /*
1175          * Reset the flush error record. We might have a transient flush error
1176          * in this mount, and if so we aborted the current transaction and set
1177          * the fs to an error state, guaranteeing no super blocks can be further
1178          * committed. However, that error might be transient, and if we unmount
1179          * the filesystem and mount it again, we should allow the mount to succeed
1180          * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1181          * filesystem again we still get flush errors, then we will again abort
1182          * any transaction and set the error state, guaranteeing no commits of
1183          * unsafe super blocks.
1184          */
1185         device->last_flush_error = 0;
1186
1187         /* Verify the device is back in a pristine state */
1188         ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1189         ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1190         ASSERT(list_empty(&device->dev_alloc_list));
1191         ASSERT(list_empty(&device->post_commit_list));
1192         ASSERT(atomic_read(&device->reada_in_flight) == 0);
1193 }
1194
1195 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1196 {
1197         struct btrfs_device *device, *tmp;
1198
1199         lockdep_assert_held(&uuid_mutex);
1200
1201         if (--fs_devices->opened > 0)
1202                 return;
1203
1204         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1205                 btrfs_close_one_device(device);
1206
1207         WARN_ON(fs_devices->open_devices);
1208         WARN_ON(fs_devices->rw_devices);
1209         fs_devices->opened = 0;
1210         fs_devices->seeding = false;
1211         fs_devices->fs_info = NULL;
1212 }
1213
1214 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1215 {
1216         LIST_HEAD(list);
1217         struct btrfs_fs_devices *tmp;
1218
1219         mutex_lock(&uuid_mutex);
1220         close_fs_devices(fs_devices);
1221         if (!fs_devices->opened)
1222                 list_splice_init(&fs_devices->seed_list, &list);
1223
1224         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1225                 close_fs_devices(fs_devices);
1226                 list_del(&fs_devices->seed_list);
1227                 free_fs_devices(fs_devices);
1228         }
1229         mutex_unlock(&uuid_mutex);
1230 }
1231
1232 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1233                                 fmode_t flags, void *holder)
1234 {
1235         struct btrfs_device *device;
1236         struct btrfs_device *latest_dev = NULL;
1237         struct btrfs_device *tmp_device;
1238
1239         flags |= FMODE_EXCL;
1240
1241         list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1242                                  dev_list) {
1243                 int ret;
1244
1245                 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1246                 if (ret == 0 &&
1247                     (!latest_dev || device->generation > latest_dev->generation)) {
1248                         latest_dev = device;
1249                 } else if (ret == -ENODATA) {
1250                         fs_devices->num_devices--;
1251                         list_del(&device->dev_list);
1252                         btrfs_free_device(device);
1253                 }
1254         }
1255         if (fs_devices->open_devices == 0)
1256                 return -EINVAL;
1257
1258         fs_devices->opened = 1;
1259         fs_devices->latest_dev = latest_dev;
1260         fs_devices->total_rw_bytes = 0;
1261         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1262         fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1263
1264         return 0;
1265 }
1266
1267 static int devid_cmp(void *priv, const struct list_head *a,
1268                      const struct list_head *b)
1269 {
1270         const struct btrfs_device *dev1, *dev2;
1271
1272         dev1 = list_entry(a, struct btrfs_device, dev_list);
1273         dev2 = list_entry(b, struct btrfs_device, dev_list);
1274
1275         if (dev1->devid < dev2->devid)
1276                 return -1;
1277         else if (dev1->devid > dev2->devid)
1278                 return 1;
1279         return 0;
1280 }
1281
1282 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1283                        fmode_t flags, void *holder)
1284 {
1285         int ret;
1286
1287         lockdep_assert_held(&uuid_mutex);
1288         /*
1289          * The device_list_mutex cannot be taken here because opening the
1290          * underlying device may take further locks like open_mutex.
1291          *
1292          * We also don't need the lock here as this is called during mount and
1293          * exclusion is provided by the uuid_mutex.
1294          */
1295
1296         if (fs_devices->opened) {
1297                 fs_devices->opened++;
1298                 ret = 0;
1299         } else {
1300                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1301                 ret = open_fs_devices(fs_devices, flags, holder);
1302         }
1303
1304         return ret;
1305 }
1306
1307 void btrfs_release_disk_super(struct btrfs_super_block *super)
1308 {
1309         struct page *page = virt_to_page(super);
1310
1311         put_page(page);
1312 }
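/*
 * The super block handed out by btrfs_read_disk_super() below points into a
 * pagecache page, so releasing it above is just a put_page() on the backing
 * page, which virt_to_page() recovers from the (possibly offset) pointer.
 */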
1313
1314 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1315                                                        u64 bytenr, u64 bytenr_orig)
1316 {
1317         struct btrfs_super_block *disk_super;
1318         struct page *page;
1319         void *p;
1320         pgoff_t index;
1321
1322         /* make sure our super fits in the device */
1323         if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1324                 return ERR_PTR(-EINVAL);
1325
1326         /* make sure our super fits in the page */
1327         if (sizeof(*disk_super) > PAGE_SIZE)
1328                 return ERR_PTR(-EINVAL);
1329
1330         /* make sure our super doesn't straddle pages on disk */
1331         index = bytenr >> PAGE_SHIFT;
1332         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1333                 return ERR_PTR(-EINVAL);
1334
1335         /* pull in the page with our super */
1336         page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1337
1338         if (IS_ERR(page))
1339                 return ERR_CAST(page);
1340
1341         p = page_address(page);
1342
1343         /* align our pointer to the offset of the super block */
1344         disk_super = p + offset_in_page(bytenr);
1345
1346         if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
1347             btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1348                 btrfs_release_disk_super(p);
1349                 return ERR_PTR(-EINVAL);
1350         }
1351
1352         if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1353                 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1354
1355         return disk_super;
1356 }
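/*
 * Worked example for the straddle check above, assuming 4K pages and the 4K
 * super block: the primary super at bytenr 65536 maps to page index 16 and
 * ends exactly on the page boundary ((65536 + 4096 - 1) >> 12 == 16), so it
 * passes; a hypothetical bytenr of 65536 + 2048 would spill into page 17
 * and be rejected with -EINVAL.
 */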
1357
1358 int btrfs_forget_devices(const char *path)
1359 {
1360         int ret;
1361
1362         mutex_lock(&uuid_mutex);
1363         ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1364         mutex_unlock(&uuid_mutex);
1365
1366         return ret;
1367 }
1368
1369 /*
1370  * Look for a btrfs signature on a device. This may be called outside the mount
1371  * path and we are not allowed to call set_blocksize during the scan. The
1372  * superblock is read via the pagecache.
1373  */
1374 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1375                                            void *holder)
1376 {
1377         struct btrfs_super_block *disk_super;
1378         bool new_device_added = false;
1379         struct btrfs_device *device = NULL;
1380         struct block_device *bdev;
1381         u64 bytenr, bytenr_orig;
1382         int ret;
1383
1384         lockdep_assert_held(&uuid_mutex);
1385
1386         /*
1387          * We would like to check all the supers, but that would make
1388          * a btrfs mount succeed after a mkfs from a different FS.
1389          * So, we need to add a special mount option to scan for
1390          * later supers, using BTRFS_SUPER_MIRROR_MAX instead.
1391          */
1392         flags |= FMODE_EXCL;
1393
1394         bdev = blkdev_get_by_path(path, flags, holder);
1395         if (IS_ERR(bdev))
1396                 return ERR_CAST(bdev);
1397
1398         bytenr_orig = btrfs_sb_offset(0);
1399         ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
1400         if (ret) {
1401                 device = ERR_PTR(ret);
1402                 goto error_bdev_put;
1403         }
1404
1405         disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
1406         if (IS_ERR(disk_super)) {
1407                 device = ERR_CAST(disk_super);
1408                 goto error_bdev_put;
1409         }
1410
1411         device = device_list_add(path, disk_super, &new_device_added);
1412         if (!IS_ERR(device)) {
1413                 if (new_device_added)
1414                         btrfs_free_stale_devices(path, device);
1415         }
1416
1417         btrfs_release_disk_super(disk_super);
1418
1419 error_bdev_put:
1420         blkdev_put(bdev, flags);
1421
1422         return device;
1423 }
1424
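/*
 * Minimal usage sketch (hypothetical caller, not part of this file): scanning
 * a device roughly the way the device-scan ioctl path would, taking
 * uuid_mutex as required by the lockdep assertion above; "holder" stands in
 * for whatever opaque pointer the caller uses for blkdev exclusion:
 *
 *	struct btrfs_device *dev;
 *
 *	mutex_lock(&uuid_mutex);
 *	dev = btrfs_scan_one_device("/dev/sdb", FMODE_READ, holder);
 *	mutex_unlock(&uuid_mutex);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 */
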
1425 /*
1426  * Try to find a chunk that intersects [start, start + len] range and when one
1427  * such is found, record the end of it in *start
1428  */
1429 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1430                                     u64 len)
1431 {
1432         u64 physical_start, physical_end;
1433
1434         lockdep_assert_held(&device->fs_info->chunk_mutex);
1435
1436         if (!find_first_extent_bit(&device->alloc_state, *start,
1437                                    &physical_start, &physical_end,
1438                                    CHUNK_ALLOCATED, NULL)) {
1439
1440                 if (in_range(physical_start, *start, len) ||
1441                     in_range(*start, physical_start,
1442                              physical_end - physical_start)) {
1443                         *start = physical_end + 1;
1444                         return true;
1445                 }
1446         }
1447         return false;
1448 }
1449
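/*
 * Worked example (hypothetical numbers): with *start == 0 and len == SZ_1M,
 * if the first CHUNK_ALLOCATED range found is [SZ_512K, SZ_768K - 1], then
 * in_range(SZ_512K, 0, SZ_1M) is true, so *start is bumped to SZ_768K and
 * true is returned; the caller restarts its hole search from there.
 */
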
1450 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1451 {
1452         switch (device->fs_devices->chunk_alloc_policy) {
1453         case BTRFS_CHUNK_ALLOC_REGULAR:
1454                 /*
1455                  * We don't want to overwrite the superblock on the drive nor
1456                  * any area used by the boot loader (grub for example), so we
1457                  * make sure to start at an offset of at least 1MB.
1458                  */
1459                 return max_t(u64, start, SZ_1M);
1460         case BTRFS_CHUNK_ALLOC_ZONED:
1461                 /*
1462                  * Unlike the regular allocator, we don't care about the
1463                  * starting region, because we always use/reserve the first
1464                  * two zones for superblock logging.
1465                  */
1466                 return ALIGN(start, device->zone_info->zone_size);
1467         default:
1468                 BUG();
1469         }
1470 }
1471
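/*
 * Worked example (hypothetical values): with the regular policy, a requested
 * start of 4096 is clamped up to SZ_1M to protect the superblock and the
 * boot loader area; with the zoned policy and a 256M zone_size, a start of
 * 100M is rounded up by ALIGN() to the next zone boundary at 256M.
 */
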
1472 static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1473                                         u64 *hole_start, u64 *hole_size,
1474                                         u64 num_bytes)
1475 {
1476         u64 zone_size = device->zone_info->zone_size;
1477         u64 pos;
1478         int ret;
1479         bool changed = false;
1480
1481         ASSERT(IS_ALIGNED(*hole_start, zone_size));
1482
1483         while (*hole_size > 0) {
1484                 pos = btrfs_find_allocatable_zones(device, *hole_start,
1485                                                    *hole_start + *hole_size,
1486                                                    num_bytes);
1487                 if (pos != *hole_start) {
1488                         *hole_size = *hole_start + *hole_size - pos;
1489                         *hole_start = pos;
1490                         changed = true;
1491                         if (*hole_size < num_bytes)
1492                                 break;
1493                 }
1494
1495                 ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
1496
1497                 /* Range is ensured to be empty */
1498                 if (!ret)
1499                         return changed;
1500
1501                 /* Given hole range was invalid (outside of device) */
1502                 if (ret == -ERANGE) {
1503                         *hole_start += *hole_size;
1504                         *hole_size = 0;
1505                         return true;
1506                 }
1507
1508                 *hole_start += zone_size;
1509                 *hole_size -= zone_size;
1510                 changed = true;
1511         }
1512
1513         return changed;
1514 }
1515
1516 /**
1517  * dev_extent_hole_check - check if specified hole is suitable for allocation
1518  * @device:     the device which has the hole
1519  * @hole_start: starting position of the hole
1520  * @hole_size:  the size of the hole
1521  * @num_bytes:  the size of the free space that we need
1522  *
1523  * This function may modify @hole_start and @hole_size to reflect the
1524  * suitable position for allocation. Returns true if updated, false otherwise.
1525  */
1526 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1527                                   u64 *hole_size, u64 num_bytes)
1528 {
1529         bool changed = false;
1530         u64 hole_end = *hole_start + *hole_size;
1531
1532         for (;;) {
1533                 /*
1534                  * Check before we set max_hole_start, otherwise we could end up
1535                  * sending back this offset anyway.
1536                  */
1537                 if (contains_pending_extent(device, hole_start, *hole_size)) {
1538                         if (hole_end >= *hole_start)
1539                                 *hole_size = hole_end - *hole_start;
1540                         else
1541                                 *hole_size = 0;
1542                         changed = true;
1543                 }
1544
1545                 switch (device->fs_devices->chunk_alloc_policy) {
1546                 case BTRFS_CHUNK_ALLOC_REGULAR:
1547                         /* No extra check */
1548                         break;
1549                 case BTRFS_CHUNK_ALLOC_ZONED:
1550                         if (dev_extent_hole_check_zoned(device, hole_start,
1551                                                         hole_size, num_bytes)) {
1552                                 changed = true;
1553                                 /*
1554                                  * The changed hole can contain pending extent.
1555                                  * Loop again to check that.
1556                                  */
1557                                 continue;
1558                         }
1559                         break;
1560                 default:
1561                         BUG();
1562                 }
1563
1564                 break;
1565         }
1566
1567         return changed;
1568 }
1569
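/*
 * Worked example (hypothetical numbers, regular policy): for a hole at
 * *hole_start == 1M with *hole_size == 10M (so hole_end == 11M), a pending
 * chunk at [2M, 3M) makes contains_pending_extent() advance *hole_start to
 * 3M, after which *hole_size is recomputed as hole_end - *hole_start == 8M
 * and true is returned.
 */
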
1570 /*
1571  * find_free_dev_extent_start - find free space in the specified device
1572  * @device:       the device in which we search for the free space
1573  * @num_bytes:    the size of the free space that we need
1574  * @search_start: the position from which to begin the search
1575  * @start:        store the start of the free space
1576  * @len:          the size of the free space that we find, or the size
1577  *                of the max free space if we don't find suitable free space
1578  *
1579  * This uses a pretty simple search, the expectation is that it is
1580  * called very infrequently and that a given device has a small number
1581  * of extents.
1582  *
1583  * @start is used to store the start of the free space that we find. But if
1584  * we don't find suitable free space, it will be used to store the start
1585  * position of the max free space.
1586  *
1587  * @len is used to store the size of the free space that we find.
1588  * But if we don't find suitable free space, it is used to store the size of
1589  * the max free space.
1590  *
1591  * NOTE: This function searches the *commit* root of the device tree, and
1592  * does an extra check to ensure dev extents are not double allocated.
1593  * This makes the function safe for allocating dev extents but it may not
1594  * report correct usable device space, as a device extent freed in the
1595  * current transaction is not reported as available.
1596  */
1597 static int find_free_dev_extent_start(struct btrfs_device *device,
1598                                 u64 num_bytes, u64 search_start, u64 *start,
1599                                 u64 *len)
1600 {
1601         struct btrfs_fs_info *fs_info = device->fs_info;
1602         struct btrfs_root *root = fs_info->dev_root;
1603         struct btrfs_key key;
1604         struct btrfs_dev_extent *dev_extent;
1605         struct btrfs_path *path;
1606         u64 hole_size;
1607         u64 max_hole_start;
1608         u64 max_hole_size;
1609         u64 extent_end;
1610         u64 search_end = device->total_bytes;
1611         int ret;
1612         int slot;
1613         struct extent_buffer *l;
1614
1615         search_start = dev_extent_search_start(device, search_start);
1616
1617         WARN_ON(device->zone_info &&
1618                 !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
1619
1620         path = btrfs_alloc_path();
1621         if (!path)
1622                 return -ENOMEM;
1623
1624         max_hole_start = search_start;
1625         max_hole_size = 0;
1626
1627 again:
1628         if (search_start >= search_end ||
1629                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1630                 ret = -ENOSPC;
1631                 goto out;
1632         }
1633
1634         path->reada = READA_FORWARD;
1635         path->search_commit_root = 1;
1636         path->skip_locking = 1;
1637
1638         key.objectid = device->devid;
1639         key.offset = search_start;
1640         key.type = BTRFS_DEV_EXTENT_KEY;
1641
1642         ret = btrfs_search_backwards(root, &key, path);
1643         if (ret < 0)
1644                 goto out;
1645
1646         while (1) {
1647                 l = path->nodes[0];
1648                 slot = path->slots[0];
1649                 if (slot >= btrfs_header_nritems(l)) {
1650                         ret = btrfs_next_leaf(root, path);
1651                         if (ret == 0)
1652                                 continue;
1653                         if (ret < 0)
1654                                 goto out;
1655
1656                         break;
1657                 }
1658                 btrfs_item_key_to_cpu(l, &key, slot);
1659
1660                 if (key.objectid < device->devid)
1661                         goto next;
1662
1663                 if (key.objectid > device->devid)
1664                         break;
1665
1666                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1667                         goto next;
1668
1669                 if (key.offset > search_start) {
1670                         hole_size = key.offset - search_start;
1671                         dev_extent_hole_check(device, &search_start, &hole_size,
1672                                               num_bytes);
1673
1674                         if (hole_size > max_hole_size) {
1675                                 max_hole_start = search_start;
1676                                 max_hole_size = hole_size;
1677                         }
1678
1679                         /*
1680                          * If this free space is greater than what we need,
1681                          * it must be the max free space that we have found
1682                          * until now, so max_hole_start must point to the start
1683                          * of this free space and the length of this free space
1684                          * is stored in max_hole_size. Thus, we return
1685                          * max_hole_start and max_hole_size and go back to the
1686                          * caller.
1687                          */
1688                         if (hole_size >= num_bytes) {
1689                                 ret = 0;
1690                                 goto out;
1691                         }
1692                 }
1693
1694                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1695                 extent_end = key.offset + btrfs_dev_extent_length(l,
1696                                                                   dev_extent);
1697                 if (extent_end > search_start)
1698                         search_start = extent_end;
1699 next:
1700                 path->slots[0]++;
1701                 cond_resched();
1702         }
1703
1704         /*
1705          * At this point, search_start should be the end of
1706          * allocated dev extents, and when shrinking the device,
1707          * search_end may be smaller than search_start.
1708          */
1709         if (search_end > search_start) {
1710                 hole_size = search_end - search_start;
1711                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1712                                           num_bytes)) {
1713                         btrfs_release_path(path);
1714                         goto again;
1715                 }
1716
1717                 if (hole_size > max_hole_size) {
1718                         max_hole_start = search_start;
1719                         max_hole_size = hole_size;
1720                 }
1721         }
1722
1723         /* See above. */
1724         if (max_hole_size < num_bytes)
1725                 ret = -ENOSPC;
1726         else
1727                 ret = 0;
1728
1729 out:
1730         btrfs_free_path(path);
1731         *start = max_hole_start;
1732         if (len)
1733                 *len = max_hole_size;
1734         return ret;
1735 }
1736
1737 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1738                          u64 *start, u64 *len)
1739 {
1740         /* FIXME use last free of some kind */
1741         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1742 }
1743
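/*
 * Minimal usage sketch (hypothetical caller): asking for a 1 GiB device
 * extent. On -ENOSPC, @start and @len still describe the largest hole that
 * was found, which a caller could use to retry with a smaller size:
 *
 *	u64 start, len;
 *	int ret;
 *
 *	ret = find_free_dev_extent(device, SZ_1G, &start, &len);
 *	if (ret == -ENOSPC)
 *		pr_info("largest hole: %llu bytes at %llu\n", len, start);
 */
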
1744 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1745                           struct btrfs_device *device,
1746                           u64 start, u64 *dev_extent_len)
1747 {
1748         struct btrfs_fs_info *fs_info = device->fs_info;
1749         struct btrfs_root *root = fs_info->dev_root;
1750         int ret;
1751         struct btrfs_path *path;
1752         struct btrfs_key key;
1753         struct btrfs_key found_key;
1754         struct extent_buffer *leaf = NULL;
1755         struct btrfs_dev_extent *extent = NULL;
1756
1757         path = btrfs_alloc_path();
1758         if (!path)
1759                 return -ENOMEM;
1760
1761         key.objectid = device->devid;
1762         key.offset = start;
1763         key.type = BTRFS_DEV_EXTENT_KEY;
1764 again:
1765         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1766         if (ret > 0) {
1767                 ret = btrfs_previous_item(root, path, key.objectid,
1768                                           BTRFS_DEV_EXTENT_KEY);
1769                 if (ret)
1770                         goto out;
1771                 leaf = path->nodes[0];
1772                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1773                 extent = btrfs_item_ptr(leaf, path->slots[0],
1774                                         struct btrfs_dev_extent);
1775                 BUG_ON(found_key.offset > start || found_key.offset +
1776                        btrfs_dev_extent_length(leaf, extent) < start);
1777                 key = found_key;
1778                 btrfs_release_path(path);
1779                 goto again;
1780         } else if (ret == 0) {
1781                 leaf = path->nodes[0];
1782                 extent = btrfs_item_ptr(leaf, path->slots[0],
1783                                         struct btrfs_dev_extent);
1784         } else {
1785                 goto out;
1786         }
1787
1788         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1789
1790         ret = btrfs_del_item(trans, root, path);
1791         if (ret == 0)
1792                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1793 out:
1794         btrfs_free_path(path);
1795         return ret;
1796 }
1797
1798 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1799 {
1800         struct extent_map_tree *em_tree;
1801         struct extent_map *em;
1802         struct rb_node *n;
1803         u64 ret = 0;
1804
1805         em_tree = &fs_info->mapping_tree;
1806         read_lock(&em_tree->lock);
1807         n = rb_last(&em_tree->map.rb_root);
1808         if (n) {
1809                 em = rb_entry(n, struct extent_map, rb_node);
1810                 ret = em->start + em->len;
1811         }
1812         read_unlock(&em_tree->lock);
1813
1814         return ret;
1815 }
1816
1817 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1818                                     u64 *devid_ret)
1819 {
1820         int ret;
1821         struct btrfs_key key;
1822         struct btrfs_key found_key;
1823         struct btrfs_path *path;
1824
1825         path = btrfs_alloc_path();
1826         if (!path)
1827                 return -ENOMEM;
1828
1829         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1830         key.type = BTRFS_DEV_ITEM_KEY;
1831         key.offset = (u64)-1;
1832
1833         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1834         if (ret < 0)
1835                 goto error;
1836
1837         if (ret == 0) {
1838                 /* Corruption */
1839                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1840                 ret = -EUCLEAN;
1841                 goto error;
1842         }
1843
1844         ret = btrfs_previous_item(fs_info->chunk_root, path,
1845                                   BTRFS_DEV_ITEMS_OBJECTID,
1846                                   BTRFS_DEV_ITEM_KEY);
1847         if (ret) {
1848                 *devid_ret = 1;
1849         } else {
1850                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1851                                       path->slots[0]);
1852                 *devid_ret = found_key.offset + 1;
1853         }
1854         ret = 0;
1855 error:
1856         btrfs_free_path(path);
1857         return ret;
1858 }
1859
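/*
 * Worked example (hypothetical tree contents): with dev items for devids
 * {1, 2, 5}, the search for (DEV_ITEMS, DEV_ITEM, -1) returns ret > 0,
 * btrfs_previous_item() lands on the devid 5 item, and *devid_ret becomes 6.
 * If no previous dev item exists, *devid_ret falls back to 1.
 */
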
1860 /*
1861  * the device information is stored in the chunk root
1862  * the btrfs_device struct should be fully filled in
1863  */
1864 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1865                             struct btrfs_device *device)
1866 {
1867         int ret;
1868         struct btrfs_path *path;
1869         struct btrfs_dev_item *dev_item;
1870         struct extent_buffer *leaf;
1871         struct btrfs_key key;
1872         unsigned long ptr;
1873
1874         path = btrfs_alloc_path();
1875         if (!path)
1876                 return -ENOMEM;
1877
1878         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1879         key.type = BTRFS_DEV_ITEM_KEY;
1880         key.offset = device->devid;
1881
1882         btrfs_reserve_chunk_metadata(trans, true);
1883         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1884                                       &key, sizeof(*dev_item));
1885         btrfs_trans_release_chunk_metadata(trans);
1886         if (ret)
1887                 goto out;
1888
1889         leaf = path->nodes[0];
1890         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1891
1892         btrfs_set_device_id(leaf, dev_item, device->devid);
1893         btrfs_set_device_generation(leaf, dev_item, 0);
1894         btrfs_set_device_type(leaf, dev_item, device->type);
1895         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1896         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1897         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1898         btrfs_set_device_total_bytes(leaf, dev_item,
1899                                      btrfs_device_get_disk_total_bytes(device));
1900         btrfs_set_device_bytes_used(leaf, dev_item,
1901                                     btrfs_device_get_bytes_used(device));
1902         btrfs_set_device_group(leaf, dev_item, 0);
1903         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1904         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1905         btrfs_set_device_start_offset(leaf, dev_item, 0);
1906
1907         ptr = btrfs_device_uuid(dev_item);
1908         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1909         ptr = btrfs_device_fsid(dev_item);
1910         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1911                             ptr, BTRFS_FSID_SIZE);
1912         btrfs_mark_buffer_dirty(leaf);
1913
1914         ret = 0;
1915 out:
1916         btrfs_free_path(path);
1917         return ret;
1918 }
1919
1920 /*
1921  * Function to update ctime/mtime for a given device path.
1922  * Mainly used for ctime/mtime based probes like libblkid.
1923  *
1924  * We don't care about errors here, this is just to be kind to userspace.
1925  */
1926 static void update_dev_time(const char *device_path)
1927 {
1928         struct path path;
1929         struct timespec64 now;
1930         int ret;
1931
1932         ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1933         if (ret)
1934                 return;
1935
1936         now = current_time(d_inode(path.dentry));
1937         inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1938         path_put(&path);
1939 }
1940
1941 static int btrfs_rm_dev_item(struct btrfs_device *device)
1942 {
1943         struct btrfs_root *root = device->fs_info->chunk_root;
1944         int ret;
1945         struct btrfs_path *path;
1946         struct btrfs_key key;
1947         struct btrfs_trans_handle *trans;
1948
1949         path = btrfs_alloc_path();
1950         if (!path)
1951                 return -ENOMEM;
1952
1953         trans = btrfs_start_transaction(root, 0);
1954         if (IS_ERR(trans)) {
1955                 btrfs_free_path(path);
1956                 return PTR_ERR(trans);
1957         }
1958         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1959         key.type = BTRFS_DEV_ITEM_KEY;
1960         key.offset = device->devid;
1961
1962         btrfs_reserve_chunk_metadata(trans, false);
1963         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1964         btrfs_trans_release_chunk_metadata(trans);
1965         if (ret) {
1966                 if (ret > 0)
1967                         ret = -ENOENT;
1968                 btrfs_abort_transaction(trans, ret);
1969                 btrfs_end_transaction(trans);
1970                 goto out;
1971         }
1972
1973         ret = btrfs_del_item(trans, root, path);
1974         if (ret) {
1975                 btrfs_abort_transaction(trans, ret);
1976                 btrfs_end_transaction(trans);
1977         }
1978
1979 out:
1980         btrfs_free_path(path);
1981         if (!ret)
1982                 ret = btrfs_commit_transaction(trans);
1983         return ret;
1984 }
1985
1986 /*
1987  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1988  * filesystem. It's up to the caller to adjust that number regarding eg. device
1989  * replace.
1990  */
1991 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1992                 u64 num_devices)
1993 {
1994         u64 all_avail;
1995         unsigned seq;
1996         int i;
1997
1998         do {
1999                 seq = read_seqbegin(&fs_info->profiles_lock);
2000
2001                 all_avail = fs_info->avail_data_alloc_bits |
2002                             fs_info->avail_system_alloc_bits |
2003                             fs_info->avail_metadata_alloc_bits;
2004         } while (read_seqretry(&fs_info->profiles_lock, seq));
2005
2006         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2007                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
2008                         continue;
2009
2010                 if (num_devices < btrfs_raid_array[i].devs_min)
2011                         return btrfs_raid_array[i].mindev_error;
2012         }
2013
2014         return 0;
2015 }
2016
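/*
 * Worked example (hypothetical filesystem): on a two-device RAID1 fs,
 * removing a device means this is called with num_devices == 1; RAID1 has
 * devs_min == 2 in btrfs_raid_array, so BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET is
 * returned and the removal is refused.
 */
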
2017 static struct btrfs_device *btrfs_find_next_active_device(
2018                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
2019 {
2020         struct btrfs_device *next_device;
2021
2022         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
2023                 if (next_device != device &&
2024                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2025                     && next_device->bdev)
2026                         return next_device;
2027         }
2028
2029         return NULL;
2030 }
2031
2032 /*
2033  * Helper function to check if the given device is part of s_bdev / latest_dev
2034  * and replace it with the provided or the next active device. In the context
2035  * where this function is called, there should always be another device (or
2036  * next_device) which is active.
2037  */
2038 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2039                                             struct btrfs_device *next_device)
2040 {
2041         struct btrfs_fs_info *fs_info = device->fs_info;
2042
2043         if (!next_device)
2044                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2045                                                             device);
2046         ASSERT(next_device);
2047
2048         if (fs_info->sb->s_bdev &&
2049                         (fs_info->sb->s_bdev == device->bdev))
2050                 fs_info->sb->s_bdev = next_device->bdev;
2051
2052         if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
2053                 fs_info->fs_devices->latest_dev = next_device;
2054 }
2055
2056 /*
2057  * Return btrfs_fs_devices::num_devices excluding the device that's being
2058  * currently replaced.
2059  */
2060 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2061 {
2062         u64 num_devices = fs_info->fs_devices->num_devices;
2063
2064         down_read(&fs_info->dev_replace.rwsem);
2065         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2066                 ASSERT(num_devices > 1);
2067                 num_devices--;
2068         }
2069         up_read(&fs_info->dev_replace.rwsem);
2070
2071         return num_devices;
2072 }
2073
2074 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2075                                struct block_device *bdev,
2076                                const char *device_path)
2077 {
2078         struct btrfs_super_block *disk_super;
2079         int copy_num;
2080
2081         if (!bdev)
2082                 return;
2083
2084         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2085                 struct page *page;
2086                 int ret;
2087
2088                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2089                 if (IS_ERR(disk_super))
2090                         continue;
2091
2092                 if (bdev_is_zoned(bdev)) {
2093                         btrfs_reset_sb_log_zones(bdev, copy_num);
2094                         continue;
2095                 }
2096
2097                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2098
2099                 page = virt_to_page(disk_super);
2100                 set_page_dirty(page);
2101                 lock_page(page);
2102                 /* write_one_page() unlocks the page */
2103                 ret = write_one_page(page);
2104                 if (ret)
2105                         btrfs_warn(fs_info,
2106                                 "error clearing superblock number %d (%d)",
2107                                 copy_num, ret);
2108                 btrfs_release_disk_super(disk_super);
2109
2110         }
2111
2112         /* Notify udev that device has changed */
2113         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2114
2115         /* Update ctime/mtime for device path for libblkid */
2116         update_dev_time(device_path);
2117 }
2118
2119 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2120                     u64 devid, struct block_device **bdev, fmode_t *mode)
2121 {
2122         struct btrfs_device *device;
2123         struct btrfs_fs_devices *cur_devices;
2124         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2125         u64 num_devices;
2126         int ret = 0;
2127
2128         /*
2129          * The device list in fs_devices is accessed without locks (neither
2130          * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2131          * filesystem and another device rm cannot run.
2132          */
2133         num_devices = btrfs_num_devices(fs_info);
2134
2135         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2136         if (ret)
2137                 goto out;
2138
2139         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2140
2141         if (IS_ERR(device)) {
2142                 if (PTR_ERR(device) == -ENOENT &&
2143                     device_path && strcmp(device_path, "missing") == 0)
2144                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2145                 else
2146                         ret = PTR_ERR(device);
2147                 goto out;
2148         }
2149
2150         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2151                 btrfs_warn_in_rcu(fs_info,
2152                   "cannot remove device %s (devid %llu) due to active swapfile",
2153                                   rcu_str_deref(device->name), device->devid);
2154                 ret = -ETXTBSY;
2155                 goto out;
2156         }
2157
2158         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2159                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2160                 goto out;
2161         }
2162
2163         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2164             fs_info->fs_devices->rw_devices == 1) {
2165                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2166                 goto out;
2167         }
2168
2169         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2170                 mutex_lock(&fs_info->chunk_mutex);
2171                 list_del_init(&device->dev_alloc_list);
2172                 device->fs_devices->rw_devices--;
2173                 mutex_unlock(&fs_info->chunk_mutex);
2174         }
2175
2176         ret = btrfs_shrink_device(device, 0);
2177         if (!ret)
2178                 btrfs_reada_remove_dev(device);
2179         if (ret)
2180                 goto error_undo;
2181
2182         /*
2183          * TODO: the superblock still includes this device in its num_devices
2184          * counter although write_all_supers() is not locked out. This
2185          * could give a filesystem state which requires a degraded mount.
2186          */
2187         ret = btrfs_rm_dev_item(device);
2188         if (ret)
2189                 goto error_undo;
2190
2191         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2192         btrfs_scrub_cancel_dev(device);
2193
2194         /*
2195          * The device list mutex makes sure that we don't change
2196          * the device list while someone else is writing out all
2197          * the device supers. Whoever is writing all supers should
2198          * lock the device list mutex before getting the number of
2199          * devices in the super block (super_copy). Conversely,
2200          * whoever updates the number of devices in the super block
2201          * (super_copy) should hold the device list mutex.
2202          */
2203
2204         /*
2205          * In normal cases cur_devices == fs_devices. But when deleting a
2206          * seed device, cur_devices points to the seed device's own
2207          * fs_devices, listed under fs_devices->seed_list.
2208          */
2209         cur_devices = device->fs_devices;
2210         mutex_lock(&fs_devices->device_list_mutex);
2211         list_del_rcu(&device->dev_list);
2212
2213         cur_devices->num_devices--;
2214         cur_devices->total_devices--;
2215         /* Update total_devices of the parent fs_devices if it's seed */
2216         if (cur_devices != fs_devices)
2217                 fs_devices->total_devices--;
2218
2219         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2220                 cur_devices->missing_devices--;
2221
2222         btrfs_assign_next_active_device(device, NULL);
2223
2224         if (device->bdev) {
2225                 cur_devices->open_devices--;
2226                 /* remove sysfs entry */
2227                 btrfs_sysfs_remove_device(device);
2228         }
2229
2230         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2231         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2232         mutex_unlock(&fs_devices->device_list_mutex);
2233
2234         /*
2235          * At this point, the device is zero sized and detached from the
2236          * devices list.  All that's left is to zero out the old supers and
2237          * free the device.
2238          *
2239          * We cannot call btrfs_close_bdev() here because we're holding the sb
2240          * write lock, and blkdev_put() will pull in the ->open_mutex on the
2241          * block device and its dependencies.  Instead just flush the device
2242          * and let the caller do the final blkdev_put.
2243          */
2244         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2245                 btrfs_scratch_superblocks(fs_info, device->bdev,
2246                                           device->name->str);
2247                 if (device->bdev) {
2248                         sync_blockdev(device->bdev);
2249                         invalidate_bdev(device->bdev);
2250                 }
2251         }
2252
2253         *bdev = device->bdev;
2254         *mode = device->mode;
2255         synchronize_rcu();
2256         btrfs_free_device(device);
2257
2258         if (cur_devices->open_devices == 0) {
2259                 list_del_init(&cur_devices->seed_list);
2260                 close_fs_devices(cur_devices);
2261                 free_fs_devices(cur_devices);
2262         }
2263
2264 out:
2265         return ret;
2266
2267 error_undo:
2268         btrfs_reada_undo_remove_dev(device);
2269         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2270                 mutex_lock(&fs_info->chunk_mutex);
2271                 list_add(&device->dev_alloc_list,
2272                          &fs_devices->alloc_list);
2273                 device->fs_devices->rw_devices++;
2274                 mutex_unlock(&fs_info->chunk_mutex);
2275         }
2276         goto out;
2277 }
2278
2279 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2280 {
2281         struct btrfs_fs_devices *fs_devices;
2282
2283         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2284
2285         /*
2286          * In case of a filesystem with no seed, srcdev->fs_devices points to
2287          * the fs_devices of fs_info. However when the device being replaced
2288          * is a seed device, it points to the seed's local fs_devices. In
2289          * short, srcdev has its correct fs_devices in both cases.
2290          */
2291         fs_devices = srcdev->fs_devices;
2292
2293         list_del_rcu(&srcdev->dev_list);
2294         list_del(&srcdev->dev_alloc_list);
2295         fs_devices->num_devices--;
2296         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2297                 fs_devices->missing_devices--;
2298
2299         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2300                 fs_devices->rw_devices--;
2301
2302         if (srcdev->bdev)
2303                 fs_devices->open_devices--;
2304 }
2305
2306 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2307 {
2308         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2309
2310         mutex_lock(&uuid_mutex);
2311
2312         btrfs_close_bdev(srcdev);
2313         synchronize_rcu();
2314         btrfs_free_device(srcdev);
2315
2316         /* If there are no devices left, delete the fs_devices */
2317         if (!fs_devices->num_devices) {
2318                 /*
2319                  * On a mounted FS, num_devices can't be zero unless it's a
2320                  * seed. When a seed device is being replaced, the replace
2321                  * target is added to the sprout FS, so there will be no
2322                  * devices left under the seed FS.
2323                  */
2324                 ASSERT(fs_devices->seeding);
2325
2326                 list_del_init(&fs_devices->seed_list);
2327                 close_fs_devices(fs_devices);
2328                 free_fs_devices(fs_devices);
2329         }
2330         mutex_unlock(&uuid_mutex);
2331 }
2332
2333 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2334 {
2335         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2336
2337         mutex_lock(&fs_devices->device_list_mutex);
2338
2339         btrfs_sysfs_remove_device(tgtdev);
2340
2341         if (tgtdev->bdev)
2342                 fs_devices->open_devices--;
2343
2344         fs_devices->num_devices--;
2345
2346         btrfs_assign_next_active_device(tgtdev, NULL);
2347
2348         list_del_rcu(&tgtdev->dev_list);
2349
2350         mutex_unlock(&fs_devices->device_list_mutex);
2351
2352         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2353                                   tgtdev->name->str);
2354
2355         btrfs_close_bdev(tgtdev);
2356         synchronize_rcu();
2357         btrfs_free_device(tgtdev);
2358 }
2359
2360 static struct btrfs_device *btrfs_find_device_by_path(
2361                 struct btrfs_fs_info *fs_info, const char *device_path)
2362 {
2363         int ret = 0;
2364         struct btrfs_super_block *disk_super;
2365         u64 devid;
2366         u8 *dev_uuid;
2367         struct block_device *bdev;
2368         struct btrfs_device *device;
2369
2370         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2371                                     fs_info->bdev_holder, 0, &bdev, &disk_super);
2372         if (ret)
2373                 return ERR_PTR(ret);
2374
2375         devid = btrfs_stack_device_id(&disk_super->dev_item);
2376         dev_uuid = disk_super->dev_item.uuid;
2377         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2378                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2379                                            disk_super->metadata_uuid);
2380         else
2381                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2382                                            disk_super->fsid);
2383
2384         btrfs_release_disk_super(disk_super);
2385         if (!device)
2386                 device = ERR_PTR(-ENOENT);
2387         blkdev_put(bdev, FMODE_READ);
2388         return device;
2389 }
2390
2391 /*
2392  * Lookup a device given by device id, or the path if the id is 0.
2393  */
2394 struct btrfs_device *btrfs_find_device_by_devspec(
2395                 struct btrfs_fs_info *fs_info, u64 devid,
2396                 const char *device_path)
2397 {
2398         struct btrfs_device *device;
2399
2400         if (devid) {
2401                 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2402                                            NULL);
2403                 if (!device)
2404                         return ERR_PTR(-ENOENT);
2405                 return device;
2406         }
2407
2408         if (!device_path || !device_path[0])
2409                 return ERR_PTR(-EINVAL);
2410
2411         if (strcmp(device_path, "missing") == 0) {
2412                 /* Find first missing device */
2413                 list_for_each_entry(device, &fs_info->fs_devices->devices,
2414                                     dev_list) {
2415                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2416                                      &device->dev_state) && !device->bdev)
2417                                 return device;
2418                 }
2419                 return ERR_PTR(-ENOENT);
2420         }
2421
2422         return btrfs_find_device_by_path(fs_info, device_path);
2423 }
2424
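/*
 * Usage sketch (hypothetical values) for the three lookup flavors accepted
 * by the devspec helper above:
 *
 *	device = btrfs_find_device_by_devspec(fs_info, 3, NULL);
 *		(lookup by devid; any path is ignored)
 *	device = btrfs_find_device_by_devspec(fs_info, 0, "/dev/sdb");
 *		(lookup by path)
 *	device = btrfs_find_device_by_devspec(fs_info, 0, "missing");
 *		(first device present in the metadata but without a bdev)
 */
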
2425 /*
2426  * Does all the dirty work required for changing the filesystem's UUID.
2427  */
2428 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2429 {
2430         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2431         struct btrfs_fs_devices *old_devices;
2432         struct btrfs_fs_devices *seed_devices;
2433         struct btrfs_super_block *disk_super = fs_info->super_copy;
2434         struct btrfs_device *device;
2435         u64 super_flags;
2436
2437         lockdep_assert_held(&uuid_mutex);
2438         if (!fs_devices->seeding)
2439                 return -EINVAL;
2440
2441         /*
2442          * Private copy of the seed devices, anchored at
2443          * fs_info->fs_devices->seed_list
2444          */
2445         seed_devices = alloc_fs_devices(NULL, NULL);
2446         if (IS_ERR(seed_devices))
2447                 return PTR_ERR(seed_devices);
2448
2449         /*
2450          * It's necessary to retain a copy of the original seed fs_devices in
2451          * fs_uuids so that filesystems which have been seeded can successfully
2452          * reference the seed device from open_seed_devices. This also
2453          * supports multiple seed filesystems.
2454          */
2455         old_devices = clone_fs_devices(fs_devices);
2456         if (IS_ERR(old_devices)) {
2457                 kfree(seed_devices);
2458                 return PTR_ERR(old_devices);
2459         }
2460
2461         list_add(&old_devices->fs_list, &fs_uuids);
2462
2463         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2464         seed_devices->opened = 1;
2465         INIT_LIST_HEAD(&seed_devices->devices);
2466         INIT_LIST_HEAD(&seed_devices->alloc_list);
2467         mutex_init(&seed_devices->device_list_mutex);
2468
2469         mutex_lock(&fs_devices->device_list_mutex);
2470         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2471                               synchronize_rcu);
2472         list_for_each_entry(device, &seed_devices->devices, dev_list)
2473                 device->fs_devices = seed_devices;
2474
2475         fs_devices->seeding = false;
2476         fs_devices->num_devices = 0;
2477         fs_devices->open_devices = 0;
2478         fs_devices->missing_devices = 0;
2479         fs_devices->rotating = false;
2480         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2481
2482         generate_random_uuid(fs_devices->fsid);
2483         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2484         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2485         mutex_unlock(&fs_devices->device_list_mutex);
2486
2487         super_flags = btrfs_super_flags(disk_super) &
2488                       ~BTRFS_SUPER_FLAG_SEEDING;
2489         btrfs_set_super_flags(disk_super, super_flags);
2490
2491         return 0;
2492 }
2493
2494 /*
2495  * Store the expected generation for seed devices in device items.
2496  */
2497 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2498 {
2499         struct btrfs_fs_info *fs_info = trans->fs_info;
2500         struct btrfs_root *root = fs_info->chunk_root;
2501         struct btrfs_path *path;
2502         struct extent_buffer *leaf;
2503         struct btrfs_dev_item *dev_item;
2504         struct btrfs_device *device;
2505         struct btrfs_key key;
2506         u8 fs_uuid[BTRFS_FSID_SIZE];
2507         u8 dev_uuid[BTRFS_UUID_SIZE];
2508         u64 devid;
2509         int ret;
2510
2511         path = btrfs_alloc_path();
2512         if (!path)
2513                 return -ENOMEM;
2514
2515         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2516         key.offset = 0;
2517         key.type = BTRFS_DEV_ITEM_KEY;
2518
2519         while (1) {
2520                 btrfs_reserve_chunk_metadata(trans, false);
2521                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2522                 btrfs_trans_release_chunk_metadata(trans);
2523                 if (ret < 0)
2524                         goto error;
2525
2526                 leaf = path->nodes[0];
2527 next_slot:
2528                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2529                         ret = btrfs_next_leaf(root, path);
2530                         if (ret > 0)
2531                                 break;
2532                         if (ret < 0)
2533                                 goto error;
2534                         leaf = path->nodes[0];
2535                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2536                         btrfs_release_path(path);
2537                         continue;
2538                 }
2539
2540                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2541                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2542                     key.type != BTRFS_DEV_ITEM_KEY)
2543                         break;
2544
2545                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2546                                           struct btrfs_dev_item);
2547                 devid = btrfs_device_id(leaf, dev_item);
2548                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2549                                    BTRFS_UUID_SIZE);
2550                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2551                                    BTRFS_FSID_SIZE);
2552                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2553                                            fs_uuid);
2554                 BUG_ON(!device); /* Logic error */
2555
2556                 if (device->fs_devices->seeding) {
2557                         btrfs_set_device_generation(leaf, dev_item,
2558                                                     device->generation);
2559                         btrfs_mark_buffer_dirty(leaf);
2560                 }
2561
2562                 path->slots[0]++;
2563                 goto next_slot;
2564         }
2565         ret = 0;
2566 error:
2567         btrfs_free_path(path);
2568         return ret;
2569 }
2570
2571 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2572 {
2573         struct btrfs_root *root = fs_info->dev_root;
2574         struct request_queue *q;
2575         struct btrfs_trans_handle *trans;
2576         struct btrfs_device *device;
2577         struct block_device *bdev;
2578         struct super_block *sb = fs_info->sb;
2579         struct rcu_string *name;
2580         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2581         u64 orig_super_total_bytes;
2582         u64 orig_super_num_devices;
2583         int seeding_dev = 0;
2584         int ret = 0;
2585         bool locked = false;
2586
2587         if (sb_rdonly(sb) && !fs_devices->seeding)
2588                 return -EROFS;
2589
2590         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2591                                   fs_info->bdev_holder);
2592         if (IS_ERR(bdev))
2593                 return PTR_ERR(bdev);
2594
2595         if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2596                 ret = -EINVAL;
2597                 goto error;
2598         }
2599
2600         if (fs_devices->seeding) {
2601                 seeding_dev = 1;
2602                 down_write(&sb->s_umount);
2603                 mutex_lock(&uuid_mutex);
2604                 locked = true;
2605         }
2606
2607         sync_blockdev(bdev);
2608
2609         rcu_read_lock();
2610         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2611                 if (device->bdev == bdev) {
2612                         ret = -EEXIST;
2613                         rcu_read_unlock();
2614                         goto error;
2615                 }
2616         }
2617         rcu_read_unlock();
2618
2619         device = btrfs_alloc_device(fs_info, NULL, NULL);
2620         if (IS_ERR(device)) {
2621                 /* we can safely leave the fs_devices entry around */
2622                 ret = PTR_ERR(device);
2623                 goto error;
2624         }
2625
2626         name = rcu_string_strdup(device_path, GFP_KERNEL);
2627         if (!name) {
2628                 ret = -ENOMEM;
2629                 goto error_free_device;
2630         }
2631         rcu_assign_pointer(device->name, name);
2632
2633         device->fs_info = fs_info;
2634         device->bdev = bdev;
2635
2636         ret = btrfs_get_dev_zone_info(device, false);
2637         if (ret)
2638                 goto error_free_device;
2639
2640         trans = btrfs_start_transaction(root, 0);
2641         if (IS_ERR(trans)) {
2642                 ret = PTR_ERR(trans);
2643                 goto error_free_zone;
2644         }
2645
2646         q = bdev_get_queue(bdev);
2647         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2648         device->generation = trans->transid;
2649         device->io_width = fs_info->sectorsize;
2650         device->io_align = fs_info->sectorsize;
2651         device->sector_size = fs_info->sectorsize;
2652         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2653                                          fs_info->sectorsize);
2654         device->disk_total_bytes = device->total_bytes;
2655         device->commit_total_bytes = device->total_bytes;
2656         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2657         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2658         device->mode = FMODE_EXCL;
2659         device->dev_stats_valid = 1;
2660         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2661
2662         if (seeding_dev) {
2663                 btrfs_clear_sb_rdonly(sb);
2664                 ret = btrfs_prepare_sprout(fs_info);
2665                 if (ret) {
2666                         btrfs_abort_transaction(trans, ret);
2667                         goto error_trans;
2668                 }
2669                 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
2670                                                 device);
2671         }
2672
2673         device->fs_devices = fs_devices;
2674
2675         mutex_lock(&fs_devices->device_list_mutex);
2676         mutex_lock(&fs_info->chunk_mutex);
2677         list_add_rcu(&device->dev_list, &fs_devices->devices);
2678         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2679         fs_devices->num_devices++;
2680         fs_devices->open_devices++;
2681         fs_devices->rw_devices++;
2682         fs_devices->total_devices++;
2683         fs_devices->total_rw_bytes += device->total_bytes;
2684
2685         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2686
2687         if (!blk_queue_nonrot(q))
2688                 fs_devices->rotating = true;
2689
2690         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2691         btrfs_set_super_total_bytes(fs_info->super_copy,
2692                 round_down(orig_super_total_bytes + device->total_bytes,
2693                            fs_info->sectorsize));
2694
2695         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2696         btrfs_set_super_num_devices(fs_info->super_copy,
2697                                     orig_super_num_devices + 1);
2698
2699         /*
2700          * We've got more storage, clear any full flags on the space
2701          * infos.
2702          */
2703         btrfs_clear_space_info_full(fs_info);
2704
2705         mutex_unlock(&fs_info->chunk_mutex);
2706
2707         /* Add sysfs device entry */
2708         btrfs_sysfs_add_device(device);
2709
2710         mutex_unlock(&fs_devices->device_list_mutex);
2711
2712         if (seeding_dev) {
2713                 mutex_lock(&fs_info->chunk_mutex);
2714                 ret = init_first_rw_device(trans);
2715                 mutex_unlock(&fs_info->chunk_mutex);
2716                 if (ret) {
2717                         btrfs_abort_transaction(trans, ret);
2718                         goto error_sysfs;
2719                 }
2720         }
2721
2722         ret = btrfs_add_dev_item(trans, device);
2723         if (ret) {
2724                 btrfs_abort_transaction(trans, ret);
2725                 goto error_sysfs;
2726         }
2727
2728         if (seeding_dev) {
2729                 ret = btrfs_finish_sprout(trans);
2730                 if (ret) {
2731                         btrfs_abort_transaction(trans, ret);
2732                         goto error_sysfs;
2733                 }
2734
2735                 /*
2736                  * fs_devices now represents the newly sprouted filesystem and
2737                  * its fsid has been changed by btrfs_prepare_sprout
2738                  */
2739                 btrfs_sysfs_update_sprout_fsid(fs_devices);
2740         }
2741
2742         ret = btrfs_commit_transaction(trans);
2743
2744         if (seeding_dev) {
2745                 mutex_unlock(&uuid_mutex);
2746                 up_write(&sb->s_umount);
2747                 locked = false;
2748
2749                 if (ret) /* transaction commit */
2750                         return ret;
2751
2752                 ret = btrfs_relocate_sys_chunks(fs_info);
2753                 if (ret < 0)
2754                         btrfs_handle_fs_error(fs_info, ret,
2755                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2756                 trans = btrfs_attach_transaction(root);
2757                 if (IS_ERR(trans)) {
2758                         if (PTR_ERR(trans) == -ENOENT)
2759                                 return 0;
2760                         ret = PTR_ERR(trans);
2761                         trans = NULL;
2762                         goto error_sysfs;
2763                 }
2764                 ret = btrfs_commit_transaction(trans);
2765         }
2766
2767         /*
2768          * Now that we have written a new super block to this device, check
2769          * all the other fs_devices lists and forget device_path if it is
2770          * still registered there as an alien device.
2771          * We can ignore the return value as it typically returns -EINVAL and
2772          * only succeeds if the device was an alien.
2773          */
2774         btrfs_forget_devices(device_path);
2775
2776         /* Update ctime/mtime for blkid or udev */
2777         update_dev_time(device_path);
2778
2779         return ret;
2780
2781 error_sysfs:
2782         btrfs_sysfs_remove_device(device);
2783         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2784         mutex_lock(&fs_info->chunk_mutex);
2785         list_del_rcu(&device->dev_list);
2786         list_del(&device->dev_alloc_list);
2787         fs_info->fs_devices->num_devices--;
2788         fs_info->fs_devices->open_devices--;
2789         fs_info->fs_devices->rw_devices--;
2790         fs_info->fs_devices->total_devices--;
2791         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2792         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2793         btrfs_set_super_total_bytes(fs_info->super_copy,
2794                                     orig_super_total_bytes);
2795         btrfs_set_super_num_devices(fs_info->super_copy,
2796                                     orig_super_num_devices);
2797         mutex_unlock(&fs_info->chunk_mutex);
2798         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2799 error_trans:
2800         if (seeding_dev)
2801                 btrfs_set_sb_rdonly(sb);
2802         if (trans)
2803                 btrfs_end_transaction(trans);
2804 error_free_zone:
2805         btrfs_destroy_dev_zone_info(device);
2806 error_free_device:
2807         btrfs_free_device(device);
2808 error:
2809         blkdev_put(bdev, FMODE_EXCL);
2810         if (locked) {
2811                 mutex_unlock(&uuid_mutex);
2812                 up_write(&sb->s_umount);
2813         }
2814         return ret;
2815 }
2816
2817 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2818                                         struct btrfs_device *device)
2819 {
2820         int ret;
2821         struct btrfs_path *path;
2822         struct btrfs_root *root = device->fs_info->chunk_root;
2823         struct btrfs_dev_item *dev_item;
2824         struct extent_buffer *leaf;
2825         struct btrfs_key key;
2826
2827         path = btrfs_alloc_path();
2828         if (!path)
2829                 return -ENOMEM;
2830
2831         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2832         key.type = BTRFS_DEV_ITEM_KEY;
2833         key.offset = device->devid;
2834
2835         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2836         if (ret < 0)
2837                 goto out;
2838
2839         if (ret > 0) {
2840                 ret = -ENOENT;
2841                 goto out;
2842         }
2843
2844         leaf = path->nodes[0];
2845         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2846
2847         btrfs_set_device_id(leaf, dev_item, device->devid);
2848         btrfs_set_device_type(leaf, dev_item, device->type);
2849         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2850         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2851         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2852         btrfs_set_device_total_bytes(leaf, dev_item,
2853                                      btrfs_device_get_disk_total_bytes(device));
2854         btrfs_set_device_bytes_used(leaf, dev_item,
2855                                     btrfs_device_get_bytes_used(device));
2856         btrfs_mark_buffer_dirty(leaf);
2857
2858 out:
2859         btrfs_free_path(path);
2860         return ret;
2861 }
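
/*
 * Note (illustrative, not from the kernel tree): a device item lives in the
 * chunk tree and is addressed by the key tuple built above, i.e.
 * (BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY, device->devid), so
 * btrfs_update_device() is a plain search-and-rewrite of that one item.
 */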
2862
2863 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2864                       struct btrfs_device *device, u64 new_size)
2865 {
2866         struct btrfs_fs_info *fs_info = device->fs_info;
2867         struct btrfs_super_block *super_copy = fs_info->super_copy;
2868         u64 old_total;
2869         u64 diff;
2870         int ret;
2871
2872         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2873                 return -EACCES;
2874
2875         new_size = round_down(new_size, fs_info->sectorsize);
2876
2877         mutex_lock(&fs_info->chunk_mutex);
2878         old_total = btrfs_super_total_bytes(super_copy);
2879         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2880
2881         if (new_size <= device->total_bytes ||
2882             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2883                 mutex_unlock(&fs_info->chunk_mutex);
2884                 return -EINVAL;
2885         }
2886
2887         btrfs_set_super_total_bytes(super_copy,
2888                         round_down(old_total + diff, fs_info->sectorsize));
2889         device->fs_devices->total_rw_bytes += diff;
2890
2891         btrfs_device_set_total_bytes(device, new_size);
2892         btrfs_device_set_disk_total_bytes(device, new_size);
2893         btrfs_clear_space_info_full(device->fs_info);
2894         if (list_empty(&device->post_commit_list))
2895                 list_add_tail(&device->post_commit_list,
2896                               &trans->transaction->dev_update_list);
2897         mutex_unlock(&fs_info->chunk_mutex);
2898
2899         btrfs_reserve_chunk_metadata(trans, false);
2900         ret = btrfs_update_device(trans, device);
2901         btrfs_trans_release_chunk_metadata(trans);
2902
2903         return ret;
2904 }
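
/*
 * Minimal usage sketch (hypothetical caller, assuming the pattern used by
 * the resize ioctl): growing a device happens inside a transaction, and the
 * new size is expected to be sector aligned and larger than the current one:
 *
 *	trans = btrfs_start_transaction(fs_info->dev_root, 0);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	ret = btrfs_grow_device(trans, device, new_size);
 *	btrfs_commit_transaction(trans);
 */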
2905
2906 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2907 {
2908         struct btrfs_fs_info *fs_info = trans->fs_info;
2909         struct btrfs_root *root = fs_info->chunk_root;
2910         int ret;
2911         struct btrfs_path *path;
2912         struct btrfs_key key;
2913
2914         path = btrfs_alloc_path();
2915         if (!path)
2916                 return -ENOMEM;
2917
2918         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2919         key.offset = chunk_offset;
2920         key.type = BTRFS_CHUNK_ITEM_KEY;
2921
2922         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2923         if (ret < 0)
2924                 goto out;
2925         else if (ret > 0) { /* Logic error or corruption */
2926                 btrfs_handle_fs_error(fs_info, -ENOENT,
2927                                       "Failed lookup while freeing chunk.");
2928                 ret = -ENOENT;
2929                 goto out;
2930         }
2931
2932         ret = btrfs_del_item(trans, root, path);
2933         if (ret < 0)
2934                 btrfs_handle_fs_error(fs_info, ret,
2935                                       "Failed to delete chunk item.");
2936 out:
2937         btrfs_free_path(path);
2938         return ret;
2939 }
2940
2941 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2942 {
2943         struct btrfs_super_block *super_copy = fs_info->super_copy;
2944         struct btrfs_disk_key *disk_key;
2945         struct btrfs_chunk *chunk;
2946         u8 *ptr;
2947         int ret = 0;
2948         u32 num_stripes;
2949         u32 array_size;
2950         u32 len = 0;
2951         u32 cur;
2952         struct btrfs_key key;
2953
2954         lockdep_assert_held(&fs_info->chunk_mutex);
2955         array_size = btrfs_super_sys_array_size(super_copy);
2956
2957         ptr = super_copy->sys_chunk_array;
2958         cur = 0;
2959
2960         while (cur < array_size) {
2961                 disk_key = (struct btrfs_disk_key *)ptr;
2962                 btrfs_disk_key_to_cpu(&key, disk_key);
2963
2964                 len = sizeof(*disk_key);
2965
2966                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2967                         chunk = (struct btrfs_chunk *)(ptr + len);
2968                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2969                         len += btrfs_chunk_item_size(num_stripes);
2970                 } else {
2971                         ret = -EIO;
2972                         break;
2973                 }
2974                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2975                     key.offset == chunk_offset) {
2976                         memmove(ptr, ptr + len, array_size - (cur + len));
2977                         array_size -= len;
2978                         btrfs_set_super_sys_array_size(super_copy, array_size);
2979                 } else {
2980                         ptr += len;
2981                         cur += len;
2982                 }
2983         }
2984         return ret;
2985 }
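
/*
 * Layout assumed by the scan above: sys_chunk_array in the superblock is a
 * packed sequence of (struct btrfs_disk_key, struct btrfs_chunk + stripes)
 * pairs:
 *
 *	[key 0][chunk 0 + stripes][key 1][chunk 1 + stripes]...
 *
 * Deleting an entry memmove()s the tail down over it and shrinks the
 * recorded array size accordingly.
 */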
2986
2987 /*
2988  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2989  * @logical: Logical block offset in bytes.
2990  * @length: Length of extent in bytes.
2991  *
2992  * Return: Chunk mapping or ERR_PTR.
2993  */
2994 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2995                                        u64 logical, u64 length)
2996 {
2997         struct extent_map_tree *em_tree;
2998         struct extent_map *em;
2999
3000         em_tree = &fs_info->mapping_tree;
3001         read_lock(&em_tree->lock);
3002         em = lookup_extent_mapping(em_tree, logical, length);
3003         read_unlock(&em_tree->lock);
3004
3005         if (!em) {
3006                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
3007                            logical, length);
3008                 return ERR_PTR(-EINVAL);
3009         }
3010
3011         if (em->start > logical || em->start + em->len < logical) {
3012                 btrfs_crit(fs_info,
3013                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
3014                            logical, length, em->start, em->start + em->len);
3015                 free_extent_map(em);
3016                 return ERR_PTR(-EINVAL);
3017         }
3018
3019         /* callers are responsible for dropping em's ref. */
3020         return em;
3021 }
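
/*
 * Caller sketch (illustrative only): look up the chunk map covering a
 * logical range and drop the reference when done with it:
 *
 *	struct extent_map *em;
 *	struct map_lookup *map;
 *
 *	em = btrfs_get_chunk_map(fs_info, logical, len);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	... use map->num_stripes / map->stripes[i] ...
 *	free_extent_map(em);
 */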
3022
3023 static int remove_chunk_item(struct btrfs_trans_handle *trans,
3024                              struct map_lookup *map, u64 chunk_offset)
3025 {
3026         int i;
3027
3028         /*
3029          * Removing chunk items and updating the device items in the chunks btree
3030          * requires holding the chunk_mutex.
3031          * See the comment at btrfs_chunk_alloc() for the details.
3032          */
3033         lockdep_assert_held(&trans->fs_info->chunk_mutex);
3034
3035         for (i = 0; i < map->num_stripes; i++) {
3036                 int ret;
3037
3038                 ret = btrfs_update_device(trans, map->stripes[i].dev);
3039                 if (ret)
3040                         return ret;
3041         }
3042
3043         return btrfs_free_chunk(trans, chunk_offset);
3044 }
3045
3046 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3047 {
3048         struct btrfs_fs_info *fs_info = trans->fs_info;
3049         struct extent_map *em;
3050         struct map_lookup *map;
3051         u64 dev_extent_len = 0;
3052         int i, ret = 0;
3053         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3054
3055         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3056         if (IS_ERR(em)) {
3057                 /*
3058                  * This is a logic error, but we don't want to just rely on the
3059                  * user having built with ASSERT enabled, so if ASSERT doesn't
3060                  * do anything we still error out.
3061                  */
3062                 ASSERT(0);
3063                 return PTR_ERR(em);
3064         }
3065         map = em->map_lookup;
3066
3067         /*
3068          * First delete the device extent items from the devices btree.
3069          * We take the device_list_mutex to avoid racing with the finishing phase
3070          * of a device replace operation. See the comment below before acquiring
3071          * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
3072          * because that can result in a deadlock when deleting the device extent
3073          * items from the devices btree - COWing an extent buffer from the btree
3074          * may result in allocating a new metadata chunk, which would attempt to
3075          * lock fs_info->chunk_mutex again.
3076          */
3077         mutex_lock(&fs_devices->device_list_mutex);
3078         for (i = 0; i < map->num_stripes; i++) {
3079                 struct btrfs_device *device = map->stripes[i].dev;
3080                 ret = btrfs_free_dev_extent(trans, device,
3081                                             map->stripes[i].physical,
3082                                             &dev_extent_len);
3083                 if (ret) {
3084                         mutex_unlock(&fs_devices->device_list_mutex);
3085                         btrfs_abort_transaction(trans, ret);
3086                         goto out;
3087                 }
3088
3089                 if (device->bytes_used > 0) {
3090                         mutex_lock(&fs_info->chunk_mutex);
3091                         btrfs_device_set_bytes_used(device,
3092                                         device->bytes_used - dev_extent_len);
3093                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3094                         btrfs_clear_space_info_full(fs_info);
3095                         mutex_unlock(&fs_info->chunk_mutex);
3096                 }
3097         }
3098         mutex_unlock(&fs_devices->device_list_mutex);
3099
3100         /*
3101          * We acquire fs_info->chunk_mutex for 2 reasons:
3102          *
3103          * 1) Just like with the first phase of the chunk allocation, we must
3104          *    reserve system space, do all chunk btree updates and deletions, and
3105          *    update the system chunk array in the superblock while holding this
3106          *    mutex. This is for similar reasons as explained in the comment at
3107          *    the top of btrfs_chunk_alloc();
3108          *
3109          * 2) Prevent races with the final phase of a device replace operation
3110          *    that replaces the device object associated with the map's stripes,
3111          *    because the device object's id can change at any time during that
3112          *    final phase of the device replace operation
3113          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
3114          *    replaced device and then see it with an ID of
3115          *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3116          *    the device item, which does not exist in the chunk btree.
3117          *    The finishing phase of device replace acquires both the
3118          *    device_list_mutex and the chunk_mutex, in that order, so we are
3119          *    safe by just acquiring the chunk_mutex.
3120          */
3121         trans->removing_chunk = true;
3122         mutex_lock(&fs_info->chunk_mutex);
3123
3124         check_system_chunk(trans, map->type);
3125
3126         ret = remove_chunk_item(trans, map, chunk_offset);
3127         /*
3128          * Normally we should not get -ENOSPC since we reserved space before
3129          * through the call to check_system_chunk().
3130          *
3131          * Despite our system space_info having enough free space, we may not
3132          * be able to allocate extents from its block groups, because all have
3133          * an incompatible profile, which will force us to allocate a new system
3134          * block group with the right profile, or right after we called
3135          * check_system_chunk() above, a scrub turned the only system block group
3136          * with enough free space into RO mode.
3137          * This is explained in more detail at do_chunk_alloc().
3138          *
3139          * So if we get -ENOSPC, allocate a new system chunk and retry once.
3140          */
3141         if (ret == -ENOSPC) {
3142                 const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
3143                 struct btrfs_block_group *sys_bg;
3144
3145                 sys_bg = btrfs_create_chunk(trans, sys_flags);
3146                 if (IS_ERR(sys_bg)) {
3147                         ret = PTR_ERR(sys_bg);
3148                         btrfs_abort_transaction(trans, ret);
3149                         goto out;
3150                 }
3151
3152                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3153                 if (ret) {
3154                         btrfs_abort_transaction(trans, ret);
3155                         goto out;
3156                 }
3157
3158                 ret = remove_chunk_item(trans, map, chunk_offset);
3159                 if (ret) {
3160                         btrfs_abort_transaction(trans, ret);
3161                         goto out;
3162                 }
3163         } else if (ret) {
3164                 btrfs_abort_transaction(trans, ret);
3165                 goto out;
3166         }
3167
3168         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3169
3170         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3171                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3172                 if (ret) {
3173                         btrfs_abort_transaction(trans, ret);
3174                         goto out;
3175                 }
3176         }
3177
3178         mutex_unlock(&fs_info->chunk_mutex);
3179         trans->removing_chunk = false;
3180
3181         /*
3182          * We are done with chunk btree updates and deletions, so release the
3183          * system space we previously reserved (with check_system_chunk()).
3184          */
3185         btrfs_trans_release_chunk_metadata(trans);
3186
3187         ret = btrfs_remove_block_group(trans, chunk_offset, em);
3188         if (ret) {
3189                 btrfs_abort_transaction(trans, ret);
3190                 goto out;
3191         }
3192
3193 out:
3194         if (trans->removing_chunk) {
3195                 mutex_unlock(&fs_info->chunk_mutex);
3196                 trans->removing_chunk = false;
3197         }
3198         /* once for us */
3199         free_extent_map(em);
3200         return ret;
3201 }
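
/*
 * Lock-ordering summary for btrfs_remove_chunk(), restating the comments
 * inside it: device extent deletion runs under device_list_mutex only,
 * while chunk item removal and the superblock array update run under
 * chunk_mutex. Device replace finishing takes device_list_mutex and then
 * chunk_mutex, so holding the respective mutex at each stage is enough to
 * serialize against it.
 */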
3202
3203 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3204 {
3205         struct btrfs_root *root = fs_info->chunk_root;
3206         struct btrfs_trans_handle *trans;
3207         struct btrfs_block_group *block_group;
3208         u64 length;
3209         int ret;
3210
3211         /*
3212          * Prevent races with automatic removal of unused block groups.
3213          * After we relocate and before we remove the chunk with offset
3214          * chunk_offset, automatic removal of the block group can kick in,
3215          * resulting in a failure when calling btrfs_remove_chunk() below.
3216          *
3217          * Make sure to acquire this mutex before doing a tree search (dev
3218          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3219          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3220          * we release the path used to search the chunk/dev tree and before
3221          * the current task acquires this mutex and calls us.
3222          */
3223         lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3224
3225         /* step one, relocate all the extents inside this chunk */
3226         btrfs_scrub_pause(fs_info);
3227         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3228         btrfs_scrub_continue(fs_info);
3229         if (ret)
3230                 return ret;
3231
3232         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3233         if (!block_group)
3234                 return -ENOENT;
3235         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3236         length = block_group->length;
3237         btrfs_put_block_group(block_group);
3238
3239         /*
3240          * On a zoned file system, discard the whole block group; this will
3241          * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3242          * resetting the zone fails, don't treat it as a fatal problem from the
3243          * filesystem's point of view.
3244          */
3245         if (btrfs_is_zoned(fs_info)) {
3246                 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
3247                 if (ret)
3248                         btrfs_info(fs_info,
3249                                 "failed to reset zone %llu after relocation",
3250                                 chunk_offset);
3251         }
3252
3253         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3254                                                      chunk_offset);
3255         if (IS_ERR(trans)) {
3256                 ret = PTR_ERR(trans);
3257                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3258                 return ret;
3259         }
3260
3261         /*
3262          * step two, delete the device extents and the
3263          * chunk tree entries
3264          */
3265         ret = btrfs_remove_chunk(trans, chunk_offset);
3266         btrfs_end_transaction(trans);
3267         return ret;
3268 }
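
/*
 * Caller sketch (illustrative): per the lockdep assertion above, callers
 * hold reclaim_bgs_lock across the chunk lookup and the relocation:
 *
 *	mutex_lock(&fs_info->reclaim_bgs_lock);
 *	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
 *	mutex_unlock(&fs_info->reclaim_bgs_lock);
 */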
3269
3270 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3271 {
3272         struct btrfs_root *chunk_root = fs_info->chunk_root;
3273         struct btrfs_path *path;
3274         struct extent_buffer *leaf;
3275         struct btrfs_chunk *chunk;
3276         struct btrfs_key key;
3277         struct btrfs_key found_key;
3278         u64 chunk_type;
3279         bool retried = false;
3280         int failed = 0;
3281         int ret;
3282
3283         path = btrfs_alloc_path();
3284         if (!path)
3285                 return -ENOMEM;
3286
3287 again:
3288         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3289         key.offset = (u64)-1;
3290         key.type = BTRFS_CHUNK_ITEM_KEY;
3291
3292         while (1) {
3293                 mutex_lock(&fs_info->reclaim_bgs_lock);
3294                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3295                 if (ret < 0) {
3296                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3297                         goto error;
3298                 }
3299                 BUG_ON(ret == 0); /* Corruption */
3300
3301                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3302                                           key.type);
3303                 if (ret)
3304                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3305                 if (ret < 0)
3306                         goto error;
3307                 if (ret > 0)
3308                         break;
3309
3310                 leaf = path->nodes[0];
3311                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3312
3313                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3314                                        struct btrfs_chunk);
3315                 chunk_type = btrfs_chunk_type(leaf, chunk);
3316                 btrfs_release_path(path);
3317
3318                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3319                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3320                         if (ret == -ENOSPC)
3321                                 failed++;
3322                         else
3323                                 BUG_ON(ret);
3324                 }
3325                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3326
3327                 if (found_key.offset == 0)
3328                         break;
3329                 key.offset = found_key.offset - 1;
3330         }
3331         ret = 0;
3332         if (failed && !retried) {
3333                 failed = 0;
3334                 retried = true;
3335                 goto again;
3336         } else if (WARN_ON(failed && retried)) {
3337                 ret = -ENOSPC;
3338         }
3339 error:
3340         btrfs_free_path(path);
3341         return ret;
3342 }
3343
3344 /*
3345  * return 1 : allocated a data chunk successfully,
3346  * return <0: error while allocating a data chunk,
3347  * return 0 : no need to allocate a data chunk.
3348  */
3349 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3350                                       u64 chunk_offset)
3351 {
3352         struct btrfs_block_group *cache;
3353         u64 bytes_used;
3354         u64 chunk_type;
3355
3356         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3357         ASSERT(cache);
3358         chunk_type = cache->flags;
3359         btrfs_put_block_group(cache);
3360
3361         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3362                 return 0;
3363
3364         spin_lock(&fs_info->data_sinfo->lock);
3365         bytes_used = fs_info->data_sinfo->bytes_used;
3366         spin_unlock(&fs_info->data_sinfo->lock);
3367
3368         if (!bytes_used) {
3369                 struct btrfs_trans_handle *trans;
3370                 int ret;
3371
3372                 trans = btrfs_join_transaction(fs_info->tree_root);
3373                 if (IS_ERR(trans))
3374                         return PTR_ERR(trans);
3375
3376                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3377                 btrfs_end_transaction(trans);
3378                 if (ret < 0)
3379                         return ret;
3380                 return 1;
3381         }
3382
3383         return 0;
3384 }
3385
3386 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3387                                struct btrfs_balance_control *bctl)
3388 {
3389         struct btrfs_root *root = fs_info->tree_root;
3390         struct btrfs_trans_handle *trans;
3391         struct btrfs_balance_item *item;
3392         struct btrfs_disk_balance_args disk_bargs;
3393         struct btrfs_path *path;
3394         struct extent_buffer *leaf;
3395         struct btrfs_key key;
3396         int ret, err;
3397
3398         path = btrfs_alloc_path();
3399         if (!path)
3400                 return -ENOMEM;
3401
3402         trans = btrfs_start_transaction(root, 0);
3403         if (IS_ERR(trans)) {
3404                 btrfs_free_path(path);
3405                 return PTR_ERR(trans);
3406         }
3407
3408         key.objectid = BTRFS_BALANCE_OBJECTID;
3409         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3410         key.offset = 0;
3411
3412         ret = btrfs_insert_empty_item(trans, root, path, &key,
3413                                       sizeof(*item));
3414         if (ret)
3415                 goto out;
3416
3417         leaf = path->nodes[0];
3418         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3419
3420         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3421
3422         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3423         btrfs_set_balance_data(leaf, item, &disk_bargs);
3424         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3425         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3426         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3427         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3428
3429         btrfs_set_balance_flags(leaf, item, bctl->flags);
3430
3431         btrfs_mark_buffer_dirty(leaf);
3432 out:
3433         btrfs_free_path(path);
3434         err = btrfs_commit_transaction(trans);
3435         if (err && !ret)
3436                 ret = err;
3437         return ret;
3438 }
3439
3440 static int del_balance_item(struct btrfs_fs_info *fs_info)
3441 {
3442         struct btrfs_root *root = fs_info->tree_root;
3443         struct btrfs_trans_handle *trans;
3444         struct btrfs_path *path;
3445         struct btrfs_key key;
3446         int ret, err;
3447
3448         path = btrfs_alloc_path();
3449         if (!path)
3450                 return -ENOMEM;
3451
3452         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3453         if (IS_ERR(trans)) {
3454                 btrfs_free_path(path);
3455                 return PTR_ERR(trans);
3456         }
3457
3458         key.objectid = BTRFS_BALANCE_OBJECTID;
3459         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3460         key.offset = 0;
3461
3462         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3463         if (ret < 0)
3464                 goto out;
3465         if (ret > 0) {
3466                 ret = -ENOENT;
3467                 goto out;
3468         }
3469
3470         ret = btrfs_del_item(trans, root, path);
3471 out:
3472         btrfs_free_path(path);
3473         err = btrfs_commit_transaction(trans);
3474         if (err && !ret)
3475                 ret = err;
3476         return ret;
3477 }
3478
3479 /*
3480  * This is a heuristic used to reduce the number of chunks balanced on
3481  * resume after balance was interrupted.
3482  */
3483 static void update_balance_args(struct btrfs_balance_control *bctl)
3484 {
3485         /*
3486          * Turn on soft mode for chunk types that were being converted.
3487          */
3488         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3489                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3490         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3491                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3492         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3493                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3494
3495         /*
3496          * Turn on the usage filter if it is not already used.  The idea is
3497          * that chunks that we have already balanced should be
3498          * reasonably full.  Don't do it for chunks that are being
3499          * converted - that will keep us from relocating unconverted
3500          * (albeit full) chunks.
3501          */
3502         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3503             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3504             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3505                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3506                 bctl->data.usage = 90;
3507         }
3508         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3509             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3510             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3511                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3512                 bctl->sys.usage = 90;
3513         }
3514         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3515             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3516             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3517                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3518                 bctl->meta.usage = 90;
3519         }
3520 }
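
/*
 * Worked example (illustrative): a balance started with "-dconvert=raid1"
 * resumes as if started with "-dconvert=raid1,soft", skipping chunks that
 * are already raid1; a plain "-d" balance resumes as "-dusage=90",
 * skipping chunks that relocation already packed more than 90% full.
 */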
3521
3522 /*
3523  * Clear the balance status in fs_info and delete the balance item from disk.
3524  */
3525 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3526 {
3527         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3528         int ret;
3529
3530         BUG_ON(!fs_info->balance_ctl);
3531
3532         spin_lock(&fs_info->balance_lock);
3533         fs_info->balance_ctl = NULL;
3534         spin_unlock(&fs_info->balance_lock);
3535
3536         kfree(bctl);
3537         ret = del_balance_item(fs_info);
3538         if (ret)
3539                 btrfs_handle_fs_error(fs_info, ret, NULL);
3540 }
3541
3542 /*
3543  * Balance filters.  Return 1 if chunk should be filtered out
3544  * (should not be balanced).
3545  */
3546 static int chunk_profiles_filter(u64 chunk_type,
3547                                  struct btrfs_balance_args *bargs)
3548 {
3549         chunk_type = chunk_to_extended(chunk_type) &
3550                                 BTRFS_EXTENDED_PROFILE_MASK;
3551
3552         if (bargs->profiles & chunk_type)
3553                 return 0;
3554
3555         return 1;
3556 }
3557
3558 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3559                               struct btrfs_balance_args *bargs)
3560 {
3561         struct btrfs_block_group *cache;
3562         u64 chunk_used;
3563         u64 user_thresh_min;
3564         u64 user_thresh_max;
3565         int ret = 1;
3566
3567         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3568         chunk_used = cache->used;
3569
3570         if (bargs->usage_min == 0)
3571                 user_thresh_min = 0;
3572         else
3573                 user_thresh_min = div_factor_fine(cache->length,
3574                                                   bargs->usage_min);
3575
3576         if (bargs->usage_max == 0)
3577                 user_thresh_max = 1;
3578         else if (bargs->usage_max > 100)
3579                 user_thresh_max = cache->length;
3580         else
3581                 user_thresh_max = div_factor_fine(cache->length,
3582                                                   bargs->usage_max);
3583
3584         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3585                 ret = 0;
3586
3587         btrfs_put_block_group(cache);
3588         return ret;
3589 }
3590
3591 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3592                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3593 {
3594         struct btrfs_block_group *cache;
3595         u64 chunk_used, user_thresh;
3596         int ret = 1;
3597
3598         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3599         chunk_used = cache->used;
3600
3601         if (bargs->usage_min == 0)
3602                 user_thresh = 1;
3603         else if (bargs->usage > 100)
3604                 user_thresh = cache->length;
3605         else
3606                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3607
3608         if (chunk_used < user_thresh)
3609                 ret = 0;
3610
3611         btrfs_put_block_group(cache);
3612         return ret;
3613 }
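
/*
 * Numeric sketch (illustrative): with bargs->usage = 50 on a 1GiB chunk,
 * user_thresh = div_factor_fine(1GiB, 50) = 512MiB, so the chunk gets
 * balanced (return 0) only while less than 512MiB of it is used.
 */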
3614
3615 static int chunk_devid_filter(struct extent_buffer *leaf,
3616                               struct btrfs_chunk *chunk,
3617                               struct btrfs_balance_args *bargs)
3618 {
3619         struct btrfs_stripe *stripe;
3620         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3621         int i;
3622
3623         for (i = 0; i < num_stripes; i++) {
3624                 stripe = btrfs_stripe_nr(chunk, i);
3625                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3626                         return 0;
3627         }
3628
3629         return 1;
3630 }
3631
3632 static u64 calc_data_stripes(u64 type, int num_stripes)
3633 {
3634         const int index = btrfs_bg_flags_to_raid_index(type);
3635         const int ncopies = btrfs_raid_array[index].ncopies;
3636         const int nparity = btrfs_raid_array[index].nparity;
3637
3638         return (num_stripes - nparity) / ncopies;
3639 }
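
/*
 * Worked examples using the btrfs_raid_array values in this file: RAID10
 * (ncopies=2, nparity=0) with 4 stripes yields (4 - 0) / 2 = 2 data
 * stripes; RAID6 (ncopies=1, nparity=2) with 6 stripes yields
 * (6 - 2) / 1 = 4 data stripes.
 */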
3640
3641 /* [pstart, pend) */
3642 static int chunk_drange_filter(struct extent_buffer *leaf,
3643                                struct btrfs_chunk *chunk,
3644                                struct btrfs_balance_args *bargs)
3645 {
3646         struct btrfs_stripe *stripe;
3647         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3648         u64 stripe_offset;
3649         u64 stripe_length;
3650         u64 type;
3651         int factor;
3652         int i;
3653
3654         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3655                 return 0;
3656
3657         type = btrfs_chunk_type(leaf, chunk);
3658         factor = calc_data_stripes(type, num_stripes);
3659
3660         for (i = 0; i < num_stripes; i++) {
3661                 stripe = btrfs_stripe_nr(chunk, i);
3662                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3663                         continue;
3664
3665                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3666                 stripe_length = btrfs_chunk_length(leaf, chunk);
3667                 stripe_length = div_u64(stripe_length, factor);
3668
3669                 if (stripe_offset < bargs->pend &&
3670                     stripe_offset + stripe_length > bargs->pstart)
3671                         return 0;
3672         }
3673
3674         return 1;
3675 }
3676
3677 /* [vstart, vend) */
3678 static int chunk_vrange_filter(struct extent_buffer *leaf,
3679                                struct btrfs_chunk *chunk,
3680                                u64 chunk_offset,
3681                                struct btrfs_balance_args *bargs)
3682 {
3683         if (chunk_offset < bargs->vend &&
3684             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3685                 /* at least part of the chunk is inside this vrange */
3686                 return 0;
3687
3688         return 1;
3689 }
3690
3691 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3692                                struct btrfs_chunk *chunk,
3693                                struct btrfs_balance_args *bargs)
3694 {
3695         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3696
3697         if (bargs->stripes_min <= num_stripes
3698                         && num_stripes <= bargs->stripes_max)
3699                 return 0;
3700
3701         return 1;
3702 }
3703
3704 static int chunk_soft_convert_filter(u64 chunk_type,
3705                                      struct btrfs_balance_args *bargs)
3706 {
3707         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3708                 return 0;
3709
3710         chunk_type = chunk_to_extended(chunk_type) &
3711                                 BTRFS_EXTENDED_PROFILE_MASK;
3712
3713         if (bargs->target == chunk_type)
3714                 return 1;
3715
3716         return 0;
3717 }
3718
3719 static int should_balance_chunk(struct extent_buffer *leaf,
3720                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3721 {
3722         struct btrfs_fs_info *fs_info = leaf->fs_info;
3723         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3724         struct btrfs_balance_args *bargs = NULL;
3725         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3726
3727         /* type filter */
3728         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3729               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3730                 return 0;
3731         }
3732
3733         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3734                 bargs = &bctl->data;
3735         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3736                 bargs = &bctl->sys;
3737         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3738                 bargs = &bctl->meta;
3739
3740         /* profiles filter */
3741         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3742             chunk_profiles_filter(chunk_type, bargs)) {
3743                 return 0;
3744         }
3745
3746         /* usage filter */
3747         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3748             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3749                 return 0;
3750         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3751             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3752                 return 0;
3753         }
3754
3755         /* devid filter */
3756         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3757             chunk_devid_filter(leaf, chunk, bargs)) {
3758                 return 0;
3759         }
3760
3761         /* drange filter, makes sense only with devid filter */
3762         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3763             chunk_drange_filter(leaf, chunk, bargs)) {
3764                 return 0;
3765         }
3766
3767         /* vrange filter */
3768         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3769             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3770                 return 0;
3771         }
3772
3773         /* stripes filter */
3774         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3775             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3776                 return 0;
3777         }
3778
3779         /* soft profile changing mode */
3780         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3781             chunk_soft_convert_filter(chunk_type, bargs)) {
3782                 return 0;
3783         }
3784
3785         /*
3786          * Limited by count; this must be the last filter.
3787          */
3788         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3789                 if (bargs->limit == 0)
3790                         return 0;
3791                 else
3792                         bargs->limit--;
3793         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3794                 /*
3795                  * Same logic as the 'limit' filter; the minimum cannot be
3796                  * determined here because we do not have the global information
3797                  * about the count of all chunks that satisfy the filters.
3798                  */
3799                 if (bargs->limit_max == 0)
3800                         return 0;
3801                 else
3802                         bargs->limit_max--;
3803         }
3804
3805         return 1;
3806 }
3807
3808 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3809 {
3810         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3811         struct btrfs_root *chunk_root = fs_info->chunk_root;
3812         u64 chunk_type;
3813         struct btrfs_chunk *chunk;
3814         struct btrfs_path *path = NULL;
3815         struct btrfs_key key;
3816         struct btrfs_key found_key;
3817         struct extent_buffer *leaf;
3818         int slot;
3819         int ret;
3820         int enospc_errors = 0;
3821         bool counting = true;
3822         /* The single value limit and min/max limits share the same bytes in the balance args (a union), so save them here. */
3823         u64 limit_data = bctl->data.limit;
3824         u64 limit_meta = bctl->meta.limit;
3825         u64 limit_sys = bctl->sys.limit;
3826         u32 count_data = 0;
3827         u32 count_meta = 0;
3828         u32 count_sys = 0;
3829         int chunk_reserved = 0;
3830
3831         path = btrfs_alloc_path();
3832         if (!path) {
3833                 ret = -ENOMEM;
3834                 goto error;
3835         }
3836
3837         /* zero out stat counters */
3838         spin_lock(&fs_info->balance_lock);
3839         memset(&bctl->stat, 0, sizeof(bctl->stat));
3840         spin_unlock(&fs_info->balance_lock);
3841 again:
3842         if (!counting) {
3843                 /*
3844                  * The single value limit and min/max limits use the same bytes
3845                  * in the balance args, so restore the saved limits here.
3846                  */
3847                 bctl->data.limit = limit_data;
3848                 bctl->meta.limit = limit_meta;
3849                 bctl->sys.limit = limit_sys;
3850         }
3851         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3852         key.offset = (u64)-1;
3853         key.type = BTRFS_CHUNK_ITEM_KEY;
3854
3855         while (1) {
3856                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3857                     atomic_read(&fs_info->balance_cancel_req)) {
3858                         ret = -ECANCELED;
3859                         goto error;
3860                 }
3861
3862                 mutex_lock(&fs_info->reclaim_bgs_lock);
3863                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3864                 if (ret < 0) {
3865                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3866                         goto error;
3867                 }
3868
3869                 /*
3870                  * This shouldn't happen; it means the last relocate
3871                  * failed
3872                  */
3873                 if (ret == 0)
3874                         BUG(); /* FIXME break ? */
3875
3876                 ret = btrfs_previous_item(chunk_root, path, 0,
3877                                           BTRFS_CHUNK_ITEM_KEY);
3878                 if (ret) {
3879                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3880                         ret = 0;
3881                         break;
3882                 }
3883
3884                 leaf = path->nodes[0];
3885                 slot = path->slots[0];
3886                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3887
3888                 if (found_key.objectid != key.objectid) {
3889                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3890                         break;
3891                 }
3892
3893                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3894                 chunk_type = btrfs_chunk_type(leaf, chunk);
3895
3896                 if (!counting) {
3897                         spin_lock(&fs_info->balance_lock);
3898                         bctl->stat.considered++;
3899                         spin_unlock(&fs_info->balance_lock);
3900                 }
3901
3902                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3903
3904                 btrfs_release_path(path);
3905                 if (!ret) {
3906                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3907                         goto loop;
3908                 }
3909
3910                 if (counting) {
3911                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3912                         spin_lock(&fs_info->balance_lock);
3913                         bctl->stat.expected++;
3914                         spin_unlock(&fs_info->balance_lock);
3915
3916                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3917                                 count_data++;
3918                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3919                                 count_sys++;
3920                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3921                                 count_meta++;
3922
3923                         goto loop;
3924                 }
3925
3926                 /*
3927                  * Apply the limit_min filter. No need to check whether the
3928                  * LIMITS filter is used, since limit_min is 0 by default.
3929                  */
3930                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3931                                         count_data < bctl->data.limit_min)
3932                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3933                                         count_meta < bctl->meta.limit_min)
3934                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3935                                         count_sys < bctl->sys.limit_min)) {
3936                         mutex_unlock(&fs_info->reclaim_bgs_lock);
3937                         goto loop;
3938                 }
3939
3940                 if (!chunk_reserved) {
3941                         /*
3942                          * We may be relocating the only data chunk we have,
3943                          * which could potentially end up losing the data
3944                          * raid profile, so let's allocate an empty one in
3945                          * advance.
3946                          */
3947                         ret = btrfs_may_alloc_data_chunk(fs_info,
3948                                                          found_key.offset);
3949                         if (ret < 0) {
3950                                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3951                                 goto error;
3952                         } else if (ret == 1) {
3953                                 chunk_reserved = 1;
3954                         }
3955                 }
3956
3957                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3958                 mutex_unlock(&fs_info->reclaim_bgs_lock);
3959                 if (ret == -ENOSPC) {
3960                         enospc_errors++;
3961                 } else if (ret == -ETXTBSY) {
3962                         btrfs_info(fs_info,
3963            "skipping relocation of block group %llu due to active swapfile",
3964                                    found_key.offset);
3965                         ret = 0;
3966                 } else if (ret) {
3967                         goto error;
3968                 } else {
3969                         spin_lock(&fs_info->balance_lock);
3970                         bctl->stat.completed++;
3971                         spin_unlock(&fs_info->balance_lock);
3972                 }
3973 loop:
3974                 if (found_key.offset == 0)
3975                         break;
3976                 key.offset = found_key.offset - 1;
3977         }
3978
3979         if (counting) {
3980                 btrfs_release_path(path);
3981                 counting = false;
3982                 goto again;
3983         }
3984 error:
3985         btrfs_free_path(path);
3986         if (enospc_errors) {
3987                 btrfs_info(fs_info, "%d enospc errors during balance",
3988                            enospc_errors);
3989                 if (!ret)
3990                         ret = -ENOSPC;
3991         }
3992
3993         return ret;
3994 }
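
/*
 * Design note: __btrfs_balance() walks the chunk tree backwards twice.
 * The first (counting) pass only tallies chunks that pass the filters so
 * that the limit_min thresholds can be applied; the second pass does the
 * actual relocations, re-searching the tree on every iteration because
 * relocation itself modifies the chunk tree.
 */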
3995
3996 /**
3997  * alloc_profile_is_valid - see if a given profile is valid and reduced
3998  * @flags: profile to validate
3999  * @extended: if true @flags is treated as an extended profile
4000  */
4001 static int alloc_profile_is_valid(u64 flags, int extended)
4002 {
4003         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
4004                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
4005
4006         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
4007
4008         /* 1) check that all other bits are zeroed */
4009         if (flags & ~mask)
4010                 return 0;
4011
4012         /* 2) see if profile is reduced */
4013         if (flags == 0)
4014                 return !extended; /* "0" is valid for usual profiles */
4015
4016         return has_single_bit_set(flags);
4017 }
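
/*
 * Examples (illustrative): BTRFS_BLOCK_GROUP_RAID1 alone is valid;
 * RAID1 | RAID10 fails the has_single_bit_set() check (not reduced);
 * 0 is valid only for non-extended profiles, where it denotes the
 * implicit SINGLE profile.
 */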
4018
4019 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
4020 {
4021         /* cancel requested || normal exit path */
4022         return atomic_read(&fs_info->balance_cancel_req) ||
4023                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
4024                  atomic_read(&fs_info->balance_cancel_req) == 0);
4025 }
4026
4027 /*
4028  * Validate target profile against allowed profiles and return true if it's OK.
4029  * Otherwise print the error message and return false.
4030  */
4031 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
4032                 const struct btrfs_balance_args *bargs,
4033                 u64 allowed, const char *type)
4034 {
4035         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
4036                 return true;
4037
4038         if (fs_info->sectorsize < PAGE_SIZE &&
4039                 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
4040                 btrfs_err(fs_info,
4041                 "RAID56 is not yet supported for sectorsize %u with page size %lu",
4042                           fs_info->sectorsize, PAGE_SIZE);
4043                 return false;
4044         }
4045         /* Profile is valid and does not have bits outside of the allowed set */
4046         if (alloc_profile_is_valid(bargs->target, 1) &&
4047             (bargs->target & ~allowed) == 0)
4048                 return true;
4049
4050         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4051                         type, btrfs_bg_type_to_raid_name(bargs->target));
4052         return false;
4053 }
4054
4055 /*
4056  * Fill @buf with textual description of balance filter flags @bargs, up to
4057  * @size_buf including the terminating null. The output may be trimmed if it
4058  * does not fit into the provided buffer.
4059  */
4060 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
4061                                  u32 size_buf)
4062 {
4063         int ret;
4064         u32 size_bp = size_buf;
4065         char *bp = buf;
4066         u64 flags = bargs->flags;
4067         char tmp_buf[128] = {'\0'};
4068
4069         if (!flags)
4070                 return;
4071
4072 #define CHECK_APPEND_NOARG(a)                                           \
4073         do {                                                            \
4074                 ret = snprintf(bp, size_bp, (a));                       \
4075                 if (ret < 0 || ret >= size_bp)                          \
4076                         goto out_overflow;                              \
4077                 size_bp -= ret;                                         \
4078                 bp += ret;                                              \
4079         } while (0)
4080
4081 #define CHECK_APPEND_1ARG(a, v1)                                        \
4082         do {                                                            \
4083                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4084                 if (ret < 0 || ret >= size_bp)                          \
4085                         goto out_overflow;                              \
4086                 size_bp -= ret;                                         \
4087                 bp += ret;                                              \
4088         } while (0)
4089
4090 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
4091         do {                                                            \
4092                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
4093                 if (ret < 0 || ret >= size_bp)                          \
4094                         goto out_overflow;                              \
4095                 size_bp -= ret;                                         \
4096                 bp += ret;                                              \
4097         } while (0)
4098
4099         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4100                 CHECK_APPEND_1ARG("convert=%s,",
4101                                   btrfs_bg_type_to_raid_name(bargs->target));
4102
4103         if (flags & BTRFS_BALANCE_ARGS_SOFT)
4104                 CHECK_APPEND_NOARG("soft,");
4105
4106         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4107                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
4108                                             sizeof(tmp_buf));
4109                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4110         }
4111
4112         if (flags & BTRFS_BALANCE_ARGS_USAGE)
4113                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4114
4115         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4116                 CHECK_APPEND_2ARG("usage=%u..%u,",
4117                                   bargs->usage_min, bargs->usage_max);
4118
4119         if (flags & BTRFS_BALANCE_ARGS_DEVID)
4120                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4121
4122         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4123                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4124                                   bargs->pstart, bargs->pend);
4125
4126         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4127                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4128                                   bargs->vstart, bargs->vend);
4129
4130         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4131                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4132
4133         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4134                 CHECK_APPEND_2ARG("limit=%u..%u,",
4135                                 bargs->limit_min, bargs->limit_max);
4136
4137         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4138                 CHECK_APPEND_2ARG("stripes=%u..%u,",
4139                                   bargs->stripes_min, bargs->stripes_max);
4140
4141 #undef CHECK_APPEND_2ARG
4142 #undef CHECK_APPEND_1ARG
4143 #undef CHECK_APPEND_NOARG
4144
4145 out_overflow:
4146
4147         if (size_bp < size_buf)
4148                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4149         else
4150                 buf[0] = '\0';
4151 }
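
/*
 * Sample output (hypothetical args): for a filter with CONVERT to raid1,
 * SOFT and USAGE=90 set, the buffer would read
 *
 *	convert=raid1,soft,usage=90
 *
 * with the trailing comma removed by the out_overflow epilogue.
 */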
4152
4153 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4154 {
4155         u32 size_buf = 1024;
4156         char tmp_buf[192] = {'\0'};
4157         char *buf;
4158         char *bp;
4159         u32 size_bp = size_buf;
4160         int ret;
4161         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4162
4163         buf = kzalloc(size_buf, GFP_KERNEL);
4164         if (!buf)
4165                 return;
4166
4167         bp = buf;
4168
4169 #define CHECK_APPEND_1ARG(a, v1)                                        \
4170         do {                                                            \
4171                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4172                 if (ret < 0 || ret >= size_bp)                          \
4173                         goto out_overflow;                              \
4174                 size_bp -= ret;                                         \
4175                 bp += ret;                                              \
4176         } while (0)
4177
4178         if (bctl->flags & BTRFS_BALANCE_FORCE)
4179                 CHECK_APPEND_1ARG("%s", "-f ");
4180
4181         if (bctl->flags & BTRFS_BALANCE_DATA) {
4182                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4183                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4184         }
4185
4186         if (bctl->flags & BTRFS_BALANCE_METADATA) {
4187                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4188                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4189         }
4190
4191         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4192                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4193                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4194         }
4195
4196 #undef CHECK_APPEND_1ARG
4197
4198 out_overflow:
4199
4200         if (size_bp < size_buf)
4201                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4202         btrfs_info(fs_info, "balance: %s %s",
4203                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
4204                    "resume" : "start", buf);
4205
4206         kfree(buf);
4207 }
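
/*
 * A hypothetical example of the message assembled above (the exact filter
 * string depends on the balance args):
 *
 *   balance: start -dusage=50 -musage=50 -susage=50
 */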
4208
4209 /*
4210  * Should be called with the balance mutex held
4211  */
4212 int btrfs_balance(struct btrfs_fs_info *fs_info,
4213                   struct btrfs_balance_control *bctl,
4214                   struct btrfs_ioctl_balance_args *bargs)
4215 {
4216         u64 meta_target, data_target;
4217         u64 allowed;
4218         int mixed = 0;
4219         int ret;
4220         u64 num_devices;
4221         unsigned seq;
4222         bool reducing_redundancy;
4223         int i;
4224
4225         if (btrfs_fs_closing(fs_info) ||
4226             atomic_read(&fs_info->balance_pause_req) ||
4227             btrfs_should_cancel_balance(fs_info)) {
4228                 ret = -EINVAL;
4229                 goto out;
4230         }
4231
4232         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4233         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4234                 mixed = 1;
4235
4236         /*
4237          * In case of mixed groups, both data and metadata must be picked,
4238          * and identical options must be given for both of them.
4239          */
4240         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4241         if (mixed && (bctl->flags & allowed)) {
4242                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4243                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4244                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4245                         btrfs_err(fs_info,
4246           "balance: mixed groups data and metadata options must be the same");
4247                         ret = -EINVAL;
4248                         goto out;
4249                 }
4250         }
4251
4252         /*
4253          * rw_devices will not change at the moment: device add/delete/replace
4254          * are exclusive operations.
4255          */
4256         num_devices = fs_info->fs_devices->rw_devices;
4257
4258         /*
4259          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4260          * special bit for it, to make it easier to distinguish.  Thus we need
4261          * to set it manually, or balance would refuse the profile.
4262          */
4263         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4264         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4265                 if (num_devices >= btrfs_raid_array[i].devs_min)
4266                         allowed |= btrfs_raid_array[i].bg_flag;
4267
4268         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4269             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4270             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4271                 ret = -EINVAL;
4272                 goto out;
4273         }
4274
4275         /*
4276          * Allow to reduce metadata or system integrity only if force set for
4277          * profiles with redundancy (copies, parity)
4278          */
4279         allowed = 0;
4280         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4281                 if (btrfs_raid_array[i].ncopies >= 2 ||
4282                     btrfs_raid_array[i].tolerated_failures >= 1)
4283                         allowed |= btrfs_raid_array[i].bg_flag;
4284         }
4285         do {
4286                 seq = read_seqbegin(&fs_info->profiles_lock);
4287
4288                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4289                      (fs_info->avail_system_alloc_bits & allowed) &&
4290                      !(bctl->sys.target & allowed)) ||
4291                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4292                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4293                      !(bctl->meta.target & allowed)))
4294                         reducing_redundancy = true;
4295                 else
4296                         reducing_redundancy = false;
4297
4298                 /* if we're not converting, the target field is uninitialized */
4299                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4300                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4301                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4302                         bctl->data.target : fs_info->avail_data_alloc_bits;
4303         } while (read_seqretry(&fs_info->profiles_lock, seq));
4304
4305         if (reducing_redundancy) {
4306                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4307                         btrfs_info(fs_info,
4308                            "balance: force reducing metadata redundancy");
4309                 } else {
4310                         btrfs_err(fs_info,
4311         "balance: reduces metadata redundancy, use --force if you want this");
4312                         ret = -EINVAL;
4313                         goto out;
4314                 }
4315         }
4316
4317         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4318                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4319                 btrfs_warn(fs_info,
4320         "balance: metadata profile %s has lower redundancy than data profile %s",
4321                                 btrfs_bg_type_to_raid_name(meta_target),
4322                                 btrfs_bg_type_to_raid_name(data_target));
4323         }
4324
4325         ret = insert_balance_item(fs_info, bctl);
4326         if (ret && ret != -EEXIST)
4327                 goto out;
4328
4329         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4330                 BUG_ON(ret == -EEXIST);
4331                 BUG_ON(fs_info->balance_ctl);
4332                 spin_lock(&fs_info->balance_lock);
4333                 fs_info->balance_ctl = bctl;
4334                 spin_unlock(&fs_info->balance_lock);
4335         } else {
4336                 BUG_ON(ret != -EEXIST);
4337                 spin_lock(&fs_info->balance_lock);
4338                 update_balance_args(bctl);
4339                 spin_unlock(&fs_info->balance_lock);
4340         }
4341
4342         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4343         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4344         describe_balance_start_or_resume(fs_info);
4345         mutex_unlock(&fs_info->balance_mutex);
4346
4347         ret = __btrfs_balance(fs_info);
4348
4349         mutex_lock(&fs_info->balance_mutex);
4350         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4351                 btrfs_info(fs_info, "balance: paused");
4352         /*
4353          * Balance can be canceled by:
4354          *
4355          * - Regular cancel request
4356          *   Then ret == -ECANCELED and balance_cancel_req > 0
4357          *
4358          * - Fatal signal to the "btrfs" process
4359          *   Either the signal is caught by wait_reserve_ticket() and the
4360          *   callers get -EINTR, or it is caught by
4361          *   btrfs_should_cancel_balance() and they get -ECANCELED.
4362          *   Either way, in this case balance_cancel_req = 0, and
4363          *   ret == -EINTR or ret == -ECANCELED.
4364          *
4365          * So here we only check the return value to catch canceled balance.
4366          */
4367         else if (ret == -ECANCELED || ret == -EINTR)
4368                 btrfs_info(fs_info, "balance: canceled");
4369         else
4370                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4371
4372         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4373
4374         if (bargs) {
4375                 memset(bargs, 0, sizeof(*bargs));
4376                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4377         }
4378
4379         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4380             balance_need_close(fs_info)) {
4381                 reset_balance_state(fs_info);
4382                 btrfs_exclop_finish(fs_info);
4383         }
4384
4385         wake_up(&fs_info->balance_wait_q);
4386
4387         return ret;
4388 out:
4389         if (bctl->flags & BTRFS_BALANCE_RESUME)
4390                 reset_balance_state(fs_info);
4391         else
4392                 kfree(bctl);
4393         btrfs_exclop_finish(fs_info);
4394
4395         return ret;
4396 }
4397
4398 static int balance_kthread(void *data)
4399 {
4400         struct btrfs_fs_info *fs_info = data;
4401         int ret = 0;
4402
4403         sb_start_write(fs_info->sb);
4404         mutex_lock(&fs_info->balance_mutex);
4405         if (fs_info->balance_ctl)
4406                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4407         mutex_unlock(&fs_info->balance_mutex);
4408         sb_end_write(fs_info->sb);
4409
4410         return ret;
4411 }
4412
4413 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4414 {
4415         struct task_struct *tsk;
4416
4417         mutex_lock(&fs_info->balance_mutex);
4418         if (!fs_info->balance_ctl) {
4419                 mutex_unlock(&fs_info->balance_mutex);
4420                 return 0;
4421         }
4422         mutex_unlock(&fs_info->balance_mutex);
4423
4424         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4425                 btrfs_info(fs_info, "balance: resume skipped");
4426                 return 0;
4427         }
4428
4429         /*
4430          * A ro->rw remount sequence should continue with the paused balance
4431          * regardless of who paused it (the system or the user, as of now),
4432          * so set the resume flag.
4433          */
4434         spin_lock(&fs_info->balance_lock);
4435         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4436         spin_unlock(&fs_info->balance_lock);
4437
4438         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4439         return PTR_ERR_OR_ZERO(tsk);
4440 }
4441
4442 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4443 {
4444         struct btrfs_balance_control *bctl;
4445         struct btrfs_balance_item *item;
4446         struct btrfs_disk_balance_args disk_bargs;
4447         struct btrfs_path *path;
4448         struct extent_buffer *leaf;
4449         struct btrfs_key key;
4450         int ret;
4451
4452         path = btrfs_alloc_path();
4453         if (!path)
4454                 return -ENOMEM;
4455
4456         key.objectid = BTRFS_BALANCE_OBJECTID;
4457         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4458         key.offset = 0;
4459
4460         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4461         if (ret < 0)
4462                 goto out;
4463         if (ret > 0) { /* ret = -ENOENT; */
4464                 ret = 0;
4465                 goto out;
4466         }
4467
4468         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4469         if (!bctl) {
4470                 ret = -ENOMEM;
4471                 goto out;
4472         }
4473
4474         leaf = path->nodes[0];
4475         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4476
4477         bctl->flags = btrfs_balance_flags(leaf, item);
4478         bctl->flags |= BTRFS_BALANCE_RESUME;
4479
4480         btrfs_balance_data(leaf, item, &disk_bargs);
4481         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4482         btrfs_balance_meta(leaf, item, &disk_bargs);
4483         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4484         btrfs_balance_sys(leaf, item, &disk_bargs);
4485         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4486
4487         /*
4488          * This should never happen, as the paused balance state is recovered
4489          * during mount without any chance of other exclusive ops colliding.
4490          *
4491          * This gives the exclusive op status to balance and keeps it in the
4492          * paused state until user intervention (cancel or umount). If the
4493          * ownership cannot be assigned, show a message but do not fail. The
4494          * balance is in a paused state and must have fs_info::balance_ctl
4495          * properly set up.
4496          */
4497         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4498                 btrfs_warn(fs_info,
4499         "balance: cannot set exclusive op status, resume manually");
4500
4501         btrfs_release_path(path);
4502
4503         mutex_lock(&fs_info->balance_mutex);
4504         BUG_ON(fs_info->balance_ctl);
4505         spin_lock(&fs_info->balance_lock);
4506         fs_info->balance_ctl = bctl;
4507         spin_unlock(&fs_info->balance_lock);
4508         mutex_unlock(&fs_info->balance_mutex);
4509 out:
4510         btrfs_free_path(path);
4511         return ret;
4512 }
4513
4514 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4515 {
4516         int ret = 0;
4517
4518         mutex_lock(&fs_info->balance_mutex);
4519         if (!fs_info->balance_ctl) {
4520                 mutex_unlock(&fs_info->balance_mutex);
4521                 return -ENOTCONN;
4522         }
4523
4524         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4525                 atomic_inc(&fs_info->balance_pause_req);
4526                 mutex_unlock(&fs_info->balance_mutex);
4527
4528                 wait_event(fs_info->balance_wait_q,
4529                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4530
4531                 mutex_lock(&fs_info->balance_mutex);
4532                 /* we are good with balance_ctl ripped off from under us */
4533                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4534                 atomic_dec(&fs_info->balance_pause_req);
4535         } else {
4536                 ret = -ENOTCONN;
4537         }
4538
4539         mutex_unlock(&fs_info->balance_mutex);
4540         return ret;
4541 }
4542
4543 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4544 {
4545         mutex_lock(&fs_info->balance_mutex);
4546         if (!fs_info->balance_ctl) {
4547                 mutex_unlock(&fs_info->balance_mutex);
4548                 return -ENOTCONN;
4549         }
4550
4551         /*
4552          * A paused balance with the item stored on disk can be resumed at
4553          * mount time if the mount is read-write. Otherwise it's still paused
4554          * and we must not allow cancelling as it deletes the item.
4555          */
4556         if (sb_rdonly(fs_info->sb)) {
4557                 mutex_unlock(&fs_info->balance_mutex);
4558                 return -EROFS;
4559         }
4560
4561         atomic_inc(&fs_info->balance_cancel_req);
4562         /*
4563          * If balance is running, just wait for it to finish and return;
4564          * the balance item is deleted in btrfs_balance() in this case.
4565          */
4566         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4567                 mutex_unlock(&fs_info->balance_mutex);
4568                 wait_event(fs_info->balance_wait_q,
4569                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4570                 mutex_lock(&fs_info->balance_mutex);
4571         } else {
4572                 mutex_unlock(&fs_info->balance_mutex);
4573                 /*
4574                  * Lock released to allow other waiters to continue, we'll
4575                  * reexamine the status again.
4576                  */
4577                 mutex_lock(&fs_info->balance_mutex);
4578
4579                 if (fs_info->balance_ctl) {
4580                         reset_balance_state(fs_info);
4581                         btrfs_exclop_finish(fs_info);
4582                         btrfs_info(fs_info, "balance: canceled");
4583                 }
4584         }
4585
4586         BUG_ON(fs_info->balance_ctl ||
4587                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4588         atomic_dec(&fs_info->balance_cancel_req);
4589         mutex_unlock(&fs_info->balance_mutex);
4590         return 0;
4591 }
4592
4593 int btrfs_uuid_scan_kthread(void *data)
4594 {
4595         struct btrfs_fs_info *fs_info = data;
4596         struct btrfs_root *root = fs_info->tree_root;
4597         struct btrfs_key key;
4598         struct btrfs_path *path = NULL;
4599         int ret = 0;
4600         struct extent_buffer *eb;
4601         int slot;
4602         struct btrfs_root_item root_item;
4603         u32 item_size;
4604         struct btrfs_trans_handle *trans = NULL;
4605         bool closing = false;
4606
4607         path = btrfs_alloc_path();
4608         if (!path) {
4609                 ret = -ENOMEM;
4610                 goto out;
4611         }
4612
4613         key.objectid = 0;
4614         key.type = BTRFS_ROOT_ITEM_KEY;
4615         key.offset = 0;
4616
4617         while (1) {
4618                 if (btrfs_fs_closing(fs_info)) {
4619                         closing = true;
4620                         break;
4621                 }
4622                 ret = btrfs_search_forward(root, &key, path,
4623                                 BTRFS_OLDEST_GENERATION);
4624                 if (ret) {
4625                         if (ret > 0)
4626                                 ret = 0;
4627                         break;
4628                 }
4629
4630                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4631                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4632                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4633                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4634                         goto skip;
4635
4636                 eb = path->nodes[0];
4637                 slot = path->slots[0];
4638                 item_size = btrfs_item_size_nr(eb, slot);
4639                 if (item_size < sizeof(root_item))
4640                         goto skip;
4641
4642                 read_extent_buffer(eb, &root_item,
4643                                    btrfs_item_ptr_offset(eb, slot),
4644                                    (int)sizeof(root_item));
4645                 if (btrfs_root_refs(&root_item) == 0)
4646                         goto skip;
4647
4648                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4649                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4650                         if (trans)
4651                                 goto update_tree;
4652
4653                         btrfs_release_path(path);
4654                         /*
4655                          * 1 - subvol uuid item
4656                          * 1 - received_subvol uuid item
4657                          */
4658                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4659                         if (IS_ERR(trans)) {
4660                                 ret = PTR_ERR(trans);
4661                                 break;
4662                         }
4663                         continue;
4664                 } else {
4665                         goto skip;
4666                 }
4667 update_tree:
4668                 btrfs_release_path(path);
4669                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4670                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4671                                                   BTRFS_UUID_KEY_SUBVOL,
4672                                                   key.objectid);
4673                         if (ret < 0) {
4674                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4675                                         ret);
4676                                 break;
4677                         }
4678                 }
4679
4680                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4681                         ret = btrfs_uuid_tree_add(trans,
4682                                                   root_item.received_uuid,
4683                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4684                                                   key.objectid);
4685                         if (ret < 0) {
4686                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4687                                         ret);
4688                                 break;
4689                         }
4690                 }
4691
4692 skip:
4693                 btrfs_release_path(path);
4694                 if (trans) {
4695                         ret = btrfs_end_transaction(trans);
4696                         trans = NULL;
4697                         if (ret)
4698                                 break;
4699                 }
4700
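                /*
                 * Advance the search key in (objectid, type, offset) order:
                 * bump the offset first, reset a lagging type back to
                 * BTRFS_ROOT_ITEM_KEY, and otherwise move on to the next
                 * objectid, so btrfs_search_forward() resumes past the item
                 * we just processed.
                 */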
4701                 if (key.offset < (u64)-1) {
4702                         key.offset++;
4703                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4704                         key.offset = 0;
4705                         key.type = BTRFS_ROOT_ITEM_KEY;
4706                 } else if (key.objectid < (u64)-1) {
4707                         key.offset = 0;
4708                         key.type = BTRFS_ROOT_ITEM_KEY;
4709                         key.objectid++;
4710                 } else {
4711                         break;
4712                 }
4713                 cond_resched();
4714         }
4715
4716 out:
4717         btrfs_free_path(path);
4718         if (trans && !IS_ERR(trans))
4719                 btrfs_end_transaction(trans);
4720         if (ret)
4721                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4722         else if (!closing)
4723                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4724         up(&fs_info->uuid_tree_rescan_sem);
4725         return 0;
4726 }
4727
4728 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4729 {
4730         struct btrfs_trans_handle *trans;
4731         struct btrfs_root *tree_root = fs_info->tree_root;
4732         struct btrfs_root *uuid_root;
4733         struct task_struct *task;
4734         int ret;
4735
4736         /*
4737          * 1 - root node
4738          * 1 - root item
4739          */
4740         trans = btrfs_start_transaction(tree_root, 2);
4741         if (IS_ERR(trans))
4742                 return PTR_ERR(trans);
4743
4744         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4745         if (IS_ERR(uuid_root)) {
4746                 ret = PTR_ERR(uuid_root);
4747                 btrfs_abort_transaction(trans, ret);
4748                 btrfs_end_transaction(trans);
4749                 return ret;
4750         }
4751
4752         fs_info->uuid_root = uuid_root;
4753
4754         ret = btrfs_commit_transaction(trans);
4755         if (ret)
4756                 return ret;
4757
4758         down(&fs_info->uuid_tree_rescan_sem);
4759         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4760         if (IS_ERR(task)) {
4761                 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4762                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4763                 up(&fs_info->uuid_tree_rescan_sem);
4764                 return PTR_ERR(task);
4765         }
4766
4767         return 0;
4768 }
4769
4770 /*
4771  * Shrinking a device means finding all of the device extents past
4772  * the new size, and then following the back refs to the chunks.
4773  * The chunk relocation code actually frees the device extents.
4774  */
4775 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4776 {
4777         struct btrfs_fs_info *fs_info = device->fs_info;
4778         struct btrfs_root *root = fs_info->dev_root;
4779         struct btrfs_trans_handle *trans;
4780         struct btrfs_dev_extent *dev_extent = NULL;
4781         struct btrfs_path *path;
4782         u64 length;
4783         u64 chunk_offset;
4784         int ret;
4785         int slot;
4786         int failed = 0;
4787         bool retried = false;
4788         struct extent_buffer *l;
4789         struct btrfs_key key;
4790         struct btrfs_super_block *super_copy = fs_info->super_copy;
4791         u64 old_total = btrfs_super_total_bytes(super_copy);
4792         u64 old_size = btrfs_device_get_total_bytes(device);
4793         u64 diff;
4794         u64 start;
4795
4796         new_size = round_down(new_size, fs_info->sectorsize);
4797         start = new_size;
4798         diff = round_down(old_size - new_size, fs_info->sectorsize);
4799
4800         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4801                 return -EINVAL;
4802
4803         path = btrfs_alloc_path();
4804         if (!path)
4805                 return -ENOMEM;
4806
4807         path->reada = READA_BACK;
4808
4809         trans = btrfs_start_transaction(root, 0);
4810         if (IS_ERR(trans)) {
4811                 btrfs_free_path(path);
4812                 return PTR_ERR(trans);
4813         }
4814
4815         mutex_lock(&fs_info->chunk_mutex);
4816
4817         btrfs_device_set_total_bytes(device, new_size);
4818         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4819                 device->fs_devices->total_rw_bytes -= diff;
4820                 atomic64_sub(diff, &fs_info->free_chunk_space);
4821         }
4822
4823         /*
4824          * Once the device's size has been set to the new size, ensure all
4825          * in-memory chunks are synced to disk so that the loop below sees them
4826          * and relocates them accordingly.
4827          */
4828         if (contains_pending_extent(device, &start, diff)) {
4829                 mutex_unlock(&fs_info->chunk_mutex);
4830                 ret = btrfs_commit_transaction(trans);
4831                 if (ret)
4832                         goto done;
4833         } else {
4834                 mutex_unlock(&fs_info->chunk_mutex);
4835                 btrfs_end_transaction(trans);
4836         }
4837
4838 again:
4839         key.objectid = device->devid;
4840         key.offset = (u64)-1;
4841         key.type = BTRFS_DEV_EXTENT_KEY;
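
        /*
         * Walk this device's extents backwards from the highest possible
         * offset: each search lands past the last matching item,
         * btrfs_previous_item() steps back to the most recent dev extent,
         * and the scan stops once an extent ends at or below new_size.
         */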
4842
4843         do {
4844                 mutex_lock(&fs_info->reclaim_bgs_lock);
4845                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4846                 if (ret < 0) {
4847                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4848                         goto done;
4849                 }
4850
4851                 ret = btrfs_previous_item(root, path, 0, key.type);
4852                 if (ret) {
4853                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4854                         if (ret < 0)
4855                                 goto done;
4856                         ret = 0;
4857                         btrfs_release_path(path);
4858                         break;
4859                 }
4860
4861                 l = path->nodes[0];
4862                 slot = path->slots[0];
4863                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4864
4865                 if (key.objectid != device->devid) {
4866                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4867                         btrfs_release_path(path);
4868                         break;
4869                 }
4870
4871                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4872                 length = btrfs_dev_extent_length(l, dev_extent);
4873
4874                 if (key.offset + length <= new_size) {
4875                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4876                         btrfs_release_path(path);
4877                         break;
4878                 }
4879
4880                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4881                 btrfs_release_path(path);
4882
4883                 /*
4884                  * We may be relocating the only data chunk we have,
4885                  * which could potentially end up losing the data's
4886                  * raid profile, so let's allocate an empty one in
4887                  * advance.
4888                  */
4889                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4890                 if (ret < 0) {
4891                         mutex_unlock(&fs_info->reclaim_bgs_lock);
4892                         goto done;
4893                 }
4894
4895                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4896                 mutex_unlock(&fs_info->reclaim_bgs_lock);
4897                 if (ret == -ENOSPC) {
4898                         failed++;
4899                 } else if (ret) {
4900                         if (ret == -ETXTBSY) {
4901                                 btrfs_warn(fs_info,
4902                    "could not shrink block group %llu due to active swapfile",
4903                                            chunk_offset);
4904                         }
4905                         goto done;
4906                 }
4907         } while (key.offset-- > 0);
4908
4909         if (failed && !retried) {
4910                 failed = 0;
4911                 retried = true;
4912                 goto again;
4913         } else if (failed && retried) {
4914                 ret = -ENOSPC;
4915                 goto done;
4916         }
4917
4918         /* Shrinking succeeded, else we would be at "done". */
4919         trans = btrfs_start_transaction(root, 0);
4920         if (IS_ERR(trans)) {
4921                 ret = PTR_ERR(trans);
4922                 goto done;
4923         }
4924
4925         mutex_lock(&fs_info->chunk_mutex);
4926         /* Clear all state bits beyond the shrunk device size */
4927         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4928                           CHUNK_STATE_MASK);
4929
4930         btrfs_device_set_disk_total_bytes(device, new_size);
4931         if (list_empty(&device->post_commit_list))
4932                 list_add_tail(&device->post_commit_list,
4933                               &trans->transaction->dev_update_list);
4934
4935         WARN_ON(diff > old_total);
4936         btrfs_set_super_total_bytes(super_copy,
4937                         round_down(old_total - diff, fs_info->sectorsize));
4938         mutex_unlock(&fs_info->chunk_mutex);
4939
4940         btrfs_reserve_chunk_metadata(trans, false);
4941         /* Now btrfs_update_device() will change the on-disk size. */
4942         ret = btrfs_update_device(trans, device);
4943         btrfs_trans_release_chunk_metadata(trans);
4944         if (ret < 0) {
4945                 btrfs_abort_transaction(trans, ret);
4946                 btrfs_end_transaction(trans);
4947         } else {
4948                 ret = btrfs_commit_transaction(trans);
4949         }
4950 done:
4951         btrfs_free_path(path);
4952         if (ret) {
4953                 mutex_lock(&fs_info->chunk_mutex);
4954                 btrfs_device_set_total_bytes(device, old_size);
4955                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4956                         device->fs_devices->total_rw_bytes += diff;
4957                 atomic64_add(diff, &fs_info->free_chunk_space);
4958                 mutex_unlock(&fs_info->chunk_mutex);
4959         }
4960         return ret;
4961 }
4962
4963 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4964                            struct btrfs_key *key,
4965                            struct btrfs_chunk *chunk, int item_size)
4966 {
4967         struct btrfs_super_block *super_copy = fs_info->super_copy;
4968         struct btrfs_disk_key disk_key;
4969         u32 array_size;
4970         u8 *ptr;
4971
4972         lockdep_assert_held(&fs_info->chunk_mutex);
4973
4974         array_size = btrfs_super_sys_array_size(super_copy);
4975         if (array_size + item_size + sizeof(disk_key)
4976                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4977                 return -EFBIG;
4978
4979         ptr = super_copy->sys_chunk_array + array_size;
4980         btrfs_cpu_key_to_disk(&disk_key, key);
4981         memcpy(ptr, &disk_key, sizeof(disk_key));
4982         ptr += sizeof(disk_key);
4983         memcpy(ptr, chunk, item_size);
4984         item_size += sizeof(disk_key);
4985         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4986
4987         return 0;
4988 }
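
/*
 * A sketch of the sys_chunk_array layout the function above appends to
 * (entry sizes vary with the number of stripes):
 *
 *   sys_chunk_array: | disk_key | chunk item | disk_key | chunk item | ...
 *
 * Each entry is a struct btrfs_disk_key immediately followed by its chunk
 * item, and btrfs_super_sys_array_size() tracks the total bytes in use.
 */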
4989
4990 /*
4991  * sort the devices in descending order by max_avail, total_avail
4992  */
4993 static int btrfs_cmp_device_info(const void *a, const void *b)
4994 {
4995         const struct btrfs_device_info *di_a = a;
4996         const struct btrfs_device_info *di_b = b;
4997
4998         if (di_a->max_avail > di_b->max_avail)
4999                 return -1;
5000         if (di_a->max_avail < di_b->max_avail)
5001                 return 1;
5002         if (di_a->total_avail > di_b->total_avail)
5003                 return -1;
5004         if (di_a->total_avail < di_b->total_avail)
5005                 return 1;
5006         return 0;
5007 }
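
/*
 * Illustrative ordering from the comparator above (hypothetical values):
 *
 *   { max_avail = 8G, total_avail = 10G }
 *   { max_avail = 8G, total_avail =  6G }
 *   { max_avail = 4G, total_avail = 20G }
 *
 * i.e. descending by max_avail first, with total_avail as the tie breaker.
 */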
5008
5009 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
5010 {
5011         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5012                 return;
5013
5014         btrfs_set_fs_incompat(info, RAID56);
5015 }
5016
5017 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
5018 {
5019         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
5020                 return;
5021
5022         btrfs_set_fs_incompat(info, RAID1C34);
5023 }
5024
5025 /*
5026  * Structure used internally by btrfs_create_chunk().
5027  * Wraps the needed parameters.
5028  */
5029 struct alloc_chunk_ctl {
5030         u64 start;
5031         u64 type;
5032         /* Total number of stripes to allocate */
5033         int num_stripes;
5034         /* sub_stripes info for map */
5035         int sub_stripes;
5036         /* Stripes per device */
5037         int dev_stripes;
5038         /* Maximum number of devices to use */
5039         int devs_max;
5040         /* Minimum number of devices to use */
5041         int devs_min;
5042         /* ndevs has to be a multiple of this */
5043         int devs_increment;
5044         /* Number of copies */
5045         int ncopies;
5046         /* Number of stripes worth of bytes to store parity information */
5047         int nparity;
5048         u64 max_stripe_size;
5049         u64 max_chunk_size;
5050         u64 dev_extent_min;
5051         u64 stripe_size;
5052         u64 chunk_size;
5053         int ndevs;
5054 };
5055
5056 static void init_alloc_chunk_ctl_policy_regular(
5057                                 struct btrfs_fs_devices *fs_devices,
5058                                 struct alloc_chunk_ctl *ctl)
5059 {
5060         u64 type = ctl->type;
5061
5062         if (type & BTRFS_BLOCK_GROUP_DATA) {
5063                 ctl->max_stripe_size = SZ_1G;
5064                 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
5065         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5066                 /* For larger filesystems, use larger metadata chunks */
5067                 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
5068                         ctl->max_stripe_size = SZ_1G;
5069                 else
5070                         ctl->max_stripe_size = SZ_256M;
5071                 ctl->max_chunk_size = ctl->max_stripe_size;
5072         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5073                 ctl->max_stripe_size = SZ_32M;
5074                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5075                 ctl->devs_max = min_t(int, ctl->devs_max,
5076                                       BTRFS_MAX_DEVS_SYS_CHUNK);
5077         } else {
5078                 BUG();
5079         }
5080
5081         /* We don't want a chunk larger than 10% of writable space */
5082         ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
5083                                   ctl->max_chunk_size);
5084         ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
5085 }
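
/*
 * Worked example for the cap above (hypothetical numbers): with 100GiB of
 * writable space, div_factor(total_rw_bytes, 1) yields 10GiB, so a data
 * chunk would be limited to min(10GiB, BTRFS_MAX_DATA_CHUNK_SIZE).
 */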
5086
5087 static void init_alloc_chunk_ctl_policy_zoned(
5088                                       struct btrfs_fs_devices *fs_devices,
5089                                       struct alloc_chunk_ctl *ctl)
5090 {
5091         u64 zone_size = fs_devices->fs_info->zone_size;
5092         u64 limit;
5093         int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5094         int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5095         u64 min_chunk_size = min_data_stripes * zone_size;
5096         u64 type = ctl->type;
5097
5098         ctl->max_stripe_size = zone_size;
5099         if (type & BTRFS_BLOCK_GROUP_DATA) {
5100                 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5101                                                  zone_size);
5102         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5103                 ctl->max_chunk_size = ctl->max_stripe_size;
5104         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5105                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5106                 ctl->devs_max = min_t(int, ctl->devs_max,
5107                                       BTRFS_MAX_DEVS_SYS_CHUNK);
5108         } else {
5109                 BUG();
5110         }
5111
5112         /* We don't want a chunk larger than 10% of writable space */
5113         limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
5114                                zone_size),
5115                     min_chunk_size);
5116         ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5117         ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5118 }
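
/*
 * Worked example for the sizing above (hypothetical profile values): with
 * zone_size = 256MiB, devs_min = 4, dev_stripes = 1, nparity = 0 and
 * ncopies = 2, we get min_num_stripes = 4, min_data_stripes = 2 and thus
 * min_chunk_size = 512MiB; the 10% limit is rounded down to a zone
 * boundary but never drops below that minimum.
 */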
5119
5120 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5121                                  struct alloc_chunk_ctl *ctl)
5122 {
5123         int index = btrfs_bg_flags_to_raid_index(ctl->type);
5124
5125         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5126         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5127         ctl->devs_max = btrfs_raid_array[index].devs_max;
5128         if (!ctl->devs_max)
5129                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5130         ctl->devs_min = btrfs_raid_array[index].devs_min;
5131         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5132         ctl->ncopies = btrfs_raid_array[index].ncopies;
5133         ctl->nparity = btrfs_raid_array[index].nparity;
5134         ctl->ndevs = 0;
5135
5136         switch (fs_devices->chunk_alloc_policy) {
5137         case BTRFS_CHUNK_ALLOC_REGULAR:
5138                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5139                 break;
5140         case BTRFS_CHUNK_ALLOC_ZONED:
5141                 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5142                 break;
5143         default:
5144                 BUG();
5145         }
5146 }
5147
5148 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5149                               struct alloc_chunk_ctl *ctl,
5150                               struct btrfs_device_info *devices_info)
5151 {
5152         struct btrfs_fs_info *info = fs_devices->fs_info;
5153         struct btrfs_device *device;
5154         u64 total_avail;
5155         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5156         int ret;
5157         int ndevs = 0;
5158         u64 max_avail;
5159         u64 dev_offset;
5160
5161         /*
5162          * in the first pass through the devices list, we gather information
5163          * about the available holes on each device.
5164          */
5165         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5166                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5167                         WARN(1, KERN_ERR
5168                                "BTRFS: read-only device in alloc_list\n");
5169                         continue;
5170                 }
5171
5172                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5173                                         &device->dev_state) ||
5174                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5175                         continue;
5176
5177                 if (device->total_bytes > device->bytes_used)
5178                         total_avail = device->total_bytes - device->bytes_used;
5179                 else
5180                         total_avail = 0;
5181
5182                 /* If there is no space on this device, skip it. */
5183                 if (total_avail < ctl->dev_extent_min)
5184                         continue;
5185
5186                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5187                                            &max_avail);
5188                 if (ret && ret != -ENOSPC)
5189                         return ret;
5190
5191                 if (ret == 0)
5192                         max_avail = dev_extent_want;
5193
5194                 if (max_avail < ctl->dev_extent_min) {
5195                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
5196                                 btrfs_debug(info,
5197                         "%s: devid %llu has no free space, have=%llu want=%llu",
5198                                             __func__, device->devid, max_avail,
5199                                             ctl->dev_extent_min);
5200                         continue;
5201                 }
5202
5203                 if (ndevs == fs_devices->rw_devices) {
5204                         WARN(1, "%s: found more than %llu devices\n",
5205                              __func__, fs_devices->rw_devices);
5206                         break;
5207                 }
5208                 devices_info[ndevs].dev_offset = dev_offset;
5209                 devices_info[ndevs].max_avail = max_avail;
5210                 devices_info[ndevs].total_avail = total_avail;
5211                 devices_info[ndevs].dev = device;
5212                 ++ndevs;
5213         }
5214         ctl->ndevs = ndevs;
5215
5216         /*
5217          * now sort the devices by hole size / available space
5218          */
5219         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5220              btrfs_cmp_device_info, NULL);
5221
5222         return 0;
5223 }
5224
5225 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5226                                       struct btrfs_device_info *devices_info)
5227 {
5228         /* Number of stripes that count for block group size */
5229         int data_stripes;
5230
5231         /*
5232          * The primary goal is to maximize the number of stripes, so use as
5233          * many devices as possible, even if the stripes are not maximum sized.
5234          *
5235          * The DUP profile stores more than one stripe per device;
5236          * max_avail is the total size, so we have to adjust.
5237          */
5238         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5239                                    ctl->dev_stripes);
5240         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5241
5242         /* This will have to be fixed for RAID1 and RAID10 over more drives */
5243         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5244
5245         /*
5246          * Use the number of data stripes to figure out how big this chunk is
5247          * really going to be in terms of logical address space, and compare
5248          * that answer with the max chunk size. If it's higher, we try to
5249          * reduce stripe_size.
5250          */
5251         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5252                 /*
5253                  * Reduce stripe_size, round it up to a 16MB boundary again and
5254                  * then use it, unless it ends up being even bigger than the
5255                  * previous value we had already.
5256                  */
5257                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5258                                                         data_stripes), SZ_16M),
5259                                        ctl->stripe_size);
5260         }
5261
5262         /* Align to BTRFS_STRIPE_LEN */
5263         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5264         ctl->chunk_size = ctl->stripe_size * data_stripes;
5265
5266         return 0;
5267 }
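
/*
 * Worked example for the reduction above (hypothetical numbers): with
 * ndevs = 4, dev_stripes = 1, ncopies = 1, nparity = 0 and a smallest
 * max_avail of 4GiB, num_stripes = data_stripes = 4 and the initial
 * stripe_size is 4GiB. If max_chunk_size is 10GiB, 4 * 4GiB exceeds it,
 * so stripe_size becomes round_up(10GiB / 4, 16MiB) = 2.5GiB and the
 * chunk ends up at exactly 10GiB.
 */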
5268
5269 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5270                                     struct btrfs_device_info *devices_info)
5271 {
5272         u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5273         /* Number of stripes that count for block group size */
5274         int data_stripes;
5275
5276         /*
5277          * It should hold because:
5278          *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
5279          */
5280         ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5281
5282         ctl->stripe_size = zone_size;
5283         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5284         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5285
5286         /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
5287         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5288                 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5289                                              ctl->stripe_size) + ctl->nparity,
5290                                      ctl->dev_stripes);
5291                 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5292                 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5293                 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5294         }
5295
5296         ctl->chunk_size = ctl->stripe_size * data_stripes;
5297
5298         return 0;
5299 }
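
/*
 * Worked example for the ndevs reduction above (hypothetical numbers):
 * with zone_size = 256MiB, dev_stripes = 1, ncopies = 1, nparity = 0,
 * ndevs = 10 and max_chunk_size = 1GiB, the initial chunk would span
 * 10 * 256MiB = 2.5GiB, so ndevs is recomputed as
 * (1GiB * 1 / 256MiB + 0) / 1 = 4 and the chunk becomes exactly 1GiB.
 */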
5300
5301 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5302                               struct alloc_chunk_ctl *ctl,
5303                               struct btrfs_device_info *devices_info)
5304 {
5305         struct btrfs_fs_info *info = fs_devices->fs_info;
5306
5307         /*
5308          * Round down to the number of usable stripes. devs_increment can be
5309          * any number, so we can't use round_down(), which requires a power
5310          * of 2; rounddown() is safe here.
5311          */
5312         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5313
5314         if (ctl->ndevs < ctl->devs_min) {
5315                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5316                         btrfs_debug(info,
5317         "%s: not enough devices with free space: have=%d minimum required=%d",
5318                                     __func__, ctl->ndevs, ctl->devs_min);
5319                 }
5320                 return -ENOSPC;
5321         }
5322
5323         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5324
5325         switch (fs_devices->chunk_alloc_policy) {
5326         case BTRFS_CHUNK_ALLOC_REGULAR:
5327                 return decide_stripe_size_regular(ctl, devices_info);
5328         case BTRFS_CHUNK_ALLOC_ZONED:
5329                 return decide_stripe_size_zoned(ctl, devices_info);
5330         default:
5331                 BUG();
5332         }
5333 }
5334
5335 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
5336                         struct alloc_chunk_ctl *ctl,
5337                         struct btrfs_device_info *devices_info)
5338 {
5339         struct btrfs_fs_info *info = trans->fs_info;
5340         struct map_lookup *map = NULL;
5341         struct extent_map_tree *em_tree;
5342         struct btrfs_block_group *block_group;
5343         struct extent_map *em;
5344         u64 start = ctl->start;
5345         u64 type = ctl->type;
5346         int ret;
5347         int i;
5348         int j;
5349
5350         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5351         if (!map)
5352                 return ERR_PTR(-ENOMEM);
5353         map->num_stripes = ctl->num_stripes;
5354
5355         for (i = 0; i < ctl->ndevs; ++i) {
5356                 for (j = 0; j < ctl->dev_stripes; ++j) {
5357                         int s = i * ctl->dev_stripes + j;
5358                         map->stripes[s].dev = devices_info[i].dev;
5359                         map->stripes[s].physical = devices_info[i].dev_offset +
5360                                                    j * ctl->stripe_size;
5361                 }
5362         }
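        /*
         * Illustrative layout (hypothetical): with ndevs = 2 and
         * dev_stripes = 2, stripes 0 and 1 land on devices_info[0].dev at
         * dev_offset and dev_offset + stripe_size, and stripes 2 and 3
         * land on devices_info[1].dev the same way.
         */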
5363         map->stripe_len = BTRFS_STRIPE_LEN;
5364         map->io_align = BTRFS_STRIPE_LEN;
5365         map->io_width = BTRFS_STRIPE_LEN;
5366         map->type = type;
5367         map->sub_stripes = ctl->sub_stripes;
5368
5369         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5370
5371         em = alloc_extent_map();
5372         if (!em) {
5373                 kfree(map);
5374                 return ERR_PTR(-ENOMEM);
5375         }
5376         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5377         em->map_lookup = map;
5378         em->start = start;
5379         em->len = ctl->chunk_size;
5380         em->block_start = 0;
5381         em->block_len = em->len;
5382         em->orig_block_len = ctl->stripe_size;
5383
5384         em_tree = &info->mapping_tree;
5385         write_lock(&em_tree->lock);
5386         ret = add_extent_mapping(em_tree, em, 0);
5387         if (ret) {
5388                 write_unlock(&em_tree->lock);
5389                 free_extent_map(em);
5390                 return ERR_PTR(ret);
5391         }
5392         write_unlock(&em_tree->lock);
5393
5394         block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5395         if (IS_ERR(block_group))
5396                 goto error_del_extent;
5397
5398         for (i = 0; i < map->num_stripes; i++) {
5399                 struct btrfs_device *dev = map->stripes[i].dev;
5400
5401                 btrfs_device_set_bytes_used(dev,
5402                                             dev->bytes_used + ctl->stripe_size);
5403                 if (list_empty(&dev->post_commit_list))
5404                         list_add_tail(&dev->post_commit_list,
5405                                       &trans->transaction->dev_update_list);
5406         }
5407
5408         atomic64_sub(ctl->stripe_size * map->num_stripes,
5409                      &info->free_chunk_space);
5410
5411         free_extent_map(em);
5412         check_raid56_incompat_flag(info, type);
5413         check_raid1c34_incompat_flag(info, type);
5414
5415         return block_group;
5416
5417 error_del_extent:
5418         write_lock(&em_tree->lock);
5419         remove_extent_mapping(em_tree, em);
5420         write_unlock(&em_tree->lock);
5421
5422         /* One for our allocation */
5423         free_extent_map(em);
5424         /* One for the tree reference */
5425         free_extent_map(em);
5426
5427         return block_group;
5428 }
5429
5430 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
5431                                             u64 type)
5432 {
5433         struct btrfs_fs_info *info = trans->fs_info;
5434         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5435         struct btrfs_device_info *devices_info = NULL;
5436         struct alloc_chunk_ctl ctl;
5437         struct btrfs_block_group *block_group;
5438         int ret;
5439
5440         lockdep_assert_held(&info->chunk_mutex);
5441
5442         if (!alloc_profile_is_valid(type, 0)) {
5443                 ASSERT(0);
5444                 return ERR_PTR(-EINVAL);
5445         }
5446
5447         if (list_empty(&fs_devices->alloc_list)) {
5448                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5449                         btrfs_debug(info, "%s: no writable device", __func__);
5450                 return ERR_PTR(-ENOSPC);
5451         }
5452
5453         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5454                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5455                 ASSERT(0);
5456                 return ERR_PTR(-EINVAL);
5457         }
5458
5459         ctl.start = find_next_chunk(info);
5460         ctl.type = type;
5461         init_alloc_chunk_ctl(fs_devices, &ctl);
5462
5463         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5464                                GFP_NOFS);
5465         if (!devices_info)
5466                 return ERR_PTR(-ENOMEM);
5467
5468         ret = gather_device_info(fs_devices, &ctl, devices_info);
5469         if (ret < 0) {
5470                 block_group = ERR_PTR(ret);
5471                 goto out;
5472         }
5473
5474         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5475         if (ret < 0) {
5476                 block_group = ERR_PTR(ret);
5477                 goto out;
5478         }
5479
5480         block_group = create_chunk(trans, &ctl, devices_info);
5481
5482 out:
5483         kfree(devices_info);
5484         return block_group;
5485 }
5486
5487 /*
5488  * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
5489  * phase 1 of chunk allocation. It belongs to phase 2 only when allocating
5490  * system chunks.
5491  *
5492  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5493  * phases.
5494  */
5495 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5496                                      struct btrfs_block_group *bg)
5497 {
5498         struct btrfs_fs_info *fs_info = trans->fs_info;
5499         struct btrfs_root *extent_root = fs_info->extent_root;
5500         struct btrfs_root *chunk_root = fs_info->chunk_root;
5501         struct btrfs_key key;
5502         struct btrfs_chunk *chunk;
5503         struct btrfs_stripe *stripe;
5504         struct extent_map *em;
5505         struct map_lookup *map;
5506         size_t item_size;
5507         int i;
5508         int ret;
5509
5510         /*
5511          * We take the chunk_mutex for 2 reasons:
5512          *
5513          * 1) Updates and insertions in the chunk btree must be done while holding
5514          *    the chunk_mutex, as well as updating the system chunk array in the
5515          *    superblock. See the comment on top of btrfs_chunk_alloc() for the
5516          *    details;
5517          *
5518          * 2) To prevent races with the final phase of a device replace operation
5519          *    that replaces the device object associated with the map's stripes,
5520          *    because the device object's id can change at any time during that
5521          *    final phase of the device replace operation
5522          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5523          *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5524          *    which would cause a failure when updating the device item, which does
5525          *    not exist, or persisting a stripe of the chunk item with such an ID.
5526          *    Here we can't use the device_list_mutex because our caller already
5527          *    has locked the chunk_mutex, and the final phase of device replace
5528          *    acquires both mutexes - first the device_list_mutex and then the
5529          *    chunk_mutex. Using any of those two mutexes protects us from a
5530          *    concurrent device replace.
5531          */
5532         lockdep_assert_held(&fs_info->chunk_mutex);
5533
5534         em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
5535         if (IS_ERR(em)) {
5536                 ret = PTR_ERR(em);
5537                 btrfs_abort_transaction(trans, ret);
5538                 return ret;
5539         }
5540
5541         map = em->map_lookup;
5542         item_size = btrfs_chunk_item_size(map->num_stripes);
5543
5544         chunk = kzalloc(item_size, GFP_NOFS);
5545         if (!chunk) {
5546                 ret = -ENOMEM;
5547                 btrfs_abort_transaction(trans, ret);
5548                 goto out;
5549         }
5550
5551         for (i = 0; i < map->num_stripes; i++) {
5552                 struct btrfs_device *device = map->stripes[i].dev;
5553
5554                 ret = btrfs_update_device(trans, device);
5555                 if (ret)
5556                         goto out;
5557         }
5558
5559         stripe = &chunk->stripe;
5560         for (i = 0; i < map->num_stripes; i++) {
5561                 struct btrfs_device *device = map->stripes[i].dev;
5562                 const u64 dev_offset = map->stripes[i].physical;
5563
5564                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5565                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5566                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5567                 stripe++;
5568         }
5569
5570         btrfs_set_stack_chunk_length(chunk, bg->length);
5571         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5572         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5573         btrfs_set_stack_chunk_type(chunk, map->type);
5574         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5575         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5576         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5577         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5578         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5579
5580         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5581         key.type = BTRFS_CHUNK_ITEM_KEY;
5582         key.offset = bg->start;
5583
5584         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5585         if (ret)
5586                 goto out;
5587
5588         bg->chunk_item_inserted = 1;
5589
5590         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5591                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5592                 if (ret)
5593                         goto out;
5594         }
5595
5596 out:
5597         kfree(chunk);
5598         free_extent_map(em);
5599         return ret;
5600 }
5601
5602 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5603 {
5604         struct btrfs_fs_info *fs_info = trans->fs_info;
5605         u64 alloc_profile;
5606         struct btrfs_block_group *meta_bg;
5607         struct btrfs_block_group *sys_bg;
5608
5609         /*
5610          * When adding a new device for sprouting, the seed device is read-only
5611          * so we must allocate a metadata and a system chunk. But before
5612          * adding the block group items to the extent, device and chunk btrees,
5613          * we must first:
5614          *
5615          * 1) Create both chunks without doing any changes to the btrees, as
5616          *    otherwise we would get -ENOSPC since the block groups from the
5617          *    seed device are read-only;
5618          *
5619          * 2) Add the device item for the new sprout device - finishing the setup
5620          *    of a new block group requires updating the device item in the chunk
5621          *    btree, so it must exist when we attempt to do it. The previous step
5622          *    ensures this does not fail with -ENOSPC.
5623          *
5624          * After that we can add the block group items to their btrees:
5625          * update existing device item in the chunk btree, add a new block group
5626          * item to the extent btree, add a new chunk item to the chunk btree and
5627          * finally add the new device extent items to the devices btree.
5628          */
5629
5630         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5631         meta_bg = btrfs_create_chunk(trans, alloc_profile);
5632         if (IS_ERR(meta_bg))
5633                 return PTR_ERR(meta_bg);
5634
5635         alloc_profile = btrfs_system_alloc_profile(fs_info);
5636         sys_bg = btrfs_create_chunk(trans, alloc_profile);
5637         if (IS_ERR(sys_bg))
5638                 return PTR_ERR(sys_bg);
5639
5640         return 0;
5641 }
5642
5643 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5644 {
5645         const int index = btrfs_bg_flags_to_raid_index(map->type);
5646
5647         return btrfs_raid_array[index].tolerated_failures;
5648 }
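/*
 * Usage example (values from btrfs_raid_array): a RAID1 or RAID10 chunk
 * tolerates one failed device, RAID6 two, RAID1C4 three, RAID0 none. So
 * for a hypothetical map with map->type == BTRFS_BLOCK_GROUP_RAID6:
 *
 *	btrfs_chunk_max_errors(map) == 2
 */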
5649
5650 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5651 {
5652         struct extent_map *em;
5653         struct map_lookup *map;
5654         int readonly = 0;
5655         int miss_ndevs = 0;
5656         int i;
5657
5658         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5659         if (IS_ERR(em))
5660                 return 1;
5661
5662         map = em->map_lookup;
5663         for (i = 0; i < map->num_stripes; i++) {
5664                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5665                                         &map->stripes[i].dev->dev_state)) {
5666                         miss_ndevs++;
5667                         continue;
5668                 }
5669                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5670                                         &map->stripes[i].dev->dev_state)) {
5671                         readonly = 1;
5672                         goto end;
5673                 }
5674         }
5675
5676         /*
5677          * If the number of missing devices is larger than max errors,
5678          * we cannot write data into that chunk successfully, so
5679          * mark it read-only.
5680          */
5681         if (miss_ndevs > btrfs_chunk_max_errors(map))
5682                 readonly = 1;
5683 end:
5684         free_extent_map(em);
5685         return readonly;
5686 }
5687
5688 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5689 {
5690         struct extent_map *em;
5691
5692         while (1) {
5693                 write_lock(&tree->lock);
5694                 em = lookup_extent_mapping(tree, 0, (u64)-1);
5695                 if (em)
5696                         remove_extent_mapping(tree, em);
5697                 write_unlock(&tree->lock);
5698                 if (!em)
5699                         break;
5700                 /* once for us */
5701                 free_extent_map(em);
5702                 /* once for the tree */
5703                 free_extent_map(em);
5704         }
5705 }
5706
5707 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5708 {
5709         struct extent_map *em;
5710         struct map_lookup *map;
5711         int ret;
5712
5713         em = btrfs_get_chunk_map(fs_info, logical, len);
5714         if (IS_ERR(em))
5715                 /*
5716                  * We could return errors for these cases, but that could get
5717                  * ugly and we'd probably do the same thing anyway: nothing
5718                  * else but exit. So return 1 so the callers don't try to use
5719                  * other copies.
5720                  */
5721                 return 1;
5722
5723         map = em->map_lookup;
5724         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5725                 ret = map->num_stripes;
5726         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5727                 ret = map->sub_stripes;
5728         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5729                 ret = 2;
5730         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5731                 /*
5732                  * There could be two corrupted data stripes, so we need
5733                  * to retry in a loop in order to rebuild the correct data.
5734                  *
5735                  * Fail one stripe at a time on every retry, except the
5736                  * stripe under reconstruction.
5737                  */
5738                 ret = map->num_stripes;
5739         else
5740                 ret = 1;
5741         free_extent_map(em);
5742
5743         down_read(&fs_info->dev_replace.rwsem);
5744         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5745             fs_info->dev_replace.tgtdev)
5746                 ret++;
5747         up_read(&fs_info->dev_replace.rwsem);
5748
5749         return ret;
5750 }
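/*
 * Worked example for btrfs_num_copies() (illustrative): given the mapping
 * logic above,
 *
 *	RAID1/DUP   -> map->num_stripes  (2 copies)
 *	RAID1C3     -> map->num_stripes  (3 copies)
 *	RAID10      -> map->sub_stripes  (2 copies)
 *	RAID5       -> 2 (the data plus reconstruction from parity)
 *	single      -> 1
 *
 * with one extra copy reported while a device replace with a target
 * device attached is ongoing.
 */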
5751
5752 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5753                                     u64 logical)
5754 {
5755         struct extent_map *em;
5756         struct map_lookup *map;
5757         unsigned long len = fs_info->sectorsize;
5758
5759         em = btrfs_get_chunk_map(fs_info, logical, len);
5760
5761         if (!WARN_ON(IS_ERR(em))) {
5762                 map = em->map_lookup;
5763                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5764                         len = map->stripe_len * nr_data_stripes(map);
5765                 free_extent_map(em);
5766         }
5767         return len;
5768 }
5769
5770 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5771 {
5772         struct extent_map *em;
5773         struct map_lookup *map;
5774         int ret = 0;
5775
5776         em = btrfs_get_chunk_map(fs_info, logical, len);
5777
5778         if (!WARN_ON(IS_ERR(em))) {
5779                 map = em->map_lookup;
5780                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5781                         ret = 1;
5782                 free_extent_map(em);
5783         }
5784         return ret;
5785 }
5786
5787 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5788                             struct map_lookup *map, int first,
5789                             int dev_replace_is_ongoing)
5790 {
5791         int i;
5792         int num_stripes;
5793         int preferred_mirror;
5794         int tolerance;
5795         struct btrfs_device *srcdev;
5796
5797         ASSERT((map->type &
5798                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5799
5800         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5801                 num_stripes = map->sub_stripes;
5802         else
5803                 num_stripes = map->num_stripes;
5804
5805         switch (fs_info->fs_devices->read_policy) {
5806         default:
5807                 /* Shouldn't happen, just warn and use pid instead of failing */
5808                 btrfs_warn_rl(fs_info,
5809                               "unknown read_policy type %u, reset to pid",
5810                               fs_info->fs_devices->read_policy);
5811                 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5812                 fallthrough;
5813         case BTRFS_READ_POLICY_PID:
5814                 preferred_mirror = first + (current->pid % num_stripes);
5815                 break;
5816         }
5817
5818         if (dev_replace_is_ongoing &&
5819             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5820              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5821                 srcdev = fs_info->dev_replace.srcdev;
5822         else
5823                 srcdev = NULL;
5824
5825         /*
5826          * Try to avoid the drive that is the source drive for a
5827          * dev-replace procedure; only choose it if no other non-missing
5828          * mirror is available.
5829          */
5830         for (tolerance = 0; tolerance < 2; tolerance++) {
5831                 if (map->stripes[preferred_mirror].dev->bdev &&
5832                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5833                         return preferred_mirror;
5834                 for (i = first; i < first + num_stripes; i++) {
5835                         if (map->stripes[i].dev->bdev &&
5836                             (tolerance || map->stripes[i].dev != srcdev))
5837                                 return i;
5838                 }
5839         }
5840
5841         /* We couldn't find one that doesn't fail.  Just return something
5842          * and the io error handling code will clean up eventually.
5843          */
5844         return preferred_mirror;
5845 }
5846
5847 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5848 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5849 {
5850         int i;
5851         int again = 1;
5852
5853         while (again) {
5854                 again = 0;
5855                 for (i = 0; i < num_stripes - 1; i++) {
5856                         /* Swap if parity is on a smaller index */
5857                         if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5858                                 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5859                                 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5860                                 again = 1;
5861                         }
5862                 }
5863         }
5864 }
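/*
 * Illustrative run of sort_parity_stripes() for a 3-device RAID5 stripe
 * set where the rotation placed the parity stripe first (RAID5_P_STRIPE
 * is (u64)-2, so it compares greater than any logical address):
 *
 *	raid_map before: { RAID5_P_STRIPE, 0, 65536 }
 *	raid_map after:  { 0, 65536, RAID5_P_STRIPE }
 *
 * The matching entries of bbio->stripes are swapped along with raid_map,
 * so data stripes end up first and parity/syndrome stripes last.
 */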
5865
5866 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5867 {
5868         struct btrfs_bio *bbio = kzalloc(
5869                  /* the size of the btrfs_bio */
5870                 sizeof(struct btrfs_bio) +
5871                 /* plus the variable array for the stripes */
5872                 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5873                 /* plus the variable array for the tgt dev */
5874                 sizeof(int) * (real_stripes) +
5875                 /*
5876                  * plus the raid_map, which includes both the tgt dev
5877                  * and the stripes
5878                  */
5879                 sizeof(u64) * (total_stripes),
5880                 GFP_NOFS|__GFP_NOFAIL);
5881
5882         atomic_set(&bbio->error, 0);
5883         refcount_set(&bbio->refs, 1);
5884
5885         bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5886         bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5887
5888         return bbio;
5889 }
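/*
 * Illustrative memory layout of the single allocation made above, for
 * total_stripes == 3 and real_stripes == 3:
 *
 *	+------------------+--------------------+------------+-----------+
 *	| struct btrfs_bio | stripes[3]         | tgtdev_map | raid_map  |
 *	|                  | (btrfs_bio_stripe) | (3 * int)  | (3 * u64) |
 *	+------------------+--------------------+------------+-----------+
 *
 * tgtdev_map and raid_map are carved out of the same buffer, which is why
 * they are set up as pointers past the stripes array instead of being
 * allocated separately.
 */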
5890
5891 void btrfs_get_bbio(struct btrfs_bio *bbio)
5892 {
5893         WARN_ON(!refcount_read(&bbio->refs));
5894         refcount_inc(&bbio->refs);
5895 }
5896
5897 void btrfs_put_bbio(struct btrfs_bio *bbio)
5898 {
5899         if (!bbio)
5900                 return;
5901         if (refcount_dec_and_test(&bbio->refs))
5902                 kfree(bbio);
5903 }
5904
5905 /* Can REQ_OP_DISCARD be sent with other REQ_OPs like REQ_OP_WRITE? */
5906 /*
5907  * Please note that discard won't be sent to the target device of a
5908  * device replace operation.
5909  */
5910 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5911                                          u64 logical, u64 *length_ret,
5912                                          struct btrfs_bio **bbio_ret)
5913 {
5914         struct extent_map *em;
5915         struct map_lookup *map;
5916         struct btrfs_bio *bbio;
5917         u64 length = *length_ret;
5918         u64 offset;
5919         u64 stripe_nr;
5920         u64 stripe_nr_end;
5921         u64 stripe_end_offset;
5922         u64 stripe_cnt;
5923         u64 stripe_len;
5924         u64 stripe_offset;
5925         u64 num_stripes;
5926         u32 stripe_index;
5927         u32 factor = 0;
5928         u32 sub_stripes = 0;
5929         u64 stripes_per_dev = 0;
5930         u32 remaining_stripes = 0;
5931         u32 last_stripe = 0;
5932         int ret = 0;
5933         int i;
5934
5935         /* Discard always returns a bbio */
5936         ASSERT(bbio_ret);
5937
5938         em = btrfs_get_chunk_map(fs_info, logical, length);
5939         if (IS_ERR(em))
5940                 return PTR_ERR(em);
5941
5942         map = em->map_lookup;
5943         /* we don't discard raid56 yet */
5944         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5945                 ret = -EOPNOTSUPP;
5946                 goto out;
5947         }
5948
5949         offset = logical - em->start;
5950         length = min_t(u64, em->start + em->len - logical, length);
5951         *length_ret = length;
5952
5953         stripe_len = map->stripe_len;
5954         /*
5955          * stripe_nr counts the total number of stripes we have to stride
5956          * to get to this block
5957          */
5958         stripe_nr = div64_u64(offset, stripe_len);
5959
5960         /* stripe_offset is the offset of this block in its stripe */
5961         stripe_offset = offset - stripe_nr * stripe_len;
5962
5963         stripe_nr_end = round_up(offset + length, map->stripe_len);
5964         stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5965         stripe_cnt = stripe_nr_end - stripe_nr;
5966         stripe_end_offset = stripe_nr_end * map->stripe_len -
5967                             (offset + length);
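        /*
         * Worked example (illustrative, with a 64K stripe_len): for
         * offset == 96K and length == 144K:
         *
         *	stripe_nr         = 96K / 64K                       = 1
         *	stripe_offset     = 96K - 1 * 64K                   = 32K
         *	stripe_nr_end     = round_up(96K + 144K, 64K) / 64K = 4
         *	stripe_cnt        = 4 - 1                           = 3
         *	stripe_end_offset = 4 * 64K - (96K + 144K)          = 16K
         */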
5968         /*
5969          * After this, stripe_nr is the number of stripes on this
5970          * device we have to walk to find the data, and stripe_index is
5971          * the index of our device in the stripe array.
5972          */
5973         num_stripes = 1;
5974         stripe_index = 0;
5975         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5976                          BTRFS_BLOCK_GROUP_RAID10)) {
5977                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5978                         sub_stripes = 1;
5979                 else
5980                         sub_stripes = map->sub_stripes;
5981
5982                 factor = map->num_stripes / sub_stripes;
5983                 num_stripes = min_t(u64, map->num_stripes,
5984                                     sub_stripes * stripe_cnt);
5985                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5986                 stripe_index *= sub_stripes;
5987                 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5988                                               &remaining_stripes);
5989                 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5990                 last_stripe *= sub_stripes;
5991         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5992                                 BTRFS_BLOCK_GROUP_DUP)) {
5993                 num_stripes = map->num_stripes;
5994         } else {
5995                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5996                                         &stripe_index);
5997         }
5998
5999         bbio = alloc_btrfs_bio(num_stripes, 0);
6000         if (!bbio) {
6001                 ret = -ENOMEM;
6002                 goto out;
6003         }
6004
6005         for (i = 0; i < num_stripes; i++) {
6006                 bbio->stripes[i].physical =
6007                         map->stripes[stripe_index].physical +
6008                         stripe_offset + stripe_nr * map->stripe_len;
6009                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6010
6011                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
6012                                  BTRFS_BLOCK_GROUP_RAID10)) {
6013                         bbio->stripes[i].length = stripes_per_dev *
6014                                 map->stripe_len;
6015
6016                         if (i / sub_stripes < remaining_stripes)
6017                                 bbio->stripes[i].length +=
6018                                         map->stripe_len;
6019
6020                         /*
6021                          * Special for the first stripe and
6022                          * the last stripe:
6023                          *
6024                          * |-------|...|-------|
6025                          *     |----------|
6026                          *    off     end_off
6027                          */
6028                         if (i < sub_stripes)
6029                                 bbio->stripes[i].length -=
6030                                         stripe_offset;
6031
6032                         if (stripe_index >= last_stripe &&
6033                             stripe_index <= (last_stripe +
6034                                              sub_stripes - 1))
6035                                 bbio->stripes[i].length -=
6036                                         stripe_end_offset;
6037
6038                         if (i == sub_stripes - 1)
6039                                 stripe_offset = 0;
6040                 } else {
6041                         bbio->stripes[i].length = length;
6042                 }
6043
6044                 stripe_index++;
6045                 if (stripe_index == map->num_stripes) {
6046                         stripe_index = 0;
6047                         stripe_nr++;
6048                 }
6049         }
6050
6051         *bbio_ret = bbio;
6052         bbio->map_type = map->type;
6053         bbio->num_stripes = num_stripes;
6054 out:
6055         free_extent_map(em);
6056         return ret;
6057 }
6058
6059 /*
6060  * In dev-replace case, for repair case (that's the only case where the mirror
6061  * is selected explicitly when calling btrfs_map_block), blocks left of the
6062  * left cursor can also be read from the target drive.
6063  *
6064  * For BTRFS_MAP_GET_READ_MIRRORS, the target drive is added as the last one
6065  * to the array of stripes.
6066  * For BTRFS_MAP_READ, it also needs to be supported using the same mirror number.
6067  *
6068  * If the requested block is not left of the left cursor, EIO is returned. This
6069  * can happen because btrfs_num_copies() returns one more in the dev-replace
6070  * case.
6071  */
6072 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6073                                          u64 logical, u64 length,
6074                                          u64 srcdev_devid, int *mirror_num,
6075                                          u64 *physical)
6076 {
6077         struct btrfs_bio *bbio = NULL;
6078         int num_stripes;
6079         int index_srcdev = 0;
6080         int found = 0;
6081         u64 physical_of_found = 0;
6082         int i;
6083         int ret = 0;
6084
6085         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
6086                                 logical, &length, &bbio, 0, 0);
6087         if (ret) {
6088                 ASSERT(bbio == NULL);
6089                 return ret;
6090         }
6091
6092         num_stripes = bbio->num_stripes;
6093         if (*mirror_num > num_stripes) {
6094                 /*
6095                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6096                  * which means that the requested area is not left of the
6097                  * left cursor.
6098                  */
6099                 btrfs_put_bbio(bbio);
6100                 return -EIO;
6101         }
6102
6103         /*
6104          * Process the rest of the function using the mirror_num of the source
6105          * drive. Therefore look it up first.  At the end, patch the device
6106          * pointer to that of the target drive.
6107          */
6108         for (i = 0; i < num_stripes; i++) {
6109                 if (bbio->stripes[i].dev->devid != srcdev_devid)
6110                         continue;
6111
6112                 /*
6113                  * In case of DUP, in order to keep it simple, only add the
6114                  * mirror with the lowest physical address
6115                  */
6116                 if (found &&
6117                     physical_of_found <= bbio->stripes[i].physical)
6118                         continue;
6119
6120                 index_srcdev = i;
6121                 found = 1;
6122                 physical_of_found = bbio->stripes[i].physical;
6123         }
6124
6125         btrfs_put_bbio(bbio);
6126
6127         ASSERT(found);
6128         if (!found)
6129                 return -EIO;
6130
6131         *mirror_num = index_srcdev + 1;
6132         *physical = physical_of_found;
6133         return ret;
6134 }
6135
6136 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6137 {
6138         struct btrfs_block_group *cache;
6139         bool ret;
6140
6141         /* A non-zoned filesystem does not use the "to_copy" flag */
6142         if (!btrfs_is_zoned(fs_info))
6143                 return false;
6144
6145         cache = btrfs_lookup_block_group(fs_info, logical);
6146
6147         spin_lock(&cache->lock);
6148         ret = cache->to_copy;
6149         spin_unlock(&cache->lock);
6150
6151         btrfs_put_block_group(cache);
6152         return ret;
6153 }
6154
6155 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6156                                       struct btrfs_bio **bbio_ret,
6157                                       struct btrfs_dev_replace *dev_replace,
6158                                       u64 logical,
6159                                       int *num_stripes_ret, int *max_errors_ret)
6160 {
6161         struct btrfs_bio *bbio = *bbio_ret;
6162         u64 srcdev_devid = dev_replace->srcdev->devid;
6163         int tgtdev_indexes = 0;
6164         int num_stripes = *num_stripes_ret;
6165         int max_errors = *max_errors_ret;
6166         int i;
6167
6168         if (op == BTRFS_MAP_WRITE) {
6169                 int index_where_to_add;
6170
6171                 /*
6172                  * A block group which has "to_copy" set will eventually be
6173                  * copied by the dev-replace process. We can avoid cloning the IO here.
6174                  */
6175                 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6176                         return;
6177
6178                 /*
6179                  * Duplicate the write operations while the dev replace
6180                  * procedure is running. Since the copying of the old disk to
6181                  * the new disk takes place at run time while the filesystem is
6182                  * mounted writable, the regular write operations to the old
6183                  * disk have to be duplicated to go to the new disk as well.
6184                  *
6185                  * Note that device->missing is handled by the caller, and that
6186                  * the write to the old disk is already set up in the stripes
6187                  * array.
6188                  */
6189                 index_where_to_add = num_stripes;
6190                 for (i = 0; i < num_stripes; i++) {
6191                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
6192                                 /* write to new disk, too */
6193                                 struct btrfs_bio_stripe *new =
6194                                         bbio->stripes + index_where_to_add;
6195                                 struct btrfs_bio_stripe *old =
6196                                         bbio->stripes + i;
6197
6198                                 new->physical = old->physical;
6199                                 new->length = old->length;
6200                                 new->dev = dev_replace->tgtdev;
6201                                 bbio->tgtdev_map[i] = index_where_to_add;
6202                                 index_where_to_add++;
6203                                 max_errors++;
6204                                 tgtdev_indexes++;
6205                         }
6206                 }
6207                 num_stripes = index_where_to_add;
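                /*
                 * Illustrative example: for a RAID1 write with stripes
                 * { devA, srcdev } while srcdev is being replaced, the loop
                 * above appends a stripe for the target device with the same
                 * physical offset as the srcdev stripe, yielding
                 * { devA, srcdev, tgtdev } with max_errors bumped by one.
                 */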
6208         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6209                 int index_srcdev = 0;
6210                 int found = 0;
6211                 u64 physical_of_found = 0;
6212
6213                 /*
6214                  * During the dev-replace procedure, the target drive can also
6215                  * be used to read data in case it is needed to repair a corrupt
6216                  * block elsewhere. This is possible if the requested area is
6217                  * left of the left cursor. In this area, the target drive is a
6218                  * full copy of the source drive.
6219                  */
6220                 for (i = 0; i < num_stripes; i++) {
6221                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
6222                                 /*
6223                                  * In case of DUP, in order to keep it simple,
6224                                  * only add the mirror with the lowest physical
6225                                  * address
6226                                  */
6227                                 if (found &&
6228                                     physical_of_found <=
6229                                      bbio->stripes[i].physical)
6230                                         continue;
6231                                 index_srcdev = i;
6232                                 found = 1;
6233                                 physical_of_found = bbio->stripes[i].physical;
6234                         }
6235                 }
6236                 if (found) {
6237                         struct btrfs_bio_stripe *tgtdev_stripe =
6238                                 bbio->stripes + num_stripes;
6239
6240                         tgtdev_stripe->physical = physical_of_found;
6241                         tgtdev_stripe->length =
6242                                 bbio->stripes[index_srcdev].length;
6243                         tgtdev_stripe->dev = dev_replace->tgtdev;
6244                         bbio->tgtdev_map[index_srcdev] = num_stripes;
6245
6246                         tgtdev_indexes++;
6247                         num_stripes++;
6248                 }
6249         }
6250
6251         *num_stripes_ret = num_stripes;
6252         *max_errors_ret = max_errors;
6253         bbio->num_tgtdevs = tgtdev_indexes;
6254         *bbio_ret = bbio;
6255 }
6256
6257 static bool need_full_stripe(enum btrfs_map_op op)
6258 {
6259         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6260 }
6261
6262 /*
6263  * Calculate the geometry of a particular (address, len) tuple. This
6264  * information is used to calculate how big a particular bio can get before it
6265  * straddles a stripe.
6266  *
6267  * @fs_info: the filesystem
6268  * @em:      mapping containing the logical extent
6269  * @op:      type of operation - write or read
6270  * @logical: address that we want to figure out the geometry of
6271  * @io_geom: pointer used to return values
6272  *
6273  * Returns < 0 if a chunk for the given logical address cannot be found
6274  * (usually shouldn't happen unless @logical is corrupted), 0 otherwise.
6275  */
6276 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6277                           enum btrfs_map_op op, u64 logical,
6278                           struct btrfs_io_geometry *io_geom)
6279 {
6280         struct map_lookup *map;
6281         u64 len;
6282         u64 offset;
6283         u64 stripe_offset;
6284         u64 stripe_nr;
6285         u64 stripe_len;
6286         u64 raid56_full_stripe_start = (u64)-1;
6287         int data_stripes;
6288
6289         ASSERT(op != BTRFS_MAP_DISCARD);
6290
6291         map = em->map_lookup;
6292         /* Offset of this logical address in the chunk */
6293         offset = logical - em->start;
6294         /* Len of a stripe in a chunk */
6295         stripe_len = map->stripe_len;
6296         /* Stripe that this block falls in */
6297         stripe_nr = div64_u64(offset, stripe_len);
6298         /* Offset of stripe in the chunk */
6299         stripe_offset = stripe_nr * stripe_len;
6300         if (offset < stripe_offset) {
6301                 btrfs_crit(fs_info,
6302 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6303                         stripe_offset, offset, em->start, logical, stripe_len);
6304                 return -EINVAL;
6305         }
6306
6307         /* stripe_offset is the offset of this block in its stripe */
6308         stripe_offset = offset - stripe_offset;
6309         data_stripes = nr_data_stripes(map);
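        /*
         * Worked example (illustrative): with stripe_len == 64K and
         * offset == 200K into the chunk:
         *
         *	stripe_nr     = 200K / 64K     = 3
         *	stripe_offset = 200K - 3 * 64K = 8K
         *
         * i.e. the block lands 8K into the fourth stripe of the chunk.
         */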
6310
6311         if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6312                 u64 max_len = stripe_len - stripe_offset;
6313
6314                 /*
6315                  * In case of raid56, we need to know the stripe-aligned start
6316                  */
6317                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6318                         unsigned long full_stripe_len = stripe_len * data_stripes;
6319                         raid56_full_stripe_start = offset;
6320
6321                         /*
6322                          * Allow a write of a full stripe, but make sure we
6323                          * don't allow straddling of stripes
6324                          */
6325                         raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6326                                         full_stripe_len);
6327                         raid56_full_stripe_start *= full_stripe_len;
6328
6329                         /*
6330                  * For writes to RAID[56], allow a full stripe set across
6331                          * all disks. For other RAID types and for RAID[56]
6332                          * reads, just allow a single stripe (on a single disk).
6333                          */
6334                         if (op == BTRFS_MAP_WRITE) {
6335                                 max_len = stripe_len * data_stripes -
6336                                           (offset - raid56_full_stripe_start);
6337                         }
6338                 }
6339                 len = min_t(u64, em->len - offset, max_len);
6340         } else {
6341                 len = em->len - offset;
6342         }
6343
6344         io_geom->len = len;
6345         io_geom->offset = offset;
6346         io_geom->stripe_len = stripe_len;
6347         io_geom->stripe_nr = stripe_nr;
6348         io_geom->stripe_offset = stripe_offset;
6349         io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6350
6351         return 0;
6352 }
6353
6354 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6355                              enum btrfs_map_op op,
6356                              u64 logical, u64 *length,
6357                              struct btrfs_bio **bbio_ret,
6358                              int mirror_num, int need_raid_map)
6359 {
6360         struct extent_map *em;
6361         struct map_lookup *map;
6362         u64 stripe_offset;
6363         u64 stripe_nr;
6364         u64 stripe_len;
6365         u32 stripe_index;
6366         int data_stripes;
6367         int i;
6368         int ret = 0;
6369         int num_stripes;
6370         int max_errors = 0;
6371         int tgtdev_indexes = 0;
6372         struct btrfs_bio *bbio = NULL;
6373         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6374         int dev_replace_is_ongoing = 0;
6375         int num_alloc_stripes;
6376         int patch_the_first_stripe_for_dev_replace = 0;
6377         u64 physical_to_patch_in_first_stripe = 0;
6378         u64 raid56_full_stripe_start = (u64)-1;
6379         struct btrfs_io_geometry geom;
6380
6381         ASSERT(bbio_ret);
6382         ASSERT(op != BTRFS_MAP_DISCARD);
6383
6384         em = btrfs_get_chunk_map(fs_info, logical, *length);
6385         ASSERT(!IS_ERR(em));
6386
6387         ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
6388         if (ret < 0)
6389                 return ret;
6390
6391         map = em->map_lookup;
6392
6393         *length = geom.len;
6394         stripe_len = geom.stripe_len;
6395         stripe_nr = geom.stripe_nr;
6396         stripe_offset = geom.stripe_offset;
6397         raid56_full_stripe_start = geom.raid56_stripe_offset;
6398         data_stripes = nr_data_stripes(map);
6399
6400         down_read(&dev_replace->rwsem);
6401         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6402         /*
6403          * Hold the semaphore for read during the whole operation; write is
6404          * requested at commit time but must wait.
6405          */
6406         if (!dev_replace_is_ongoing)
6407                 up_read(&dev_replace->rwsem);
6408
6409         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6410             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6411                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6412                                                     dev_replace->srcdev->devid,
6413                                                     &mirror_num,
6414                                             &physical_to_patch_in_first_stripe);
6415                 if (ret)
6416                         goto out;
6417                 else
6418                         patch_the_first_stripe_for_dev_replace = 1;
6419         } else if (mirror_num > map->num_stripes) {
6420                 mirror_num = 0;
6421         }
6422
6423         num_stripes = 1;
6424         stripe_index = 0;
6425         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6426                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6427                                 &stripe_index);
6428                 if (!need_full_stripe(op))
6429                         mirror_num = 1;
6430         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6431                 if (need_full_stripe(op))
6432                         num_stripes = map->num_stripes;
6433                 else if (mirror_num)
6434                         stripe_index = mirror_num - 1;
6435                 else {
6436                         stripe_index = find_live_mirror(fs_info, map, 0,
6437                                             dev_replace_is_ongoing);
6438                         mirror_num = stripe_index + 1;
6439                 }
6440
6441         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6442                 if (need_full_stripe(op)) {
6443                         num_stripes = map->num_stripes;
6444                 } else if (mirror_num) {
6445                         stripe_index = mirror_num - 1;
6446                 } else {
6447                         mirror_num = 1;
6448                 }
6449
6450         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6451                 u32 factor = map->num_stripes / map->sub_stripes;
6452
6453                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6454                 stripe_index *= map->sub_stripes;
6455
6456                 if (need_full_stripe(op))
6457                         num_stripes = map->sub_stripes;
6458                 else if (mirror_num)
6459                         stripe_index += mirror_num - 1;
6460                 else {
6461                         int old_stripe_index = stripe_index;
6462                         stripe_index = find_live_mirror(fs_info, map,
6463                                               stripe_index,
6464                                               dev_replace_is_ongoing);
6465                         mirror_num = stripe_index - old_stripe_index + 1;
6466                 }
6467
6468         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6469                 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6470                         /* push stripe_nr back to the start of the full stripe */
6471                         stripe_nr = div64_u64(raid56_full_stripe_start,
6472                                         stripe_len * data_stripes);
6473
6474                         /* RAID[56] write or recovery. Return all stripes */
6475                         num_stripes = map->num_stripes;
6476                         max_errors = nr_parity_stripes(map);
6477
6478                         *length = map->stripe_len;
6479                         stripe_index = 0;
6480                         stripe_offset = 0;
6481                 } else {
6482                         /*
6483                          * Mirror #0 or #1 means the original data block.
6484                          * Mirror #2 is RAID5 parity block.
6485                          * Mirror #3 is RAID6 Q block.
6486                          */
6487                         stripe_nr = div_u64_rem(stripe_nr,
6488                                         data_stripes, &stripe_index);
6489                         if (mirror_num > 1)
6490                                 stripe_index = data_stripes + mirror_num - 2;
6491
6492                         /* We distribute the parity blocks across stripes */
6493                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6494                                         &stripe_index);
6495                         if (!need_full_stripe(op) && mirror_num <= 1)
6496                                 mirror_num = 1;
6497                 }
6498         } else {
6499                 /*
6500                  * After this, stripe_nr is the number of stripes on this
6501                  * device we have to walk to find the data, and stripe_index is
6502                  * the index of our device in the stripe array.
6503                  */
6504                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6505                                 &stripe_index);
6506                 mirror_num = stripe_index + 1;
6507         }
6508         if (stripe_index >= map->num_stripes) {
6509                 btrfs_crit(fs_info,
6510                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6511                            stripe_index, map->num_stripes);
6512                 ret = -EINVAL;
6513                 goto out;
6514         }
6515
6516         num_alloc_stripes = num_stripes;
6517         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6518                 if (op == BTRFS_MAP_WRITE)
6519                         num_alloc_stripes <<= 1;
6520                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6521                         num_alloc_stripes++;
6522                 tgtdev_indexes = num_stripes;
6523         }
6524
6525         bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6526         if (!bbio) {
6527                 ret = -ENOMEM;
6528                 goto out;
6529         }
6530
6531         for (i = 0; i < num_stripes; i++) {
6532                 bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6533                         stripe_offset + stripe_nr * map->stripe_len;
6534                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6535                 stripe_index++;
6536         }
6537
6538         /* build raid_map */
6539         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6540             (need_full_stripe(op) || mirror_num > 1)) {
6541                 u64 tmp;
6542                 unsigned rot;
6543
6544                 /* Work out the disk rotation on this stripe-set */
6545                 div_u64_rem(stripe_nr, num_stripes, &rot);
6546
6547                 /* Fill in the logical address of each stripe */
6548                 tmp = stripe_nr * data_stripes;
6549                 for (i = 0; i < data_stripes; i++)
6550                         bbio->raid_map[(i+rot) % num_stripes] =
6551                                 em->start + (tmp + i) * map->stripe_len;
6552
6553                 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6554                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6555                         bbio->raid_map[(i+rot+1) % num_stripes] =
6556                                 RAID6_Q_STRIPE;
6557
6558                 sort_parity_stripes(bbio, num_stripes);
6559         }
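        /*
         * Illustrative raid_map example: a 3-device RAID5 chunk (2 data
         * stripes) with stripe_nr == 1 gives rot == 1 and tmp == 2, so:
         *
         *	raid_map[1] = em->start + 2 * stripe_len   (data)
         *	raid_map[2] = em->start + 3 * stripe_len   (data)
         *	raid_map[0] = RAID5_P_STRIPE               (parity)
         *
         * and sort_parity_stripes() then moves the parity entry last.
         */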
6560
6561         if (need_full_stripe(op))
6562                 max_errors = btrfs_chunk_max_errors(map);
6563
6564         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6565             need_full_stripe(op)) {
6566                 handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
6567                                           &num_stripes, &max_errors);
6568         }
6569
6570         *bbio_ret = bbio;
6571         bbio->map_type = map->type;
6572         bbio->num_stripes = num_stripes;
6573         bbio->max_errors = max_errors;
6574         bbio->mirror_num = mirror_num;
6575
6576         /*
6577          * This is the case that BTRFS_MAP_READ && dev_replace_is_ongoing &&
6578          * mirror_num == num_stripes + 1 && the dev_replace target drive is
6579          * available as a mirror.
6580          */
6581         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6582                 WARN_ON(num_stripes > 1);
6583                 bbio->stripes[0].dev = dev_replace->tgtdev;
6584                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6585                 bbio->mirror_num = map->num_stripes + 1;
6586         }
6587 out:
6588         if (dev_replace_is_ongoing) {
6589                 lockdep_assert_held(&dev_replace->rwsem);
6590                 /* Unlock and let waiting writers proceed */
6591                 up_read(&dev_replace->rwsem);
6592         }
6593         free_extent_map(em);
6594         return ret;
6595 }
6596
6597 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6598                       u64 logical, u64 *length,
6599                       struct btrfs_bio **bbio_ret, int mirror_num)
6600 {
6601         if (op == BTRFS_MAP_DISCARD)
6602                 return __btrfs_map_block_for_discard(fs_info, logical,
6603                                                      length, bbio_ret);
6604
6605         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6606                                  mirror_num, 0);
6607 }
6608
6609 /* For Scrub/replace */
6610 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6611                      u64 logical, u64 *length,
6612                      struct btrfs_bio **bbio_ret)
6613 {
6614         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6615 }
6616
6617 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6618 {
6619         bio->bi_private = bbio->private;
6620         bio->bi_end_io = bbio->end_io;
6621         bio_endio(bio);
6622
6623         btrfs_put_bbio(bbio);
6624 }
6625
6626 static void btrfs_end_bio(struct bio *bio)
6627 {
6628         struct btrfs_bio *bbio = bio->bi_private;
6629         int is_orig_bio = 0;
6630
6631         if (bio->bi_status) {
6632                 atomic_inc(&bbio->error);
6633                 if (bio->bi_status == BLK_STS_IOERR ||
6634                     bio->bi_status == BLK_STS_TARGET) {
6635                         struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6636
6637                         ASSERT(dev->bdev);
6638                         if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6639                                 btrfs_dev_stat_inc_and_print(dev,
6640                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6641                         else if (!(bio->bi_opf & REQ_RAHEAD))
6642                                 btrfs_dev_stat_inc_and_print(dev,
6643                                                 BTRFS_DEV_STAT_READ_ERRS);
6644                         if (bio->bi_opf & REQ_PREFLUSH)
6645                                 btrfs_dev_stat_inc_and_print(dev,
6646                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6647                 }
6648         }
6649
6650         if (bio == bbio->orig_bio)
6651                 is_orig_bio = 1;
6652
6653         btrfs_bio_counter_dec(bbio->fs_info);
6654
6655         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6656                 if (!is_orig_bio) {
6657                         bio_put(bio);
6658                         bio = bbio->orig_bio;
6659                 }
6660
6661                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6662                 /* Only send an error to the higher layers if it is
6663                  * beyond the tolerance of the btrfs bio.
6664                  */
6665                 if (atomic_read(&bbio->error) > bbio->max_errors) {
6666                         bio->bi_status = BLK_STS_IOERR;
6667                 } else {
6668                         /*
6669                          * This bio is actually up to date; we didn't
6670                          * go over the max number of errors.
6671                          */
6672                         bio->bi_status = BLK_STS_OK;
6673                 }
6674
6675                 btrfs_end_bbio(bbio, bio);
6676         } else if (!is_orig_bio) {
6677                 bio_put(bio);
6678         }
6679 }
6680
6681 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6682                               u64 physical, struct btrfs_device *dev)
6683 {
6684         struct btrfs_fs_info *fs_info = bbio->fs_info;
6685
6686         bio->bi_private = bbio;
6687         btrfs_io_bio(bio)->device = dev;
6688         bio->bi_end_io = btrfs_end_bio;
6689         bio->bi_iter.bi_sector = physical >> 9;
6690         /*
6691          * For zone append writing, bi_sector must point to the beginning
6692          * of the zone.
6693          */
6694         if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6695                 if (btrfs_dev_is_sequential(dev, physical)) {
6696                         u64 zone_start = round_down(physical, fs_info->zone_size);
6697
6698                         bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6699                 } else {
6700                         bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6701                         bio->bi_opf |= REQ_OP_WRITE;
6702                 }
6703         }
6704         btrfs_debug_in_rcu(fs_info,
6705         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6706                 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6707                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6708                 dev->devid, bio->bi_iter.bi_size);
6709         bio_set_dev(bio, dev->bdev);
6710
6711         btrfs_bio_counter_inc_noblocked(fs_info);
6712
6713         btrfsic_submit_bio(bio);
6714 }
6715
6716 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6717 {
6718         atomic_inc(&bbio->error);
6719         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6720                 /* Should be the original bio. */
6721                 WARN_ON(bio != bbio->orig_bio);
6722
6723                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6724                 bio->bi_iter.bi_sector = logical >> 9;
6725                 if (atomic_read(&bbio->error) > bbio->max_errors)
6726                         bio->bi_status = BLK_STS_IOERR;
6727                 else
6728                         bio->bi_status = BLK_STS_OK;
6729                 btrfs_end_bbio(bbio, bio);
6730         }
6731 }
6732
6733 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6734                            int mirror_num)
6735 {
6736         struct btrfs_device *dev;
6737         struct bio *first_bio = bio;
6738         u64 logical = bio->bi_iter.bi_sector << 9;
6739         u64 length = 0;
6740         u64 map_length;
6741         int ret;
6742         int dev_nr;
6743         int total_devs;
6744         struct btrfs_bio *bbio = NULL;
6745
6746         length = bio->bi_iter.bi_size;
6747         map_length = length;
6748
6749         btrfs_bio_counter_inc_blocked(fs_info);
6750         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6751                                 &map_length, &bbio, mirror_num, 1);
6752         if (ret) {
6753                 btrfs_bio_counter_dec(fs_info);
6754                 return errno_to_blk_status(ret);
6755         }
6756
6757         total_devs = bbio->num_stripes;
6758         bbio->orig_bio = first_bio;
6759         bbio->private = first_bio->bi_private;
6760         bbio->end_io = first_bio->bi_end_io;
6761         bbio->fs_info = fs_info;
6762         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6763
6764         if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6765             ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
6766                 /* In this case, map_length has been set to the length of
6767                  * a single stripe, not the whole write. */
6768                 if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
6769                         ret = raid56_parity_write(fs_info, bio, bbio,
6770                                                   map_length);
6771                 } else {
6772                         ret = raid56_parity_recover(fs_info, bio, bbio,
6773                                                     map_length, mirror_num, 1);
6774                 }
6775
6776                 btrfs_bio_counter_dec(fs_info);
6777                 return errno_to_blk_status(ret);
6778         }
6779
6780         if (map_length < length) {
6781                 btrfs_crit(fs_info,
6782                            "mapping failed logical %llu bio len %llu len %llu",
6783                            logical, length, map_length);
6784                 BUG();
6785         }
6786
6787         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6788                 dev = bbio->stripes[dev_nr].dev;
6789                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6790                                                    &dev->dev_state) ||
6791                     (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
6792                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6793                         bbio_error(bbio, first_bio, logical);
6794                         continue;
6795                 }
6796
6797                 if (dev_nr < total_devs - 1)
6798                         bio = btrfs_bio_clone(first_bio);
6799                 else
6800                         bio = first_bio;
6801
6802                 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6803         }
6804         btrfs_bio_counter_dec(fs_info);
6805         return BLK_STS_OK;
6806 }
6807
6808 /*
6809  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6810  * return NULL.
6811  *
6812  * If devid and uuid are both specified, the match must be exact, otherwise
6813  * only devid is used.
6814  */
6815 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6816                                        u64 devid, u8 *uuid, u8 *fsid)
6817 {
6818         struct btrfs_device *device;
6819         struct btrfs_fs_devices *seed_devs;
6820
6821         if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6822                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6823                         if (device->devid == devid &&
6824                             (!uuid || memcmp(device->uuid, uuid,
6825                                              BTRFS_UUID_SIZE) == 0))
6826                                 return device;
6827                 }
6828         }
6829
6830         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6831                 if (!fsid ||
6832                     !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6833                         list_for_each_entry(device, &seed_devs->devices,
6834                                             dev_list) {
6835                                 if (device->devid == devid &&
6836                                     (!uuid || memcmp(device->uuid, uuid,
6837                                                      BTRFS_UUID_SIZE) == 0))
6838                                         return device;
6839                         }
6840                 }
6841         }
6842
6843         return NULL;
6844 }
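/*
 * Example lookup (illustrative): find devid 1 in the current filesystem,
 * matching any uuid and any fsid:
 *
 *	device = btrfs_find_device(fs_info->fs_devices, 1, NULL, NULL);
 */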
6845
6846 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6847                                             u64 devid, u8 *dev_uuid)
6848 {
6849         struct btrfs_device *device;
6850         unsigned int nofs_flag;
6851
6852         /*
6853          * We call this under the chunk_mutex, so we want to use NOFS for this
6854          * allocation, however we don't want to change btrfs_alloc_device() to
6855          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6856          * places.
6857          */
6858         nofs_flag = memalloc_nofs_save();
6859         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6860         memalloc_nofs_restore(nofs_flag);
6861         if (IS_ERR(device))
6862                 return device;
6863
6864         list_add(&device->dev_list, &fs_devices->devices);
6865         device->fs_devices = fs_devices;
6866         fs_devices->num_devices++;
6867
6868         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6869         fs_devices->missing_devices++;
6870
6871         return device;
6872 }
6873
6874 /**
6875  * btrfs_alloc_device - allocate struct btrfs_device
6876  * @fs_info:    used only for generating a new devid, can be NULL if
6877  *              devid is provided (i.e. @devid != NULL).
6878  * @devid:      a pointer to devid for this device.  If NULL a new devid
6879  *              is generated.
6880  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6881  *              is generated.
6882  *
6883  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6884  * on error.  Returned struct is not linked onto any lists and must be
6885  * destroyed with btrfs_free_device.
6886  */
6887 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6888                                         const u64 *devid,
6889                                         const u8 *uuid)
6890 {
6891         struct btrfs_device *dev;
6892         u64 tmp;
6893
6894         if (WARN_ON(!devid && !fs_info))
6895                 return ERR_PTR(-EINVAL);
6896
6897         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6898         if (!dev)
6899                 return ERR_PTR(-ENOMEM);
6900
6901         /*
6902          * Preallocate a bio that's always going to be used for flushing device
6903          * barriers and whose lifespan matches that of the device.
6904          */
6905         dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
6906         if (!dev->flush_bio) {
6907                 kfree(dev);
6908                 return ERR_PTR(-ENOMEM);
6909         }
6910
6911         INIT_LIST_HEAD(&dev->dev_list);
6912         INIT_LIST_HEAD(&dev->dev_alloc_list);
6913         INIT_LIST_HEAD(&dev->post_commit_list);
6914
6915         atomic_set(&dev->reada_in_flight, 0);
6916         atomic_set(&dev->dev_stats_ccnt, 0);
6917         btrfs_device_data_ordered_init(dev);
6918         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
6919         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
6920         extent_io_tree_init(fs_info, &dev->alloc_state,
6921                             IO_TREE_DEVICE_ALLOC_STATE, NULL);
6922
6923         if (devid)
6924                 tmp = *devid;
6925         else {
6926                 int ret;
6927
6928                 ret = find_next_devid(fs_info, &tmp);
6929                 if (ret) {
6930                         btrfs_free_device(dev);
6931                         return ERR_PTR(ret);
6932                 }
6933         }
6934         dev->devid = tmp;
6935
6936         if (uuid)
6937                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6938         else
6939                 generate_random_uuid(dev->uuid);
6940
6941         return dev;
6942 }
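
/*
 * Illustrative only, a sketch of allocating a device with a caller-chosen
 * devid and a freshly generated UUID (the values here are hypothetical):
 *
 *        u64 devid = 1;
 *        struct btrfs_device *dev;
 *
 *        dev = btrfs_alloc_device(NULL, &devid, NULL);
 *        if (IS_ERR(dev))
 *                return PTR_ERR(dev);
 *        ...
 *        btrfs_free_device(dev);
 */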
6943
6944 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6945                                         u64 devid, u8 *uuid, bool error)
6946 {
6947         if (error)
6948                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6949                               devid, uuid);
6950         else
6951                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6952                               devid, uuid);
6953 }
6954
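/*
 * The stripe length is the chunk length divided by the number of data
 * stripes; mirror copies and parity do not add to the logical length.
 * E.g. a 2GiB RAID10 chunk with 4 stripes has 2 data stripes, so each
 * stripe (and thus each dev extent) is 1GiB long.
 */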
6955 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6956 {
6957         const int data_stripes = calc_data_stripes(type, num_stripes);
6958
6959         return div_u64(chunk_len, data_stripes);
6960 }
6961
6962 #if BITS_PER_LONG == 32
6963 /*
6964  * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6965  * can't be accessed on 32bit systems.
6966  *
6967  * This function does a mount time check to reject the fs if it already has
6968  * a metadata chunk beyond that limit.
6969  */
6970 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6971                                   u64 logical, u64 length, u64 type)
6972 {
6973         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6974                 return 0;
6975
6976         if (logical + length < MAX_LFS_FILESIZE)
6977                 return 0;
6978
6979         btrfs_err_32bit_limit(fs_info);
6980         return -EOVERFLOW;
6981 }
6982
6983 /*
6984  * Give an early warning for any metadata chunk reaching
6985  * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6986  * Although we can still access such metadata, it will no longer be
6987  * possible once the BTRFS_32BIT_MAX_FILE_SIZE limit is reached.
6988  */
6989 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6990                                   u64 logical, u64 length, u64 type)
6991 {
6992         if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6993                 return;
6994
6995         if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6996                 return;
6997
6998         btrfs_warn_32bit_limit(fs_info);
6999 }
7000 #endif
7001
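/*
 * Read one chunk item from @leaf and add the corresponding logical ->
 * physical mapping (an extent_map carrying a map_lookup) to
 * fs_info->mapping_tree.  Stripes on absent devices either fail the read
 * or get placeholder devices, depending on the DEGRADED mount option.
 */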
7002 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
7003                           struct btrfs_chunk *chunk)
7004 {
7005         struct btrfs_fs_info *fs_info = leaf->fs_info;
7006         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7007         struct map_lookup *map;
7008         struct extent_map *em;
7009         u64 logical;
7010         u64 length;
7011         u64 devid;
7012         u64 type;
7013         u8 uuid[BTRFS_UUID_SIZE];
7014         int num_stripes;
7015         int ret;
7016         int i;
7017
7018         logical = key->offset;
7019         length = btrfs_chunk_length(leaf, chunk);
7020         type = btrfs_chunk_type(leaf, chunk);
7021         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
7022
7023 #if BITS_PER_LONG == 32
7024         ret = check_32bit_meta_chunk(fs_info, logical, length, type);
7025         if (ret < 0)
7026                 return ret;
7027         warn_32bit_meta_chunk(fs_info, logical, length, type);
7028 #endif
7029
7030         /*
7031          * We only need to verify the chunk item if we're reading from the sys chunk
7032          * array, as chunk items in tree blocks are already verified by the tree-checker.
7033          */
7034         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
7035                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
7036                 if (ret)
7037                         return ret;
7038         }
7039
7040         read_lock(&map_tree->lock);
7041         em = lookup_extent_mapping(map_tree, logical, 1);
7042         read_unlock(&map_tree->lock);
7043
7044         /* already mapped? */
7045         if (em && em->start <= logical && em->start + em->len > logical) {
7046                 free_extent_map(em);
7047                 return 0;
7048         } else if (em) {
7049                 free_extent_map(em);
7050         }
7051
7052         em = alloc_extent_map();
7053         if (!em)
7054                 return -ENOMEM;
7055         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
7056         if (!map) {
7057                 free_extent_map(em);
7058                 return -ENOMEM;
7059         }
7060
7061         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
7062         em->map_lookup = map;
7063         em->start = logical;
7064         em->len = length;
7065         em->orig_start = 0;
7066         em->block_start = 0;
7067         em->block_len = em->len;
7068
7069         map->num_stripes = num_stripes;
7070         map->io_width = btrfs_chunk_io_width(leaf, chunk);
7071         map->io_align = btrfs_chunk_io_align(leaf, chunk);
7072         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
7073         map->type = type;
7074         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
7075         map->verified_stripes = 0;
7076         em->orig_block_len = calc_stripe_length(type, em->len,
7077                                                 map->num_stripes);
7078         for (i = 0; i < num_stripes; i++) {
7079                 map->stripes[i].physical =
7080                         btrfs_stripe_offset_nr(leaf, chunk, i);
7081                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
7082                 read_extent_buffer(leaf, uuid, (unsigned long)
7083                                    btrfs_stripe_dev_uuid_nr(chunk, i),
7084                                    BTRFS_UUID_SIZE);
7085                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
7086                                                         devid, uuid, NULL);
7087                 if (!map->stripes[i].dev &&
7088                     !btrfs_test_opt(fs_info, DEGRADED)) {
7089                         free_extent_map(em);
7090                         btrfs_report_missing_device(fs_info, devid, uuid, true);
7091                         return -ENOENT;
7092                 }
7093                 if (!map->stripes[i].dev) {
7094                         map->stripes[i].dev =
7095                                 add_missing_dev(fs_info->fs_devices, devid,
7096                                                 uuid);
7097                         if (IS_ERR(map->stripes[i].dev)) {
7098                                 free_extent_map(em);
7099                                 btrfs_err(fs_info,
7100                                         "failed to init missing dev %llu: %ld",
7101                                         devid, PTR_ERR(map->stripes[i].dev));
7102                                 return PTR_ERR(map->stripes[i].dev);
7103                         }
7104                         btrfs_report_missing_device(fs_info, devid, uuid, false);
7105                 }
7106                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
7107                                 &(map->stripes[i].dev->dev_state));
7108
7109         }
7110
7111         write_lock(&map_tree->lock);
7112         ret = add_extent_mapping(map_tree, em, 0);
7113         write_unlock(&map_tree->lock);
7114         if (ret < 0) {
7115                 btrfs_err(fs_info,
7116                           "failed to add chunk map, start=%llu len=%llu: %d",
7117                           em->start, em->len, ret);
7118         }
7119         free_extent_map(em);
7120
7121         return ret;
7122 }
7123
7124 static void fill_device_from_item(struct extent_buffer *leaf,
7125                                  struct btrfs_dev_item *dev_item,
7126                                  struct btrfs_device *device)
7127 {
7128         unsigned long ptr;
7129
7130         device->devid = btrfs_device_id(leaf, dev_item);
7131         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
7132         device->total_bytes = device->disk_total_bytes;
7133         device->commit_total_bytes = device->disk_total_bytes;
7134         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
7135         device->commit_bytes_used = device->bytes_used;
7136         device->type = btrfs_device_type(leaf, dev_item);
7137         device->io_align = btrfs_device_io_align(leaf, dev_item);
7138         device->io_width = btrfs_device_io_width(leaf, dev_item);
7139         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
7140         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
7141         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
7142
7143         ptr = btrfs_device_uuid(dev_item);
7144         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
7145 }
7146
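/*
 * Find the fs_devices of the seed filesystem identified by @fsid, cloning
 * and opening it on first use and anchoring it on
 * fs_info->fs_devices->seed_list.  With the DEGRADED mount option an empty
 * stand-in is created if the seed devices are not registered.
 */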
7147 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
7148                                                   u8 *fsid)
7149 {
7150         struct btrfs_fs_devices *fs_devices;
7151         int ret;
7152
7153         lockdep_assert_held(&uuid_mutex);
7154         ASSERT(fsid);
7155
7156         /* This will match only for multi-device seed fs */
7157         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
7158                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
7159                         return fs_devices;
7160
7161
7162         fs_devices = find_fsid(fsid, NULL);
7163         if (!fs_devices) {
7164                 if (!btrfs_test_opt(fs_info, DEGRADED))
7165                         return ERR_PTR(-ENOENT);
7166
7167                 fs_devices = alloc_fs_devices(fsid, NULL);
7168                 if (IS_ERR(fs_devices))
7169                         return fs_devices;
7170
7171                 fs_devices->seeding = true;
7172                 fs_devices->opened = 1;
7173                 return fs_devices;
7174         }
7175
7176         /*
7177          * Upon first call for a seed fs fsid, just create a private copy of the
7178          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list.
7179          */
7180         fs_devices = clone_fs_devices(fs_devices);
7181         if (IS_ERR(fs_devices))
7182                 return fs_devices;
7183
7184         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
7185         if (ret) {
7186                 free_fs_devices(fs_devices);
7187                 return ERR_PTR(ret);
7188         }
7189
7190         if (!fs_devices->seeding) {
7191                 close_fs_devices(fs_devices);
7192                 free_fs_devices(fs_devices);
7193                 return ERR_PTR(-EINVAL);
7194         }
7195
7196         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
7197
7198         return fs_devices;
7199 }
7200
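/*
 * Read one device item and bind it to the matching in-memory btrfs_device,
 * creating a placeholder for a missing device when mounting in DEGRADED
 * mode and validating the on-disk sizes against the backing block device.
 */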
7201 static int read_one_dev(struct extent_buffer *leaf,
7202                         struct btrfs_dev_item *dev_item)
7203 {
7204         struct btrfs_fs_info *fs_info = leaf->fs_info;
7205         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7206         struct btrfs_device *device;
7207         u64 devid;
7209         u8 fs_uuid[BTRFS_FSID_SIZE];
7210         u8 dev_uuid[BTRFS_UUID_SIZE];
7211
7212         devid = btrfs_device_id(leaf, dev_item);
7213         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
7214                            BTRFS_UUID_SIZE);
7215         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
7216                            BTRFS_FSID_SIZE);
7217
7218         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
7219                 fs_devices = open_seed_devices(fs_info, fs_uuid);
7220                 if (IS_ERR(fs_devices))
7221                         return PTR_ERR(fs_devices);
7222         }
7223
7224         device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
7225                                    fs_uuid);
7226         if (!device) {
7227                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
7228                         btrfs_report_missing_device(fs_info, devid,
7229                                                         dev_uuid, true);
7230                         return -ENOENT;
7231                 }
7232
7233                 device = add_missing_dev(fs_devices, devid, dev_uuid);
7234                 if (IS_ERR(device)) {
7235                         btrfs_err(fs_info,
7236                                 "failed to add missing dev %llu: %ld",
7237                                 devid, PTR_ERR(device));
7238                         return PTR_ERR(device);
7239                 }
7240                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
7241         } else {
7242                 if (!device->bdev) {
7243                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
7244                                 btrfs_report_missing_device(fs_info,
7245                                                 devid, dev_uuid, true);
7246                                 return -ENOENT;
7247                         }
7248                         btrfs_report_missing_device(fs_info, devid,
7249                                                         dev_uuid, false);
7250                 }
7251
7252                 if (!device->bdev &&
7253                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
7254                         /*
7255                          * This happens when a device that was properly set up
7256                          * in the device info lists suddenly goes bad.
7257                          * device->bdev is NULL, so we have to set the
7258                          * BTRFS_DEV_STATE_MISSING bit here.
7259                          */
7260                         device->fs_devices->missing_devices++;
7261                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7262                 }
7263
7264                 /* Move the device to its own fs_devices */
7265                 if (device->fs_devices != fs_devices) {
7266                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7267                                                         &device->dev_state));
7268
7269                         list_move(&device->dev_list, &fs_devices->devices);
7270                         device->fs_devices->num_devices--;
7271                         fs_devices->num_devices++;
7272
7273                         device->fs_devices->missing_devices--;
7274                         fs_devices->missing_devices++;
7275
7276                         device->fs_devices = fs_devices;
7277                 }
7278         }
7279
7280         if (device->fs_devices != fs_info->fs_devices) {
7281                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7282                 if (device->generation !=
7283                     btrfs_device_generation(leaf, dev_item))
7284                         return -EINVAL;
7285         }
7286
7287         fill_device_from_item(leaf, dev_item, device);
7288         if (device->bdev) {
7289                 u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
7290
7291                 if (device->total_bytes > max_total_bytes) {
7292                         btrfs_err(fs_info,
7293                         "device total_bytes should be at most %llu but found %llu",
7294                                   max_total_bytes, device->total_bytes);
7295                         return -EINVAL;
7296                 }
7297         }
7298         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7299         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7300            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7301                 device->fs_devices->total_rw_bytes += device->total_bytes;
7302                 atomic64_add(device->total_bytes - device->bytes_used,
7303                                 &fs_info->free_chunk_space);
7304         }
7305         return 0;
7307 }
7308
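/*
 * Parse the superblock's sys_chunk_array and populate the mapping tree
 * with all SYSTEM chunks.  These mappings are required before the chunk
 * tree itself can be read.
 */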
7309 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7310 {
7311         struct btrfs_root *root = fs_info->tree_root;
7312         struct btrfs_super_block *super_copy = fs_info->super_copy;
7313         struct extent_buffer *sb;
7314         struct btrfs_disk_key *disk_key;
7315         struct btrfs_chunk *chunk;
7316         u8 *array_ptr;
7317         unsigned long sb_array_offset;
7318         int ret = 0;
7319         u32 num_stripes;
7320         u32 array_size;
7321         u32 len = 0;
7322         u32 cur_offset;
7323         u64 type;
7324         struct btrfs_key key;
7325
7326         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7327         /*
7328          * This will create an extent buffer of nodesize; the superblock size is
7329          * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
7330          * overallocate, but we can keep it as-is since only the first page is used.
7331          */
7332         sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
7333                                           root->root_key.objectid, 0);
7334         if (IS_ERR(sb))
7335                 return PTR_ERR(sb);
7336         set_extent_buffer_uptodate(sb);
7337         /*
7338          * The sb extent buffer is artificial and just used to read the system array.
7339          * The set_extent_buffer_uptodate() call does not properly mark all its
7340          * pages up-to-date when the page is larger: the extent does not cover the
7341          * whole page and consequently check_page_uptodate does not find all
7342          * the page's extents up-to-date (the hole beyond sb), so
7343          * write_extent_buffer then triggers a WARN_ON.
7344          *
7345          * Regular short extents go through the mark_extent_buffer_dirty/writeback
7346          * cycle, but sb spans only this function. Add an explicit SetPageUptodate
7347          * call to silence the warning, e.g. on PowerPC 64.
7348          */
7349         if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
7350                 SetPageUptodate(sb->pages[0]);
7351
7352         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7353         array_size = btrfs_super_sys_array_size(super_copy);
7354
7355         array_ptr = super_copy->sys_chunk_array;
7356         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7357         cur_offset = 0;
7358
7359         while (cur_offset < array_size) {
7360                 disk_key = (struct btrfs_disk_key *)array_ptr;
7361                 len = sizeof(*disk_key);
7362                 if (cur_offset + len > array_size)
7363                         goto out_short_read;
7364
7365                 btrfs_disk_key_to_cpu(&key, disk_key);
7366
7367                 array_ptr += len;
7368                 sb_array_offset += len;
7369                 cur_offset += len;
7370
7371                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7372                         btrfs_err(fs_info,
7373                             "unexpected item type %u in sys_array at offset %u",
7374                                   (u32)key.type, cur_offset);
7375                         ret = -EIO;
7376                         break;
7377                 }
7378
7379                 chunk = (struct btrfs_chunk *)sb_array_offset;
7380                 /*
7381                  * At least one btrfs_chunk with one stripe must be present,
7382                  * exact stripe count check comes afterwards
7383                  */
7384                 len = btrfs_chunk_item_size(1);
7385                 if (cur_offset + len > array_size)
7386                         goto out_short_read;
7387
7388                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7389                 if (!num_stripes) {
7390                         btrfs_err(fs_info,
7391                         "invalid number of stripes %u in sys_array at offset %u",
7392                                   num_stripes, cur_offset);
7393                         ret = -EIO;
7394                         break;
7395                 }
7396
7397                 type = btrfs_chunk_type(sb, chunk);
7398                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7399                         btrfs_err(fs_info,
7400                         "invalid chunk type %llu in sys_array at offset %u",
7401                                   type, cur_offset);
7402                         ret = -EIO;
7403                         break;
7404                 }
7405
7406                 len = btrfs_chunk_item_size(num_stripes);
7407                 if (cur_offset + len > array_size)
7408                         goto out_short_read;
7409
7410                 ret = read_one_chunk(&key, sb, chunk);
7411                 if (ret)
7412                         break;
7413
7414                 array_ptr += len;
7415                 sb_array_offset += len;
7416                 cur_offset += len;
7417         }
7418         clear_extent_buffer_uptodate(sb);
7419         free_extent_buffer_stale(sb);
7420         return ret;
7421
7422 out_short_read:
7423         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7424                         len, cur_offset);
7425         clear_extent_buffer_uptodate(sb);
7426         free_extent_buffer_stale(sb);
7427         return -EIO;
7428 }
7429
7430 /*
7431  * Check if all chunks in the fs are OK for read-write degraded mount
7432  *
7433  * If the @failing_dev is specified, it's accounted as missing.
7434  *
7435  * Return true if all chunks meet the minimal RW mount requirements and
7436  * false if any chunk does not.
7437  */
7438 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7439                                         struct btrfs_device *failing_dev)
7440 {
7441         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7442         struct extent_map *em;
7443         u64 next_start = 0;
7444         bool ret = true;
7445
7446         read_lock(&map_tree->lock);
7447         em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7448         read_unlock(&map_tree->lock);
7449         /* No chunk at all? Return false anyway */
7450         if (!em) {
7451                 ret = false;
7452                 goto out;
7453         }
7454         while (em) {
7455                 struct map_lookup *map;
7456                 int missing = 0;
7457                 int max_tolerated;
7458                 int i;
7459
7460                 map = em->map_lookup;
7461                 max_tolerated =
7462                         btrfs_get_num_tolerated_disk_barrier_failures(
7463                                         map->type);
7464                 for (i = 0; i < map->num_stripes; i++) {
7465                         struct btrfs_device *dev = map->stripes[i].dev;
7466
7467                         if (!dev || !dev->bdev ||
7468                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7469                             dev->last_flush_error)
7470                                 missing++;
7471                         else if (failing_dev && failing_dev == dev)
7472                                 missing++;
7473                 }
7474                 if (missing > max_tolerated) {
7475                         if (!failing_dev)
7476                                 btrfs_warn(fs_info,
7477         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7478                                    em->start, missing, max_tolerated);
7479                         free_extent_map(em);
7480                         ret = false;
7481                         goto out;
7482                 }
7483                 next_start = extent_map_end(em);
7484                 free_extent_map(em);
7485
7486                 read_lock(&map_tree->lock);
7487                 em = lookup_extent_mapping(map_tree, next_start,
7488                                            (u64)(-1) - next_start);
7489                 read_unlock(&map_tree->lock);
7490         }
7491 out:
7492         return ret;
7493 }
7494
7495 static void readahead_tree_node_children(struct extent_buffer *node)
7496 {
7497         int i;
7498         const int nr_items = btrfs_header_nritems(node);
7499
7500         for (i = 0; i < nr_items; i++)
7501                 btrfs_readahead_node_child(node, i);
7502 }
7503
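/*
 * Read all device items and then all chunk items from the chunk tree at
 * mount time, building the in-memory device list and chunk mappings, and
 * cross-check the totals against the superblock.
 */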
7504 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7505 {
7506         struct btrfs_root *root = fs_info->chunk_root;
7507         struct btrfs_path *path;
7508         struct extent_buffer *leaf;
7509         struct btrfs_key key;
7510         struct btrfs_key found_key;
7511         int ret;
7512         int slot;
7513         u64 total_dev = 0;
7514         u64 last_ra_node = 0;
7515
7516         path = btrfs_alloc_path();
7517         if (!path)
7518                 return -ENOMEM;
7519
7520         /*
7521          * The uuid_mutex is needed only if we are mounting a sprout FS;
7522          * otherwise we can do without it.
7523          */
7524         mutex_lock(&uuid_mutex);
7525
7526         /*
7527          * It is possible for mount and umount to race in such a way that
7528          * we execute this code path, but open_fs_devices failed to clear
7529          * total_rw_bytes. We certainly want it cleared before reading the
7530          * device items, so clear it here.
7531          */
7532         fs_info->fs_devices->total_rw_bytes = 0;
7533
7534         /*
7535          * Lockdep complains about possible circular locking dependency between
7536          * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7537          * used for freeze protection of a fs (struct super_block.s_writers),
7538          * which we take when starting a transaction, and extent buffers of the
7539          * chunk tree if we call read_one_dev() while holding a lock on an
7540          * extent buffer of the chunk tree. Since we are mounting the filesystem
7541          * and at this point there can't be any concurrent task modifying the
7542          * chunk tree, to keep it simple, just skip locking on the chunk tree.
7543          */
7544         ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7545         path->skip_locking = 1;
7546
7547         /*
7548          * Read all device items, and then all the chunk items. All
7549          * device items are found before any chunk item (their object id
7550          * is smaller than the lowest possible object id for a chunk
7551          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7552          */
7553         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7554         key.offset = 0;
7555         key.type = 0;
7556         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7557         if (ret < 0)
7558                 goto error;
7559         while (1) {
7560                 struct extent_buffer *node;
7561
7562                 leaf = path->nodes[0];
7563                 slot = path->slots[0];
7564                 if (slot >= btrfs_header_nritems(leaf)) {
7565                         ret = btrfs_next_leaf(root, path);
7566                         if (ret == 0)
7567                                 continue;
7568                         if (ret < 0)
7569                                 goto error;
7570                         break;
7571                 }
7572                 node = path->nodes[1];
7573                 if (node) {
7574                         if (last_ra_node != node->start) {
7575                                 readahead_tree_node_children(node);
7576                                 last_ra_node = node->start;
7577                         }
7578                 }
7579                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7580                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7581                         struct btrfs_dev_item *dev_item;
7582                         dev_item = btrfs_item_ptr(leaf, slot,
7583                                                   struct btrfs_dev_item);
7584                         ret = read_one_dev(leaf, dev_item);
7585                         if (ret)
7586                                 goto error;
7587                         total_dev++;
7588                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7589                         struct btrfs_chunk *chunk;
7590
7591                         /*
7592                          * We are only called at mount time, so there is no need to
7593                          * take fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
7594                          * we always lock fs_info->chunk_mutex first, before
7595                          * acquiring any locks on the chunk tree. This is a
7596                          * requirement for chunk allocation, see the comment at the
7597                          * top of btrfs_chunk_alloc() for details.
7598                          */
7599                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7600                         ret = read_one_chunk(&found_key, leaf, chunk);
7601                         if (ret)
7602                                 goto error;
7603                 }
7604                 path->slots[0]++;
7605         }
7606
7607         /*
7608          * After loading chunk tree, we've got all device information,
7609          * do another round of validation checks.
7610          */
7611         if (total_dev != fs_info->fs_devices->total_devices) {
7612                 btrfs_warn(fs_info,
7613 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7614                           btrfs_super_num_devices(fs_info->super_copy),
7615                           total_dev);
7616                 fs_info->fs_devices->total_devices = total_dev;
7617                 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
7618         }
7619         if (btrfs_super_total_bytes(fs_info->super_copy) <
7620             fs_info->fs_devices->total_rw_bytes) {
7621                 btrfs_err(fs_info,
7622         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7623                           btrfs_super_total_bytes(fs_info->super_copy),
7624                           fs_info->fs_devices->total_rw_bytes);
7625                 ret = -EINVAL;
7626                 goto error;
7627         }
7628         ret = 0;
7629 error:
7630         mutex_unlock(&uuid_mutex);
7631
7632         btrfs_free_path(path);
7633         return ret;
7634 }
7635
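/*
 * Set the fs_info pointer on all known devices, including those belonging
 * to seed filesystems.
 */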
7636 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7637 {
7638         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7639         struct btrfs_device *device;
7640
7641         fs_devices->fs_info = fs_info;
7642
7643         mutex_lock(&fs_devices->device_list_mutex);
7644         list_for_each_entry(device, &fs_devices->devices, dev_list)
7645                 device->fs_info = fs_info;
7646
7647         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7648                 list_for_each_entry(device, &seed_devs->devices, dev_list)
7649                         device->fs_info = fs_info;
7650
7651                 seed_devs->fs_info = fs_info;
7652         }
7653         mutex_unlock(&fs_devices->device_list_mutex);
7654 }
7655
7656 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7657                                  const struct btrfs_dev_stats_item *ptr,
7658                                  int index)
7659 {
7660         u64 val;
7661
7662         read_extent_buffer(eb, &val,
7663                            offsetof(struct btrfs_dev_stats_item, values) +
7664                             ((unsigned long)ptr) + (index * sizeof(u64)),
7665                            sizeof(val));
7666         return val;
7667 }
7668
7669 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7670                                       struct btrfs_dev_stats_item *ptr,
7671                                       int index, u64 val)
7672 {
7673         write_extent_buffer(eb, &val,
7674                             offsetof(struct btrfs_dev_stats_item, values) +
7675                              ((unsigned long)ptr) + (index * sizeof(u64)),
7676                             sizeof(val));
7677 }
7678
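/*
 * Load the persistent dev_stats item of @device from the device tree, or
 * zero the in-memory counters if no such item exists yet.
 */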
7679 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7680                                        struct btrfs_path *path)
7681 {
7682         struct btrfs_dev_stats_item *ptr;
7683         struct extent_buffer *eb;
7684         struct btrfs_key key;
7685         int item_size;
7686         int i, ret, slot;
7687
7688         if (!device->fs_info->dev_root)
7689                 return 0;
7690
7691         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7692         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7693         key.offset = device->devid;
7694         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7695         if (ret) {
7696                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7697                         btrfs_dev_stat_set(device, i, 0);
7698                 device->dev_stats_valid = 1;
7699                 btrfs_release_path(path);
7700                 return ret < 0 ? ret : 0;
7701         }
7702         slot = path->slots[0];
7703         eb = path->nodes[0];
7704         item_size = btrfs_item_size_nr(eb, slot);
7705
7706         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7707
7708         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7709                 if (item_size >= (1 + i) * sizeof(__le64))
7710                         btrfs_dev_stat_set(device, i,
7711                                            btrfs_dev_stats_value(eb, ptr, i));
7712                 else
7713                         btrfs_dev_stat_set(device, i, 0);
7714         }
7715
7716         device->dev_stats_valid = 1;
7717         btrfs_dev_stat_print_on_load(device);
7718         btrfs_release_path(path);
7719
7720         return 0;
7721 }
7722
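/*
 * Load persistent statistics for every device, including seed devices, at
 * mount time.
 */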
7723 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7724 {
7725         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7726         struct btrfs_device *device;
7727         struct btrfs_path *path = NULL;
7728         int ret = 0;
7729
7730         path = btrfs_alloc_path();
7731         if (!path)
7732                 return -ENOMEM;
7733
7734         mutex_lock(&fs_devices->device_list_mutex);
7735         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7736                 ret = btrfs_device_init_dev_stats(device, path);
7737                 if (ret)
7738                         goto out;
7739         }
7740         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7741                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7742                         ret = btrfs_device_init_dev_stats(device, path);
7743                         if (ret)
7744                                 goto out;
7745                 }
7746         }
7747 out:
7748         mutex_unlock(&fs_devices->device_list_mutex);
7749
7750         btrfs_free_path(path);
7751         return ret;
7752 }
7753
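/*
 * Write the in-memory stat counters of @device into its dev_stats item,
 * first replacing an undersized item left behind by an older format if
 * necessary.
 */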
7754 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7755                                 struct btrfs_device *device)
7756 {
7757         struct btrfs_fs_info *fs_info = trans->fs_info;
7758         struct btrfs_root *dev_root = fs_info->dev_root;
7759         struct btrfs_path *path;
7760         struct btrfs_key key;
7761         struct extent_buffer *eb;
7762         struct btrfs_dev_stats_item *ptr;
7763         int ret;
7764         int i;
7765
7766         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7767         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7768         key.offset = device->devid;
7769
7770         path = btrfs_alloc_path();
7771         if (!path)
7772                 return -ENOMEM;
7773         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7774         if (ret < 0) {
7775                 btrfs_warn_in_rcu(fs_info,
7776                         "error %d while searching for dev_stats item for device %s",
7777                               ret, rcu_str_deref(device->name));
7778                 goto out;
7779         }
7780
7781         if (ret == 0 &&
7782             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7783                 /* need to delete old one and insert a new one */
7784                 ret = btrfs_del_item(trans, dev_root, path);
7785                 if (ret != 0) {
7786                         btrfs_warn_in_rcu(fs_info,
7787                                 "delete too small dev_stats item for device %s failed %d",
7788                                       rcu_str_deref(device->name), ret);
7789                         goto out;
7790                 }
7791                 ret = 1;
7792         }
7793
7794         if (ret == 1) {
7795                 /* need to insert a new item */
7796                 btrfs_release_path(path);
7797                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7798                                               &key, sizeof(*ptr));
7799                 if (ret < 0) {
7800                         btrfs_warn_in_rcu(fs_info,
7801                                 "insert dev_stats item for device %s failed %d",
7802                                 rcu_str_deref(device->name), ret);
7803                         goto out;
7804                 }
7805         }
7806
7807         eb = path->nodes[0];
7808         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7809         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7810                 btrfs_set_dev_stats_value(eb, ptr, i,
7811                                           btrfs_dev_stat_read(device, i));
7812         btrfs_mark_buffer_dirty(eb);
7813
7814 out:
7815         btrfs_free_path(path);
7816         return ret;
7817 }
7818
7819 /*
7820  * Called from commit_transaction(): writes all changed device stats to disk.
7821  */
7822 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7823 {
7824         struct btrfs_fs_info *fs_info = trans->fs_info;
7825         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7826         struct btrfs_device *device;
7827         int stats_cnt;
7828         int ret = 0;
7829
7830         mutex_lock(&fs_devices->device_list_mutex);
7831         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7832                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7833                 if (!device->dev_stats_valid || stats_cnt == 0)
7834                         continue;
7835
7837                 /*
7838                  * There is a LOAD-LOAD control dependency between the value of
7839                  * dev_stats_ccnt and updating the on-disk values which requires
7840                  * reading the in-memory counters. Such control dependencies
7841                  * require explicit read memory barriers.
7842                  *
7843                  * This memory barrier pairs with smp_mb__before_atomic in
7844                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7845                  * barrier implied by atomic_xchg in
7846                  * btrfs_dev_stat_read_and_reset().
7847                  */
7848                 smp_rmb();
7849
7850                 ret = update_dev_stat_item(trans, device);
7851                 if (!ret)
7852                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7853         }
7854         mutex_unlock(&fs_devices->device_list_mutex);
7855
7856         return ret;
7857 }
7858
7859 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7860 {
7861         btrfs_dev_stat_inc(dev, index);
7862         btrfs_dev_stat_print_on_error(dev);
7863 }
7864
7865 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7866 {
7867         if (!dev->dev_stats_valid)
7868                 return;
7869         btrfs_err_rl_in_rcu(dev->fs_info,
7870                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7871                            rcu_str_deref(dev->name),
7872                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7873                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7874                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7875                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7876                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7877 }
7878
7879 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7880 {
7881         int i;
7882
7883         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7884                 if (btrfs_dev_stat_read(dev, i) != 0)
7885                         break;
7886         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7887                 return; /* all values == 0, suppress message */
7888
7889         btrfs_info_in_rcu(dev->fs_info,
7890                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7891                rcu_str_deref(dev->name),
7892                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7893                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7894                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7895                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7896                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7897 }
7898
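/*
 * Handle the BTRFS_IOC_GET_DEV_STATS ioctl: copy, and optionally reset,
 * the stat counters of the device given by stats->devid.
 */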
7899 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7900                         struct btrfs_ioctl_get_dev_stats *stats)
7901 {
7902         struct btrfs_device *dev;
7903         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7904         int i;
7905
7906         mutex_lock(&fs_devices->device_list_mutex);
7907         dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
7908         mutex_unlock(&fs_devices->device_list_mutex);
7909
7910         if (!dev) {
7911                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7912                 return -ENODEV;
7913         } else if (!dev->dev_stats_valid) {
7914                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7915                 return -ENODEV;
7916         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7917                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7918                         if (stats->nr_items > i)
7919                                 stats->values[i] =
7920                                         btrfs_dev_stat_read_and_reset(dev, i);
7921                         else
7922                                 btrfs_dev_stat_set(dev, i, 0);
7923                 }
7924                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7925                            current->comm, task_pid_nr(current));
7926         } else {
7927                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7928                         if (stats->nr_items > i)
7929                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7930         }
7931         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7932                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7933         return 0;
7934 }
7935
7936 /*
7937  * Update the size and bytes used for each device where it changed.  This is
7938  * delayed since we would otherwise get errors while writing out the
7939  * superblocks.
7940  *
7941  * Must be invoked during transaction commit.
7942  */
7943 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7944 {
7945         struct btrfs_device *curr, *next;
7946
7947         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7948
7949         if (list_empty(&trans->dev_update_list))
7950                 return;
7951
7952         /*
7953          * We don't need the device_list_mutex here.  This list is owned by the
7954          * transaction and the transaction must complete before the device is
7955          * released.
7956          */
7957         mutex_lock(&trans->fs_info->chunk_mutex);
7958         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7959                                  post_commit_list) {
7960                 list_del_init(&curr->post_commit_list);
7961                 curr->commit_total_bytes = curr->disk_total_bytes;
7962                 curr->commit_bytes_used = curr->bytes_used;
7963         }
7964         mutex_unlock(&trans->fs_info->chunk_mutex);
7965 }
7966
7967 /*
7968  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7969  */
7970 int btrfs_bg_type_to_factor(u64 flags)
7971 {
7972         const int index = btrfs_bg_flags_to_raid_index(flags);
7973
7974         return btrfs_raid_array[index].ncopies;
7975 }
7976
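/*
 * Verify that a single dev extent matches a stripe of the chunk it points
 * to: same length, a corresponding stripe entry, and lying within the
 * device (and, if applicable, zone) boundaries.
 */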
7979 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7980                                  u64 chunk_offset, u64 devid,
7981                                  u64 physical_offset, u64 physical_len)
7982 {
7983         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7984         struct extent_map *em;
7985         struct map_lookup *map;
7986         struct btrfs_device *dev;
7987         u64 stripe_len;
7988         bool found = false;
7989         int ret = 0;
7990         int i;
7991
7992         read_lock(&em_tree->lock);
7993         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7994         read_unlock(&em_tree->lock);
7995
7996         if (!em) {
7997                 btrfs_err(fs_info,
7998 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7999                           physical_offset, devid);
8000                 ret = -EUCLEAN;
8001                 goto out;
8002         }
8003
8004         map = em->map_lookup;
8005         stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
8006         if (physical_len != stripe_len) {
8007                 btrfs_err(fs_info,
8008 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
8009                           physical_offset, devid, em->start, physical_len,
8010                           stripe_len);
8011                 ret = -EUCLEAN;
8012                 goto out;
8013         }
8014
8015         for (i = 0; i < map->num_stripes; i++) {
8016                 if (map->stripes[i].dev->devid == devid &&
8017                     map->stripes[i].physical == physical_offset) {
8018                         found = true;
8019                         if (map->verified_stripes >= map->num_stripes) {
8020                                 btrfs_err(fs_info,
8021                                 "too many dev extents for chunk %llu found",
8022                                           em->start);
8023                                 ret = -EUCLEAN;
8024                                 goto out;
8025                         }
8026                         map->verified_stripes++;
8027                         break;
8028                 }
8029         }
8030         if (!found) {
8031                 btrfs_err(fs_info,
8032         "dev extent physical offset %llu devid %llu has no corresponding chunk",
8033                         physical_offset, devid);
8034                 ret = -EUCLEAN;
8035         }
8036
8037         /* Make sure no dev extent is beyond device boundary */
8038         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
8039         if (!dev) {
8040                 btrfs_err(fs_info, "failed to find devid %llu", devid);
8041                 ret = -EUCLEAN;
8042                 goto out;
8043         }
8044
8045         if (physical_offset + physical_len > dev->disk_total_bytes) {
8046                 btrfs_err(fs_info,
8047 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
8048                           devid, physical_offset, physical_len,
8049                           dev->disk_total_bytes);
8050                 ret = -EUCLEAN;
8051                 goto out;
8052         }
8053
8054         if (dev->zone_info) {
8055                 u64 zone_size = dev->zone_info->zone_size;
8056
8057                 if (!IS_ALIGNED(physical_offset, zone_size) ||
8058                     !IS_ALIGNED(physical_len, zone_size)) {
8059                         btrfs_err(fs_info,
8060 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
8061                                   devid, physical_offset, physical_len);
8062                         ret = -EUCLEAN;
8063                         goto out;
8064                 }
8065         }
8066
8067 out:
8068         free_extent_map(em);
8069         return ret;
8070 }
8071
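/*
 * After scanning all dev extents, make sure every chunk has exactly as
 * many verified dev extents as it has stripes.
 */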
8072 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
8073 {
8074         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
8075         struct extent_map *em;
8076         struct rb_node *node;
8077         int ret = 0;
8078
8079         read_lock(&em_tree->lock);
8080         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
8081                 em = rb_entry(node, struct extent_map, rb_node);
8082                 if (em->map_lookup->num_stripes !=
8083                     em->map_lookup->verified_stripes) {
8084                         btrfs_err(fs_info,
8085                         "chunk %llu has missing dev extent, have %d expect %d",
8086                                   em->start, em->map_lookup->verified_stripes,
8087                                   em->map_lookup->num_stripes);
8088                         ret = -EUCLEAN;
8089                         goto out;
8090                 }
8091         }
8092 out:
8093         read_unlock(&em_tree->lock);
8094         return ret;
8095 }
8096
8097 /*
8098  * Ensure that all dev extents are mapped to correct chunk, otherwise
8099  * later chunk allocation/free would cause unexpected behavior.
8100  *
8101  * NOTE: This will iterate through the whole device tree, which should be
8102  * about the same size as the chunk tree.  This slightly increases mount time.
8103  */
8104 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
8105 {
8106         struct btrfs_path *path;
8107         struct btrfs_root *root = fs_info->dev_root;
8108         struct btrfs_key key;
8109         u64 prev_devid = 0;
8110         u64 prev_dev_ext_end = 0;
8111         int ret = 0;
8112
8113         /*
8114          * We don't have a dev_root because we mounted with ignorebadroots and
8115          * failed to load the root, so we want to skip the verification in this
8116          * case.
8117          *
8118          * However if the dev root is fine, but the tree itself is corrupted
8119          * we'd still fail to mount.  This verification is only to make sure
8120          * writes can happen safely, so instead just bypass this check
8121          * completely in the case of IGNOREBADROOTS.
8122          */
8123         if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
8124                 return 0;
8125
8126         key.objectid = 1;
8127         key.type = BTRFS_DEV_EXTENT_KEY;
8128         key.offset = 0;
8129
8130         path = btrfs_alloc_path();
8131         if (!path)
8132                 return -ENOMEM;
8133
8134         path->reada = READA_FORWARD;
8135         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8136         if (ret < 0)
8137                 goto out;
8138
8139         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
8140                 ret = btrfs_next_leaf(root, path);
8141                 if (ret < 0)
8142                         goto out;
8143                 /* No dev extents at all? Not good */
8144                 if (ret > 0) {
8145                         ret = -EUCLEAN;
8146                         goto out;
8147                 }
8148         }
8149         while (1) {
8150                 struct extent_buffer *leaf = path->nodes[0];
8151                 struct btrfs_dev_extent *dext;
8152                 int slot = path->slots[0];
8153                 u64 chunk_offset;
8154                 u64 physical_offset;
8155                 u64 physical_len;
8156                 u64 devid;
8157
8158                 btrfs_item_key_to_cpu(leaf, &key, slot);
8159                 if (key.type != BTRFS_DEV_EXTENT_KEY)
8160                         break;
8161                 devid = key.objectid;
8162                 physical_offset = key.offset;
8163
8164                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
8165                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
8166                 physical_len = btrfs_dev_extent_length(leaf, dext);
8167
8168                 /* Check if this dev extent overlaps with the previous one */
8169                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
8170                         btrfs_err(fs_info,
8171 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
8172                                   devid, physical_offset, prev_dev_ext_end);
8173                         ret = -EUCLEAN;
8174                         goto out;
8175                 }
8176
8177                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
8178                                             physical_offset, physical_len);
8179                 if (ret < 0)
8180                         goto out;
8181                 prev_devid = devid;
8182                 prev_dev_ext_end = physical_offset + physical_len;
8183
8184                 ret = btrfs_next_item(root, path);
8185                 if (ret < 0)
8186                         goto out;
8187                 if (ret > 0) {
8188                         ret = 0;
8189                         break;
8190                 }
8191         }
8192
8193         /* Ensure all chunks have corresponding dev extents */
8194         ret = verify_chunk_dev_extent_mapping(fs_info);
8195 out:
8196         btrfs_free_path(path);
8197         return ret;
8198 }
8199
8200 /*
8201  * Check whether the given block group or device is pinned by any inode being
8202  * used as a swapfile.
8203  */
8204 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
8205 {
8206         struct btrfs_swapfile_pin *sp;
8207         struct rb_node *node;
8208
8209         spin_lock(&fs_info->swapfile_pins_lock);
8210         node = fs_info->swapfile_pins.rb_node;
8211         while (node) {
8212                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8213                 if (ptr < sp->ptr)
8214                         node = node->rb_left;
8215                 else if (ptr > sp->ptr)
8216                         node = node->rb_right;
8217                 else
8218                         break;
8219         }
8220         spin_unlock(&fs_info->swapfile_pins_lock);
8221         return node != NULL;
8222 }
8223
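/*
 * Worker for btrfs_repair_one_zone(): relocate the block group that @data
 * points to under the balance exclusive op, so its contents get rewritten
 * to healthy zones.
 */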
8224 static int relocating_repair_kthread(void *data)
8225 {
8226         struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
8227         struct btrfs_fs_info *fs_info = cache->fs_info;
8228         u64 target;
8229         int ret = 0;
8230
8231         target = cache->start;
8232         btrfs_put_block_group(cache);
8233
8234         sb_start_write(fs_info->sb);
8235         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8236                 btrfs_info(fs_info,
8237                            "zoned: skip relocating block group %llu to repair: EBUSY",
8238                            target);
8239                 sb_end_write(fs_info->sb);
8240                 return -EBUSY;
8241         }
8242
8243         mutex_lock(&fs_info->reclaim_bgs_lock);
8244
8245         /* Ensure block group still exists */
8246         cache = btrfs_lookup_block_group(fs_info, target);
8247         if (!cache)
8248                 goto out;
8249
8250         if (!cache->relocating_repair)
8251                 goto out;
8252
8253         ret = btrfs_may_alloc_data_chunk(fs_info, target);
8254         if (ret < 0)
8255                 goto out;
8256
8257         btrfs_info(fs_info,
8258                    "zoned: relocating block group %llu to repair IO failure",
8259                    target);
8260         ret = btrfs_relocate_chunk(fs_info, target);
8261
8262 out:
8263         if (cache)
8264                 btrfs_put_block_group(cache);
8265         mutex_unlock(&fs_info->reclaim_bgs_lock);
8266         btrfs_exclop_finish(fs_info);
8267         sb_end_write(fs_info->sb);
8268
8269         return ret;
8270 }
8271
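/*
 * On a write failure in a zoned filesystem, kick off asynchronous
 * relocation of the block group containing @logical so the data is
 * rewritten elsewhere.
 */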
8272 int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8273 {
8274         struct btrfs_block_group *cache;
8275
8276         /* Do not attempt to repair in degraded state */
8277         if (btrfs_test_opt(fs_info, DEGRADED))
8278                 return 0;
8279
8280         cache = btrfs_lookup_block_group(fs_info, logical);
8281         if (!cache)
8282                 return 0;
8283
8284         spin_lock(&cache->lock);
8285         if (cache->relocating_repair) {
8286                 spin_unlock(&cache->lock);
8287                 btrfs_put_block_group(cache);
8288                 return 0;
8289         }
8290         cache->relocating_repair = 1;
8291         spin_unlock(&cache->lock);
8292
8293         kthread_run(relocating_repair_kthread, cache,
8294                     "btrfs-relocating-repair");
8295
8296         return 0;
8297 }