fs/btrfs/block-group.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "misc.h"
   4 #include "ctree.h"
   5 #include "block-group.h"
   6 #include "space-info.h"
   7 #include "disk-io.h"
   8 #include "free-space-cache.h"
   9 #include "free-space-tree.h"
  10 #include "volumes.h"
  11 #include "transaction.h"
  12 #include "ref-verify.h"
  13 #include "sysfs.h"
  14 #include "tree-log.h"
  15 #include "delalloc-space.h"
  16 #include "discard.h"
  17 #include "raid56.h"
  18 #include "zoned.h"
  19
  20 /*
  21  * Return target flags in extended format or 0 if restripe for this chunk_type
  22  * is not in progress
  23  *
  24  * Should be called with balance_lock held
  25  */
  26 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  27 {
  28         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  29         u64 target = 0;
  30
  31         if (!bctl)
  32                 return 0;
  33
  34         if (flags & BTRFS_BLOCK_GROUP_DATA &&
  35             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  36                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  37         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  38                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  39                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  40         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  41                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  42                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  43         }
  44
  45         return target;
  46 }
  47
  48 /*
  49  * @flags: available profiles in extended format (see ctree.h)
  50  *
  51  * Return reduced profile in chunk format.  If profile changing is in progress
  52  * (either running or paused) picks the target profile (if it's already
  53  * available), otherwise falls back to plain reducing.
  54  */
  55 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  56 {
  57         u64 num_devices = fs_info->fs_devices->rw_devices;
  58         u64 target;
  59         u64 raid_type;
  60         u64 allowed = 0;
  61
  62         /*
  63          * See if restripe for this chunk_type is in progress, if so try to
  64          * reduce to the target profile
  65          */
  66         spin_lock(&fs_info->balance_lock);
  67         target = get_restripe_target(fs_info, flags);
  68         if (target) {
  69                 spin_unlock(&fs_info->balance_lock);
  70                 return extended_to_chunk(target);
  71         }
  72         spin_unlock(&fs_info->balance_lock);
  73
  74         /* First, mask out the RAID levels which aren't possible */
  75         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  76                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
  77                         allowed |= btrfs_raid_array[raid_type].bg_flag;
  78         }
  79         allowed &= flags;
  80
  81         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
  82                 allowed = BTRFS_BLOCK_GROUP_RAID6;
  83         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
  84                 allowed = BTRFS_BLOCK_GROUP_RAID5;
  85         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
  86                 allowed = BTRFS_BLOCK_GROUP_RAID10;
  87         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
  88                 allowed = BTRFS_BLOCK_GROUP_RAID1;
  89         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
  90                 allowed = BTRFS_BLOCK_GROUP_RAID0;
  91
  92         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
  93
  94         return extended_to_chunk(flags | allowed);
  95 }
  96
  97 u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
  98 {
  99         unsigned seq;
 100         u64 flags;
 101
 102         do {
 103                 flags = orig_flags;
 104                 seq = read_seqbegin(&fs_info->profiles_lock);
 105
 106                 if (flags & BTRFS_BLOCK_GROUP_DATA)
 107                         flags |= fs_info->avail_data_alloc_bits;
 108                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 109                         flags |= fs_info->avail_system_alloc_bits;
 110                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
 111                         flags |= fs_info->avail_metadata_alloc_bits;
 112         } while (read_seqretry(&fs_info->profiles_lock, seq));
 113
 114         return btrfs_reduce_alloc_profile(fs_info, flags);
 115 }
 116
 117 void btrfs_get_block_group(struct btrfs_block_group *cache)
 118 {
 119         refcount_inc(&cache->refs);
 120 }
 121
 122 void btrfs_put_block_group(struct btrfs_block_group *cache)
 123 {
 124         if (refcount_dec_and_test(&cache->refs)) {
 125                 WARN_ON(cache->pinned > 0);
 126                 /*
 127                  * If there was a failure to cleanup a log tree, very likely due
 128                  * to an IO failure on a writeback attempt of one or more of its
 129                  * extent buffers, we could not do proper (and cheap) unaccounting
 130                  * of their reserved space, so don't warn on reserved > 0 in that
 131                  * case.
 132                  */
 133                 if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
 134                     !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
 135                         WARN_ON(cache->reserved > 0);
 136
 137                 /*
 138                  * A block_group shouldn't be on the discard_list anymore.
 139                  * Remove the block_group from the discard_list to prevent us
 140                  * from causing a panic due to NULL pointer dereference.
 141                  */
 142                 if (WARN_ON(!list_empty(&cache->discard_list)))
 143                         btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
 144                                                   cache);
 145
 146                 /*
 147                  * If not empty, someone is still holding mutex of
 148                  * full_stripe_lock, which can only be released by caller.
 149                  * And it will definitely cause use-after-free when caller
 150                  * tries to release full stripe lock.
 151                  *
 152                  * No better way to resolve, but only to warn.
 153                  */
 154                 WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 155                 kfree(cache->free_space_ctl);
 156                 kfree(cache);
 157         }
 158 }
 159
 160 /*
 161  * This adds the block group to the fs_info rb tree for the block group cache
 162  */
 163 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 164                                        struct btrfs_block_group *block_group)
 165 {
 166         struct rb_node **p;
 167         struct rb_node *parent = NULL;
 168         struct btrfs_block_group *cache;
 169
 170         ASSERT(block_group->length != 0);
 171
 172         spin_lock(&info->block_group_cache_lock);
 173         p = &info->block_group_cache_tree.rb_node;
 174
 175         while (*p) {
 176                 parent = *p;
 177                 cache = rb_entry(parent, struct btrfs_block_group, cache_node);
 178                 if (block_group->start < cache->start) {
 179                         p = &(*p)->rb_left;
 180                 } else if (block_group->start > cache->start) {
 181                         p = &(*p)->rb_right;
 182                 } else {
 183                         spin_unlock(&info->block_group_cache_lock);
 184                         return -EEXIST;
 185                 }
 186         }
 187
 188         rb_link_node(&block_group->cache_node, parent, p);
 189         rb_insert_color(&block_group->cache_node,
 190                         &info->block_group_cache_tree);
 191
 192         if (info->first_logical_byte > block_group->start)
 193                 info->first_logical_byte = block_group->start;
 194
 195         spin_unlock(&info->block_group_cache_lock);
 196
 197         return 0;
 198 }
 199
 200 /*
 201  * This will return the block group at or after bytenr if contains is 0, else
 202  * it will return the block group that contains the bytenr
 203  */
 204 static struct btrfs_block_group *block_group_cache_tree_search(
 205                 struct btrfs_fs_info *info, u64 bytenr, int contains)
 206 {
 207         struct btrfs_block_group *cache, *ret = NULL;
 208         struct rb_node *n;
 209         u64 end, start;
 210
 211         spin_lock(&info->block_group_cache_lock);
 212         n = info->block_group_cache_tree.rb_node;
 213
 214         while (n) {
 215                 cache = rb_entry(n, struct btrfs_block_group, cache_node);
 216                 end = cache->start + cache->length - 1;
 217                 start = cache->start;
 218
 219                 if (bytenr < start) {
 220                         if (!contains && (!ret || start < ret->start))
 221                                 ret = cache;
 222                         n = n->rb_left;
 223                 } else if (bytenr > start) {
 224                         if (contains && bytenr <= end) {
 225                                 ret = cache;
 226                                 break;
 227                         }
 228                         n = n->rb_right;
 229                 } else {
 230                         ret = cache;
 231                         break;
 232                 }
 233         }
 234         if (ret) {
 235                 btrfs_get_block_group(ret);
 236                 if (bytenr == 0 && info->first_logical_byte > ret->start)
 237                         info->first_logical_byte = ret->start;
 238         }
 239         spin_unlock(&info->block_group_cache_lock);
 240
 241         return ret;
 242 }
 243
 244 /*
 245  * Return the block group that starts at or after bytenr
 246  */
 247 struct btrfs_block_group *btrfs_lookup_first_block_group(
 248                 struct btrfs_fs_info *info, u64 bytenr)
 249 {
 250         return block_group_cache_tree_search(info, bytenr, 0);
 251 }
 252
 253 /*
 254  * Return the block group that contains the given bytenr
 255  */
 256 struct btrfs_block_group *btrfs_lookup_block_group(
 257                 struct btrfs_fs_info *info, u64 bytenr)
 258 {
 259         return block_group_cache_tree_search(info, bytenr, 1);
 260 }
 261
 262 struct btrfs_block_group *btrfs_next_block_group(
 263                 struct btrfs_block_group *cache)
 264 {
 265         struct btrfs_fs_info *fs_info = cache->fs_info;
 266         struct rb_node *node;
 267
 268         spin_lock(&fs_info->block_group_cache_lock);
 269
 270         /* If our block group was removed, we need a full search. */
 271         if (RB_EMPTY_NODE(&cache->cache_node)) {
 272                 const u64 next_bytenr = cache->start + cache->length;
 273
 274                 spin_unlock(&fs_info->block_group_cache_lock);
 275                 btrfs_put_block_group(cache);
 276                 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
 277         }
 278         node = rb_next(&cache->cache_node);
 279         btrfs_put_block_group(cache);
 280         if (node) {
 281                 cache = rb_entry(node, struct btrfs_block_group, cache_node);
 282                 btrfs_get_block_group(cache);
 283         } else
 284                 cache = NULL;
 285         spin_unlock(&fs_info->block_group_cache_lock);
 286         return cache;
 287 }
 288
 289 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 290 {
 291         struct btrfs_block_group *bg;
 292         bool ret = true;
 293
 294         bg = btrfs_lookup_block_group(fs_info, bytenr);
 295         if (!bg)
 296                 return false;
 297
 298         spin_lock(&bg->lock);
 299         if (bg->ro)
 300                 ret = false;
 301         else
 302                 atomic_inc(&bg->nocow_writers);
 303         spin_unlock(&bg->lock);
 304
 305         /* No put on block group, done by btrfs_dec_nocow_writers */
 306         if (!ret)
 307                 btrfs_put_block_group(bg);
 308
 309         return ret;
 310 }
 311
 312 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 313 {
 314         struct btrfs_block_group *bg;
 315
 316         bg = btrfs_lookup_block_group(fs_info, bytenr);
 317         ASSERT(bg);
 318         if (atomic_dec_and_test(&bg->nocow_writers))
 319                 wake_up_var(&bg->nocow_writers);
 320         /*
 321          * Once for our lookup and once for the lookup done by a previous call
 322          * to btrfs_inc_nocow_writers()
 323          */
 324         btrfs_put_block_group(bg);
 325         btrfs_put_block_group(bg);
 326 }
 327
 328 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
 329 {
 330         wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
 331 }
 332
 333 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 334                                         const u64 start)
 335 {
 336         struct btrfs_block_group *bg;
 337
 338         bg = btrfs_lookup_block_group(fs_info, start);
 339         ASSERT(bg);
 340         if (atomic_dec_and_test(&bg->reservations))
 341                 wake_up_var(&bg->reservations);
 342         btrfs_put_block_group(bg);
 343 }
 344
 345 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
 346 {
 347         struct btrfs_space_info *space_info = bg->space_info;
 348
 349         ASSERT(bg->ro);
 350
 351         if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
 352                 return;
 353
 354         /*
 355          * Our block group is read only but before we set it to read only,
 356          * some task might have had allocated an extent from it already, but it
 357          * has not yet created a respective ordered extent (and added it to a
 358          * root's list of ordered extents).
 359          * Therefore wait for any task currently allocating extents, since the
 360          * block group's reservations counter is incremented while a read lock
 361          * on the groups' semaphore is held and decremented after releasing
 362          * the read access on that semaphore and creating the ordered extent.
 363          */
 364         down_write(&space_info->groups_sem);
 365         up_write(&space_info->groups_sem);
 366
 367         wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
 368 }
 369
 370 struct btrfs_caching_control *btrfs_get_caching_control(
 371                 struct btrfs_block_group *cache)
 372 {
 373         struct btrfs_caching_control *ctl;
 374
 375         spin_lock(&cache->lock);
 376         if (!cache->caching_ctl) {
 377                 spin_unlock(&cache->lock);
 378                 return NULL;
 379         }
 380
 381         ctl = cache->caching_ctl;
 382         refcount_inc(&ctl->count);
 383         spin_unlock(&cache->lock);
 384         return ctl;
 385 }
 386
 387 void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
 388 {
 389         if (refcount_dec_and_test(&ctl->count))
 390                 kfree(ctl);
 391 }
 392
 393 /*
 394  * When we wait for progress in the block group caching, its because our
 395  * allocation attempt failed at least once.  So, we must sleep and let some
 396  * progress happen before we try again.
 397  *
 398  * This function will sleep at least once waiting for new free space to show
 399  * up, and then it will check the block group free space numbers for our min
 400  * num_bytes.  Another option is to have it go ahead and look in the rbtree for
 401  * a free extent of a given size, but this is a good start.
 402  *
 403  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 404  * any of the information in this block group.
 405  */
 406 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 407                                            u64 num_bytes)
 408 {
 409         struct btrfs_caching_control *caching_ctl;
 410
 411         caching_ctl = btrfs_get_caching_control(cache);
 412         if (!caching_ctl)
 413                 return;
 414
 415         wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
 416                    (cache->free_space_ctl->free_space >= num_bytes));
 417
 418         btrfs_put_caching_control(caching_ctl);
 419 }
 420
 421 static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
 422                                        struct btrfs_caching_control *caching_ctl)
 423 {
 424         wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
 425         return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
 426 }
 427
 428 static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
 429 {
 430         struct btrfs_caching_control *caching_ctl;
 431         int ret;
 432
 433         caching_ctl = btrfs_get_caching_control(cache);
 434         if (!caching_ctl)
 435                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 436         ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
 437         btrfs_put_caching_control(caching_ctl);
 438         return ret;
 439 }
 440
 441 #ifdef CONFIG_BTRFS_DEBUG
 442 static void fragment_free_space(struct btrfs_block_group *block_group)
 443 {
 444         struct btrfs_fs_info *fs_info = block_group->fs_info;
 445         u64 start = block_group->start;
 446         u64 len = block_group->length;
 447         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 448                 fs_info->nodesize : fs_info->sectorsize;
 449         u64 step = chunk << 1;
 450
 451         while (len > chunk) {
 452                 btrfs_remove_free_space(block_group, start, chunk);
 453                 start += step;
 454                 if (len < step)
 455                         len = 0;
 456                 else
 457                         len -= step;
 458         }
 459 }
 460 #endif
 461
 462 /*
 463  * This is only called by btrfs_cache_block_group, since we could have freed
 464  * extents we need to check the pinned_extents for any extents that can't be
 465  * used yet since their free space will be released as soon as the transaction
 466  * commits.
 467  */
 468 u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
 469 {
 470         struct btrfs_fs_info *info = block_group->fs_info;
 471         u64 extent_start, extent_end, size, total_added = 0;
 472         int ret;
 473
 474         while (start < end) {
 475                 ret = find_first_extent_bit(&info->excluded_extents, start,
 476                                             &extent_start, &extent_end,
 477                                             EXTENT_DIRTY | EXTENT_UPTODATE,
 478                                             NULL);
 479                 if (ret)
 480                         break;
 481
 482                 if (extent_start <= start) {
 483                         start = extent_end + 1;
 484                 } else if (extent_start > start && extent_start < end) {
 485                         size = extent_start - start;
 486                         total_added += size;
 487                         ret = btrfs_add_free_space_async_trimmed(block_group,
 488                                                                  start, size);
 489                         BUG_ON(ret); /* -ENOMEM or logic error */
 490                         start = extent_end + 1;
 491                 } else {
 492                         break;
 493                 }
 494         }
 495
 496         if (start < end) {
 497                 size = end - start;
 498                 total_added += size;
 499                 ret = btrfs_add_free_space_async_trimmed(block_group, start,
 500                                                          size);
 501                 BUG_ON(ret); /* -ENOMEM or logic error */
 502         }
 503
 504         return total_added;
 505 }
 506
 507 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 508 {
 509         struct btrfs_block_group *block_group = caching_ctl->block_group;
 510         struct btrfs_fs_info *fs_info = block_group->fs_info;
 511         struct btrfs_root *extent_root = fs_info->extent_root;
 512         struct btrfs_path *path;
 513         struct extent_buffer *leaf;
 514         struct btrfs_key key;
 515         u64 total_found = 0;
 516         u64 last = 0;
 517         u32 nritems;
 518         int ret;
 519         bool wakeup = true;
 520
 521         path = btrfs_alloc_path();
 522         if (!path)
 523                 return -ENOMEM;
 524
 525         last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
 526
 527 #ifdef CONFIG_BTRFS_DEBUG
 528         /*
 529          * If we're fragmenting we don't want to make anybody think we can
 530          * allocate from this block group until we've had a chance to fragment
 531          * the free space.
 532          */
 533         if (btrfs_should_fragment_free_space(block_group))
 534                 wakeup = false;
 535 #endif
 536         /*
 537          * We don't want to deadlock with somebody trying to allocate a new
 538          * extent for the extent root while also trying to search the extent
 539          * root to add free space.  So we skip locking and search the commit
 540          * root, since its read-only
 541          */
 542         path->skip_locking = 1;
 543         path->search_commit_root = 1;
 544         path->reada = READA_FORWARD;
 545
 546         key.objectid = last;
 547         key.offset = 0;
 548         key.type = BTRFS_EXTENT_ITEM_KEY;
 549
 550 next:
 551         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 552         if (ret < 0)
 553                 goto out;
 554
 555         leaf = path->nodes[0];
 556         nritems = btrfs_header_nritems(leaf);
 557
 558         while (1) {
 559                 if (btrfs_fs_closing(fs_info) > 1) {
 560                         last = (u64)-1;
 561                         break;
 562                 }
 563
 564                 if (path->slots[0] < nritems) {
 565                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 566                 } else {
 567                         ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
 568                         if (ret)
 569                                 break;
 570
 571                         if (need_resched() ||
 572                             rwsem_is_contended(&fs_info->commit_root_sem)) {
 573                                 if (wakeup)
 574                                         caching_ctl->progress = last;
 575                                 btrfs_release_path(path);
 576                                 up_read(&fs_info->commit_root_sem);
 577                                 mutex_unlock(&caching_ctl->mutex);
 578                                 cond_resched();
 579                                 mutex_lock(&caching_ctl->mutex);
 580                                 down_read(&fs_info->commit_root_sem);
 581                                 goto next;
 582                         }
 583
 584                         ret = btrfs_next_leaf(extent_root, path);
 585                         if (ret < 0)
 586                                 goto out;
 587                         if (ret)
 588                                 break;
 589                         leaf = path->nodes[0];
 590                         nritems = btrfs_header_nritems(leaf);
 591                         continue;
 592                 }
 593
 594                 if (key.objectid < last) {
 595                         key.objectid = last;
 596                         key.offset = 0;
 597                         key.type = BTRFS_EXTENT_ITEM_KEY;
 598
 599                         if (wakeup)
 600                                 caching_ctl->progress = last;
 601                         btrfs_release_path(path);
 602                         goto next;
 603                 }
 604
 605                 if (key.objectid < block_group->start) {
 606                         path->slots[0]++;
 607                         continue;
 608                 }
 609
 610                 if (key.objectid >= block_group->start + block_group->length)
 611                         break;
 612
 613                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 614                     key.type == BTRFS_METADATA_ITEM_KEY) {
 615                         total_found += add_new_free_space(block_group, last,
 616                                                           key.objectid);
 617                         if (key.type == BTRFS_METADATA_ITEM_KEY)
 618                                 last = key.objectid +
 619                                         fs_info->nodesize;
 620                         else
 621                                 last = key.objectid + key.offset;
 622
 623                         if (total_found > CACHING_CTL_WAKE_UP) {
 624                                 total_found = 0;
 625                                 if (wakeup)
 626                                         wake_up(&caching_ctl->wait);
 627                         }
 628                 }
 629                 path->slots[0]++;
 630         }
 631         ret = 0;
 632
 633         total_found += add_new_free_space(block_group, last,
 634                                 block_group->start + block_group->length);
 635         caching_ctl->progress = (u64)-1;
 636
 637 out:
 638         btrfs_free_path(path);
 639         return ret;
 640 }
 641
 642 static noinline void caching_thread(struct btrfs_work *work)
 643 {
 644         struct btrfs_block_group *block_group;
 645         struct btrfs_fs_info *fs_info;
 646         struct btrfs_caching_control *caching_ctl;
 647         int ret;
 648
 649         caching_ctl = container_of(work, struct btrfs_caching_control, work);
 650         block_group = caching_ctl->block_group;
 651         fs_info = block_group->fs_info;
 652
 653         mutex_lock(&caching_ctl->mutex);
 654         down_read(&fs_info->commit_root_sem);
 655
 656         if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 657                 ret = load_free_space_cache(block_group);
 658                 if (ret == 1) {
 659                         ret = 0;
 660                         goto done;
 661                 }
 662
 663                 /*
 664                  * We failed to load the space cache, set ourselves to
 665                  * CACHE_STARTED and carry on.
 666                  */
 667                 spin_lock(&block_group->lock);
 668                 block_group->cached = BTRFS_CACHE_STARTED;
 669                 spin_unlock(&block_group->lock);
 670                 wake_up(&caching_ctl->wait);
 671         }
 672
 673         /*
 674          * If we are in the transaction that populated the free space tree we
 675          * can't actually cache from the free space tree as our commit root and
 676          * real root are the same, so we could change the contents of the blocks
 677          * while caching.  Instead do the slow caching in this case, and after
 678          * the transaction has committed we will be safe.
 679          */
 680         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
 681             !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
 682                 ret = load_free_space_tree(caching_ctl);
 683         else
 684                 ret = load_extent_tree_free(caching_ctl);
 685 done:
 686         spin_lock(&block_group->lock);
 687         block_group->caching_ctl = NULL;
 688         block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 689         spin_unlock(&block_group->lock);
 690
 691 #ifdef CONFIG_BTRFS_DEBUG
 692         if (btrfs_should_fragment_free_space(block_group)) {
 693                 u64 bytes_used;
 694
 695                 spin_lock(&block_group->space_info->lock);
 696                 spin_lock(&block_group->lock);
 697                 bytes_used = block_group->length - block_group->used;
 698                 block_group->space_info->bytes_used += bytes_used >> 1;
 699                 spin_unlock(&block_group->lock);
 700                 spin_unlock(&block_group->space_info->lock);
 701                 fragment_free_space(block_group);
 702         }
 703 #endif
 704
 705         caching_ctl->progress = (u64)-1;
 706
 707         up_read(&fs_info->commit_root_sem);
 708         btrfs_free_excluded_extents(block_group);
 709         mutex_unlock(&caching_ctl->mutex);
 710
 711         wake_up(&caching_ctl->wait);
 712
 713         btrfs_put_caching_control(caching_ctl);
 714         btrfs_put_block_group(block_group);
 715 }
 716
 717 int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
 718 {
 719         struct btrfs_fs_info *fs_info = cache->fs_info;
 720         struct btrfs_caching_control *caching_ctl = NULL;
 721         int ret = 0;
 722
 723         /* Allocator for zoned filesystems does not use the cache at all */
 724         if (btrfs_is_zoned(fs_info))
 725                 return 0;
 726
 727         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 728         if (!caching_ctl)
 729                 return -ENOMEM;
 730
 731         INIT_LIST_HEAD(&caching_ctl->list);
 732         mutex_init(&caching_ctl->mutex);
 733         init_waitqueue_head(&caching_ctl->wait);
 734         caching_ctl->block_group = cache;
 735         caching_ctl->progress = cache->start;
 736         refcount_set(&caching_ctl->count, 2);
 737         btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 738
 739         spin_lock(&cache->lock);
 740         if (cache->cached != BTRFS_CACHE_NO) {
 741                 kfree(caching_ctl);
 742
 743                 caching_ctl = cache->caching_ctl;
 744                 if (caching_ctl)
 745                         refcount_inc(&caching_ctl->count);
 746                 spin_unlock(&cache->lock);
 747                 goto out;
 748         }
 749         WARN_ON(cache->caching_ctl);
 750         cache->caching_ctl = caching_ctl;
 751         cache->cached = BTRFS_CACHE_STARTED;
 752         cache->has_caching_ctl = 1;
 753         spin_unlock(&cache->lock);
 754
 755         spin_lock(&fs_info->block_group_cache_lock);
 756         refcount_inc(&caching_ctl->count);
 757         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 758         spin_unlock(&fs_info->block_group_cache_lock);
 759
 760         btrfs_get_block_group(cache);
 761
 762         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 763 out:
 764         if (wait && caching_ctl)
 765                 ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
 766         if (caching_ctl)
 767                 btrfs_put_caching_control(caching_ctl);
 768
 769         return ret;
 770 }
 771
 772 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 773 {
 774         u64 extra_flags = chunk_to_extended(flags) &
 775                                 BTRFS_EXTENDED_PROFILE_MASK;
 776
 777         write_seqlock(&fs_info->profiles_lock);
 778         if (flags & BTRFS_BLOCK_GROUP_DATA)
 779                 fs_info->avail_data_alloc_bits &= ~extra_flags;
 780         if (flags & BTRFS_BLOCK_GROUP_METADATA)
 781                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
 782         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 783                 fs_info->avail_system_alloc_bits &= ~extra_flags;
 784         write_sequnlock(&fs_info->profiles_lock);
 785 }
 786
 787 /*
 788  * Clear incompat bits for the following feature(s):
 789  *
 790  * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 791  *            in the whole filesystem
 792  *
 793  * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 794  */
 795 static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
 796 {
 797         bool found_raid56 = false;
 798         bool found_raid1c34 = false;
 799
 800         if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
 801             (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
 802             (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
 803                 struct list_head *head = &fs_info->space_info;
 804                 struct btrfs_space_info *sinfo;
 805
 806                 list_for_each_entry_rcu(sinfo, head, list) {
 807                         down_read(&sinfo->groups_sem);
 808                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
 809                                 found_raid56 = true;
 810                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
 811                                 found_raid56 = true;
 812                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
 813                                 found_raid1c34 = true;
 814                         if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
 815                                 found_raid1c34 = true;
 816                         up_read(&sinfo->groups_sem);
 817                 }
 818                 if (!found_raid56)
 819                         btrfs_clear_fs_incompat(fs_info, RAID56);
 820                 if (!found_raid1c34)
 821                         btrfs_clear_fs_incompat(fs_info, RAID1C34);
 822         }
 823 }
 824
 825 static int remove_block_group_item(struct btrfs_trans_handle *trans,
 826                                    struct btrfs_path *path,
 827                                    struct btrfs_block_group *block_group)
 828 {
 829         struct btrfs_fs_info *fs_info = trans->fs_info;
 830         struct btrfs_root *root;
 831         struct btrfs_key key;
 832         int ret;
 833
 834         root = fs_info->extent_root;
 835         key.objectid = block_group->start;
 836         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 837         key.offset = block_group->length;
 838
 839         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 840         if (ret > 0)
 841                 ret = -ENOENT;
 842         if (ret < 0)
 843                 return ret;
 844
 845         ret = btrfs_del_item(trans, root, path);
 846         return ret;
 847 }
 848
/*
 * Remove the block group that starts at @group_start from the in-memory
 * structures and from the on-disk trees, as part of transaction @trans.
 *
 * The block group must exist and must already be read-only; both conditions
 * are enforced with BUG_ON().
 *
 * @trans:       transaction handle the removal runs under
 * @group_start: logical start offset used to look up the block group
 * @em:          extent map passed by the caller; it is removed from
 *               fs_info->mapping_tree (and one reference dropped) only if the
 *               block group is not frozen when it is marked as removed
 *
 * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the error
 * returned by btrfs_remove_free_space_inode(),
 * remove_block_group_free_space() or remove_block_group_item().
 */
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             u64 group_start, struct extent_map *em)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *path;
        struct btrfs_block_group *block_group;
        struct btrfs_free_cluster *cluster;
        struct inode *inode;
        struct kobject *kobj = NULL;
        int ret;
        int index;
        int factor;
        struct btrfs_caching_control *caching_ctl = NULL;
        bool remove_em;
        bool remove_rsv = false;

        block_group = btrfs_lookup_block_group(fs_info, group_start);
        BUG_ON(!block_group);
        BUG_ON(!block_group->ro);

        trace_btrfs_remove_block_group(block_group);
        /*
         * Free the reserved super bytes from this block group before
         * removing it.
         */
        btrfs_free_excluded_extents(block_group);
        btrfs_free_ref_tree_range(fs_info, block_group->start,
                                  block_group->length);

        index = btrfs_bg_flags_to_raid_index(block_group->flags);
        factor = btrfs_bg_type_to_factor(block_group->flags);

        /* Make sure this block group isn't part of a data allocation cluster */
        cluster = &fs_info->data_alloc_cluster;
        spin_lock(&cluster->refill_lock);
        btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&cluster->refill_lock);

        /*
         * Make sure this block group isn't part of a metadata
         * allocation cluster
         */
        cluster = &fs_info->meta_alloc_cluster;
        spin_lock(&cluster->refill_lock);
        btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&cluster->refill_lock);

        btrfs_clear_treelog_bg(block_group);
        btrfs_clear_data_reloc_bg(block_group);

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * Get the inode first so any iput calls done for the io_list
         * aren't the final iput (no unlinks allowed now)
         */
        inode = lookup_free_space_inode(block_group, path);

        mutex_lock(&trans->transaction->cache_write_mutex);
        /*
         * Make sure our free space cache IO is done before removing the
         * free space inode
         */
        spin_lock(&trans->transaction->dirty_bgs_lock);
        if (!list_empty(&block_group->io_list)) {
                list_del_init(&block_group->io_list);

                WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

                /* The wait is done with dirty_bgs_lock dropped. */
                spin_unlock(&trans->transaction->dirty_bgs_lock);
                btrfs_wait_cache_io(trans, block_group, path);
                btrfs_put_block_group(block_group);
                spin_lock(&trans->transaction->dirty_bgs_lock);
        }

        if (!list_empty(&block_group->dirty_list)) {
                list_del_init(&block_group->dirty_list);
                /* Remembered so one delayed refs rsv unit is released at out. */
                remove_rsv = true;
                btrfs_put_block_group(block_group);
        }
        spin_unlock(&trans->transaction->dirty_bgs_lock);
        mutex_unlock(&trans->transaction->cache_write_mutex);

        ret = btrfs_remove_free_space_inode(trans, inode, block_group);
        if (ret)
                goto out;

        spin_lock(&fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &fs_info->block_group_cache_tree);
        RB_CLEAR_NODE(&block_group->cache_node);

        /* Once for the block groups rbtree */
        btrfs_put_block_group(block_group);

        /* Invalidate the cached first logical byte if it pointed at us. */
        if (fs_info->first_logical_byte == block_group->start)
                fs_info->first_logical_byte = (u64)-1;
        spin_unlock(&fs_info->block_group_cache_lock);

        down_write(&block_group->space_info->groups_sem);
        /*
         * We must use list_del_init so people can check to see if they
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
        if (list_empty(&block_group->space_info->block_groups[index])) {
                /*
                 * That was the last block group on this raid index list:
                 * detach its kobject (released below, outside the semaphore)
                 * and clear the matching available allocation bits.
                 */
                kobj = block_group->space_info->block_group_kobjs[index];
                block_group->space_info->block_group_kobjs[index] = NULL;
                clear_avail_alloc_bits(fs_info, block_group->flags);
        }
        up_write(&block_group->space_info->groups_sem);
        clear_incompat_bg_bits(fs_info, block_group->flags);
        if (kobj) {
                kobject_del(kobj);
                kobject_put(kobj);
        }

        if (block_group->has_caching_ctl)
                caching_ctl = btrfs_get_caching_control(block_group);
        if (block_group->cached == BTRFS_CACHE_STARTED)
                btrfs_wait_block_group_cache_done(block_group);
        if (block_group->has_caching_ctl) {
                spin_lock(&fs_info->block_group_cache_lock);
                if (!caching_ctl) {
                        struct btrfs_caching_control *ctl;

                        /*
                         * No caching control was returned above, so search
                         * the caching_block_groups list for ours and take a
                         * reference on it.
                         */
                        list_for_each_entry(ctl,
                                    &fs_info->caching_block_groups, list)
                                if (ctl->block_group == block_group) {
                                        caching_ctl = ctl;
                                        refcount_inc(&caching_ctl->count);
                                        break;
                                }
                }
                if (caching_ctl)
                        list_del_init(&caching_ctl->list);
                spin_unlock(&fs_info->block_group_cache_lock);
                if (caching_ctl) {
                        /* Once for the caching bgs list and once for us. */
                        btrfs_put_caching_control(caching_ctl);
                        btrfs_put_caching_control(caching_ctl);
                }
        }

        /* Both lists were emptied above; complain if something re-added us. */
        spin_lock(&trans->transaction->dirty_bgs_lock);
        WARN_ON(!list_empty(&block_group->dirty_list));
        WARN_ON(!list_empty(&block_group->io_list));
        spin_unlock(&trans->transaction->dirty_bgs_lock);

        btrfs_remove_free_space_cache(block_group);

        spin_lock(&block_group->space_info->lock);
        list_del_init(&block_group->ro_list);

        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                /* Warn if any of the subtractions below would underflow. */
                WARN_ON(block_group->space_info->total_bytes
                        < block_group->length);
                WARN_ON(block_group->space_info->bytes_readonly
                        < block_group->length - block_group->zone_unusable);
                WARN_ON(block_group->space_info->bytes_zone_unusable
                        < block_group->zone_unusable);
                WARN_ON(block_group->space_info->disk_total
                        < block_group->length * factor);
        }
        block_group->space_info->total_bytes -= block_group->length;
        block_group->space_info->bytes_readonly -=
                (block_group->length - block_group->zone_unusable);
        block_group->space_info->bytes_zone_unusable -=
                block_group->zone_unusable;
        block_group->space_info->disk_total -= block_group->length * factor;

        spin_unlock(&block_group->space_info->lock);

        /*
         * Remove the free space for the block group from the free space tree
         * and the block group's item from the extent tree before marking the
         * block group as removed. This is to prevent races with tasks that
         * freeze and unfreeze a block group, this task and another task
         * allocating a new block group - the unfreeze task ends up removing
         * the block group's extent map before the task calling this function
         * deletes the block group item from the extent tree, allowing for
         * another task to attempt to create another block group with the same
         * item key (and failing with -EEXIST and a transaction abort).
         */
        ret = remove_block_group_free_space(trans, block_group);
        if (ret)
                goto out;

        ret = remove_block_group_item(trans, path, block_group);
        if (ret < 0)
                goto out;

        spin_lock(&block_group->lock);
        block_group->removed = 1;
        /*
         * At this point trimming or scrub can't start on this block group,
         * because we removed the block group from the rbtree
         * fs_info->block_group_cache_tree so no one can find it anymore and
         * even if someone already got this block group before we removed it
         * from the rbtree, they have already incremented block_group->frozen -
         * if they didn't, for the trimming case they won't find any free space
         * entries because we already removed them all when we called
         * btrfs_remove_free_space_cache().
         *
         * And we must not remove the extent map from the fs_info->mapping_tree
         * to prevent the same logical address range and physical device space
         * ranges from being reused for a new block group. This is needed to
         * avoid races with trimming and scrub.
         *
         * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
         * completely transactionless, so while it is trimming a range the
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
         *
         * There may also be an implicit trim operation if the file system
         * is mounted with -odiscard. The same protections must remain
         * in place until the extents have been discarded completely when
         * the transaction commit has completed.
         */
        remove_em = (atomic_read(&block_group->frozen) == 0);
        spin_unlock(&block_group->lock);

        if (remove_em) {
                struct extent_map_tree *em_tree;

                em_tree = &fs_info->mapping_tree;
                write_lock(&em_tree->lock);
                remove_extent_mapping(em_tree, em);
                write_unlock(&em_tree->lock);
                /* Once for the tree */
                free_extent_map(em);
        }

out:
        /* Once for the lookup reference */
        btrfs_put_block_group(block_group);
        if (remove_rsv)
                btrfs_delayed_refs_rsv_release(fs_info, 1);
        btrfs_free_path(path);
        return ret;
}
1095
1096 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1097                 struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1098 {
1099         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1100         struct extent_map *em;
1101         struct map_lookup *map;
1102         unsigned int num_items;
1103
1104         read_lock(&em_tree->lock);
1105         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1106         read_unlock(&em_tree->lock);
1107         ASSERT(em && em->start == chunk_offset);
1108
1109         /*
1110          * We need to reserve 3 + N units from the metadata space info in order
1111          * to remove a block group (done at btrfs_remove_chunk() and at
1112          * btrfs_remove_block_group()), which are used for:
1113          *
1114          * 1 unit for adding the free space inode's orphan (located in the tree
1115          * of tree roots).
1116          * 1 unit for deleting the block group item (located in the extent
1117          * tree).
1118          * 1 unit for deleting the free space item (located in tree of tree
1119          * roots).
1120          * N units for deleting N device extent items corresponding to each
1121          * stripe (located in the device tree).
1122          *
1123          * In order to remove a block group we also need to reserve units in the
1124          * system space info in order to update the chunk tree (update one or
1125          * more device items and remove one chunk item), but this is done at
1126          * btrfs_remove_chunk() through a call to check_system_chunk().
1127          */
1128         map = em->map_lookup;
1129         num_items = 3 + map->num_stripes;
1130         free_extent_map(em);
1131
1132         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
1133                                                            num_items);
1134 }
1135
/*
 * Mark block group @cache read-only, so later writes won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by the relocation
 * routine, not this function.
 */
1149 static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
1150 {
1151         struct btrfs_space_info *sinfo = cache->space_info;
1152         u64 num_bytes;
1153         int ret = -ENOSPC;
1154
1155         spin_lock(&sinfo->lock);
1156         spin_lock(&cache->lock);
1157
1158         if (cache->swap_extents) {
1159                 ret = -ETXTBSY;
1160                 goto out;
1161         }
1162
1163         if (cache->ro) {
1164                 cache->ro++;
1165                 ret = 0;
1166                 goto out;
1167         }
1168
1169         num_bytes = cache->length - cache->reserved - cache->pinned -
1170                     cache->bytes_super - cache->zone_unusable - cache->used;
1171
1172         /*
1173          * Data never overcommits, even in mixed mode, so do just the straight
1174          * check of left over space in how much we have allocated.
1175          */
1176         if (force) {
1177                 ret = 0;
1178         } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1179                 u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1180
1181                 /*
1182                  * Here we make sure if we mark this bg RO, we still have enough
1183                  * free space as buffer.
1184                  */
1185                 if (sinfo_used + num_bytes <= sinfo->total_bytes)
1186                         ret = 0;
1187         } else {
1188                 /*
1189                  * We overcommit metadata, so we need to do the
1190                  * btrfs_can_overcommit check here, and we need to pass in
1191                  * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1192                  * leeway to allow us to mark this block group as read only.
1193                  */
1194                 if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1195                                          BTRFS_RESERVE_NO_FLUSH))
1196                         ret = 0;
1197         }
1198
1199         if (!ret) {
1200                 sinfo->bytes_readonly += num_bytes;
1201                 if (btrfs_is_zoned(cache->fs_info)) {
1202                         /* Migrate zone_unusable bytes to readonly */
1203                         sinfo->bytes_readonly += cache->zone_unusable;
1204                         sinfo->bytes_zone_unusable -= cache->zone_unusable;
1205                         cache->zone_unusable = 0;
1206                 }
1207                 cache->ro++;
1208                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1209         }
1210 out:
1211         spin_unlock(&cache->lock);
1212         spin_unlock(&sinfo->lock);
1213         if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1214                 btrfs_info(cache->fs_info,
1215                         "unable to make block group %llu ro", cache->start);
1216                 btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
1217         }
1218         return ret;
1219 }
1220
1221 static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1222                                  struct btrfs_block_group *bg)
1223 {
1224         struct btrfs_fs_info *fs_info = bg->fs_info;
1225         struct btrfs_transaction *prev_trans = NULL;
1226         const u64 start = bg->start;
1227         const u64 end = start + bg->length - 1;
1228         int ret;
1229
1230         spin_lock(&fs_info->trans_lock);
1231         if (trans->transaction->list.prev != &fs_info->trans_list) {
1232                 prev_trans = list_last_entry(&trans->transaction->list,
1233                                              struct btrfs_transaction, list);
1234                 refcount_inc(&prev_trans->use_count);
1235         }
1236         spin_unlock(&fs_info->trans_lock);
1237
1238         /*
1239          * Hold the unused_bg_unpin_mutex lock to avoid racing with
1240          * btrfs_finish_extent_commit(). If we are at transaction N, another
1241          * task might be running finish_extent_commit() for the previous
1242          * transaction N - 1, and have seen a range belonging to the block
1243          * group in pinned_extents before we were able to clear the whole block
1244          * group range from pinned_extents. This means that task can lookup for
1245          * the block group after we unpinned it from pinned_extents and removed
1246          * it, leading to a BUG_ON() at unpin_extent_range().
1247          */
1248         mutex_lock(&fs_info->unused_bg_unpin_mutex);
1249         if (prev_trans) {
1250                 ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
1251                                         EXTENT_DIRTY);
1252                 if (ret)
1253                         goto out;
1254         }
1255
1256         ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
1257                                 EXTENT_DIRTY);
1258 out:
1259         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1260         if (prev_trans)
1261                 btrfs_put_transaction(prev_trans);
1262
1263         return ret == 0;
1264 }
1265
/*
 * Process the unused_bgs list and remove any block groups that don't have any
 * allocated space inside of them.
 *
 * Nothing is done if BTRFS_FS_OPEN is not set or if reclaim_bgs_lock cannot
 * be taken without blocking.
 *
 * Every entry is taken off fs_info->unused_bgs and a reference on it is
 * dropped once it has been handled.  An entry is skipped (not removed) when
 * its space info is mixed, when async discard is enabled and it is not fully
 * trimmed, when it has reserved, pinned or used bytes, when it is already
 * read-only, when list_is_singular() is true for its ->list member, or when
 * it cannot be marked read-only.  Once starting the transaction or removing a
 * chunk has failed, @ret stays non-zero and all remaining entries are only
 * dropped from the list.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        /* Snapshot of the mount option, compared again inside the loop. */
        const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
        int ret = 0;

        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
                return;

        /*
         * Long running balances can keep us blocked here for eternity, so
         * simply skip deletion if we're unable to get the mutex.
         */
        if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
                return;

        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                int trimming;

                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group,
                                               bg_list);
                list_del_init(&block_group->bg_list);

                space_info = block_group->space_info;

                /*
                 * After an earlier failure, or for mixed space infos, just
                 * drop the entry; unused_bgs_lock stays held for the next
                 * iteration.
                 */
                if (ret || btrfs_mixed_space_info(space_info)) {
                        btrfs_put_block_group(block_group);
                        continue;
                }
                spin_unlock(&fs_info->unused_bgs_lock);

                btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);

                /*
                 * Async discard moves the final block group discard to be prior
                 * to the unused_bgs code path.  Therefore, if it's not fully
                 * trimmed, punt it back to the async discard lists.
                 */
                if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
                    !btrfs_is_free_space_trimmed(block_group)) {
                        trace_btrfs_skip_unused_block_group(block_group);
                        up_write(&space_info->groups_sem);
                        /* Requeue if we failed because of async discard */
                        btrfs_discard_queue_work(&fs_info->discard_ctl,
                                                 block_group);
                        goto next;
                }

                spin_lock(&block_group->lock);
                if (block_group->reserved || block_group->pinned ||
                    block_group->used || block_group->ro ||
                    list_is_singular(&block_group->list)) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group.  We do
                         * the ro check in case balance is currently acting on
                         * this block group.
                         */
                        trace_btrfs_skip_unused_block_group(block_group);
                        spin_unlock(&block_group->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
                spin_unlock(&block_group->lock);

                /* We don't want to force the issue, only flip if it's ok. */
                ret = inc_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        /* Not an error for this loop: skip this block group. */
                        ret = 0;
                        goto next;
                }

                /*
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
                trans = btrfs_start_trans_remove_block_group(fs_info,
                                                     block_group->start);
                if (IS_ERR(trans)) {
                        btrfs_dec_block_group_ro(block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }

                /*
                 * We could have pending pinned extents for this block group,
                 * just delete them, we don't care about them anymore.
                 */
                if (!clean_pinned_extents(trans, block_group)) {
                        btrfs_dec_block_group_ro(block_group);
                        goto end_trans;
                }

                /*
                 * At this point, the block_group is read only and should fail
                 * new allocations.  However, btrfs_finish_extent_commit() can
                 * cause this block_group to be placed back on the discard
                 * lists because now the block_group isn't fully discarded.
                 * Bail here and try again later after discarding everything.
                 */
                spin_lock(&fs_info->discard_ctl.lock);
                if (!list_empty(&block_group->discard_list)) {
                        spin_unlock(&fs_info->discard_ctl.lock);
                        btrfs_dec_block_group_ro(block_group);
                        btrfs_discard_queue_work(&fs_info->discard_ctl,
                                                 block_group);
                        goto end_trans;
                }
                spin_unlock(&fs_info->discard_ctl.lock);

                /* Reset pinned so btrfs_put_block_group doesn't complain */
                spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);

                btrfs_space_info_update_bytes_pinned(fs_info, space_info,
                                                     -block_group->pinned);
                space_info->bytes_readonly += block_group->pinned;
                block_group->pinned = 0;

                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);

                /*
                 * The normal path here is an unused block group is passed here,
                 * then trimming is handled in the transaction commit path.
                 * Async discard interposes before this to do the trimming
                 * before coming down the unused block group path as trimming
                 * will no longer be done later in the transaction commit path.
                 *
                 * If DISCARD_ASYNC was off when this function started but is
                 * on now, stop processing and take the flip_async exit.
                 */
                if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
                        goto flip_async;

                /*
                 * DISCARD can flip during remount. On zoned filesystems, we
                 * need to reset sequential-required zones.
                 */
                trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
                                btrfs_is_zoned(fs_info);

                /* Implicit trim during transaction commit. */
                if (trimming)
                        btrfs_freeze_block_group(block_group);

                /*
                 * btrfs_remove_chunk() will abort the transaction if things go
                 * horribly wrong.
                 */
                ret = btrfs_remove_chunk(trans, block_group->start);

                if (ret) {
                        if (trimming)
                                btrfs_unfreeze_block_group(block_group);
                        goto end_trans;
                }

                /*
                 * If we're not mounted with -odiscard, we can just forget
                 * about this block group. Otherwise we'll need to wait
                 * until transaction commit to do the actual discard.
                 */
                if (trimming) {
                        spin_lock(&fs_info->unused_bgs_lock);
                        /*
                         * A concurrent scrub might have added us to the list
                         * fs_info->unused_bgs, so use a list_move operation
                         * to add the block group to the deleted_bgs list.
                         */
                        list_move(&block_group->bg_list,
                                  &trans->transaction->deleted_bgs);
                        spin_unlock(&fs_info->unused_bgs_lock);
                        /* Extra reference taken after adding to deleted_bgs. */
                        btrfs_get_block_group(block_group);
                }
end_trans:
                btrfs_end_transaction(trans);
next:
                /* Drop our reference and retake the list lock for the loop. */
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        return;

flip_async:
        /*
         * Reached from inside the loop with unused_bgs_lock not held: end the
         * transaction, release the mutex, drop our reference and hand the
         * rest of the unused_bgs list to the discard code.
         */
        btrfs_end_transaction(trans);
        mutex_unlock(&fs_info->reclaim_bgs_lock);
        btrfs_put_block_group(block_group);
        btrfs_discard_punt_unused_bgs_list(fs_info);
}
1466
1467 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1468 {
1469         struct btrfs_fs_info *fs_info = bg->fs_info;
1470
1471         spin_lock(&fs_info->unused_bgs_lock);
1472         if (list_empty(&bg->bg_list)) {
1473                 btrfs_get_block_group(bg);
1474                 trace_btrfs_add_unused_block_group(bg);
1475                 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1476         }
1477         spin_unlock(&fs_info->unused_bgs_lock);
1478 }
1479
1480 void btrfs_reclaim_bgs_work(struct work_struct *work)
1481 {
1482         struct btrfs_fs_info *fs_info =
1483                 container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1484         struct btrfs_block_group *bg;
1485         struct btrfs_space_info *space_info;
1486
1487         if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1488                 return;
1489
1490         sb_start_write(fs_info->sb);
1491
1492         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1493                 sb_end_write(fs_info->sb);
1494                 return;
1495         }
1496
1497         /*
1498          * Long running balances can keep us blocked here for eternity, so
1499          * simply skip reclaim if we're unable to get the mutex.
1500          */
1501         if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1502                 btrfs_exclop_finish(fs_info);
1503                 sb_end_write(fs_info->sb);
1504                 return;
1505         }
1506
1507         spin_lock(&fs_info->unused_bgs_lock);
1508         while (!list_empty(&fs_info->reclaim_bgs)) {
1509                 u64 zone_unusable;
1510                 int ret = 0;
1511
1512                 bg = list_first_entry(&fs_info->reclaim_bgs,
1513                                       struct btrfs_block_group,
1514                                       bg_list);
1515                 list_del_init(&bg->bg_list);
1516
1517                 space_info = bg->space_info;
1518                 spin_unlock(&fs_info->unused_bgs_lock);
1519
1520                 /* Don't race with allocators so take the groups_sem */
1521                 down_write(&space_info->groups_sem);
1522
1523                 spin_lock(&bg->lock);
1524                 if (bg->reserved || bg->pinned || bg->ro) {
1525                         /*
1526                          * We want to bail if we made new allocations or have
1527                          * outstanding allocations in this block group.  We do
1528                          * the ro check in case balance is currently acting on
1529                          * this block group.
1530                          */
1531                         spin_unlock(&bg->lock);
1532                         up_write(&space_info->groups_sem);
1533                         goto next;
1534                 }
1535                 spin_unlock(&bg->lock);
1536
1537                 /* Get out fast, in case we're unmounting the filesystem */
1538                 if (btrfs_fs_closing(fs_info)) {
1539                         up_write(&space_info->groups_sem);
1540                         goto next;
1541                 }
1542
1543                 /*
1544                  * Cache the zone_unusable value before turning the block group
1545                  * to read only. As soon as the blog group is read only it's
1546                  * zone_unusable value gets moved to the block group's read-only
1547                  * bytes and isn't available for calculations anymore.
1548                  */
1549                 zone_unusable = bg->zone_unusable;
1550                 ret = inc_block_group_ro(bg, 0);
1551                 up_write(&space_info->groups_sem);
1552                 if (ret < 0)
1553                         goto next;
1554
1555                 btrfs_info(fs_info,
1556                         "reclaiming chunk %llu with %llu%% used %llu%% unusable",
1557                                 bg->start, div_u64(bg->used * 100, bg->length),
1558                                 div64_u64(zone_unusable * 100, bg->length));
1559                 trace_btrfs_reclaim_block_group(bg);
1560                 ret = btrfs_relocate_chunk(fs_info, bg->start);
1561                 if (ret) {
1562                         btrfs_dec_block_group_ro(bg);
1563                         btrfs_err(fs_info, "error relocating chunk %llu",
1564                                   bg->start);
1565                 }
1566
1567 next:
1568                 btrfs_put_block_group(bg);
1569                 spin_lock(&fs_info->unused_bgs_lock);
1570         }
1571         spin_unlock(&fs_info->unused_bgs_lock);
1572         mutex_unlock(&fs_info->reclaim_bgs_lock);
1573         btrfs_exclop_finish(fs_info);
1574         sb_end_write(fs_info->sb);
1575 }
1576
1577 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
1578 {
1579         spin_lock(&fs_info->unused_bgs_lock);
1580         if (!list_empty(&fs_info->reclaim_bgs))
1581                 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
1582         spin_unlock(&fs_info->unused_bgs_lock);
1583 }
1584
1585 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
1586 {
1587         struct btrfs_fs_info *fs_info = bg->fs_info;
1588
1589         spin_lock(&fs_info->unused_bgs_lock);
1590         if (list_empty(&bg->bg_list)) {
1591                 btrfs_get_block_group(bg);
1592                 trace_btrfs_add_reclaim_block_group(bg);
1593                 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
1594         }
1595         spin_unlock(&fs_info->unused_bgs_lock);
1596 }
1597
1598 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1599                            struct btrfs_path *path)
1600 {
1601         struct extent_map_tree *em_tree;
1602         struct extent_map *em;
1603         struct btrfs_block_group_item bg;
1604         struct extent_buffer *leaf;
1605         int slot;
1606         u64 flags;
1607         int ret = 0;
1608
1609         slot = path->slots[0];
1610         leaf = path->nodes[0];
1611
1612         em_tree = &fs_info->mapping_tree;
1613         read_lock(&em_tree->lock);
1614         em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
1615         read_unlock(&em_tree->lock);
1616         if (!em) {
1617                 btrfs_err(fs_info,
1618                           "logical %llu len %llu found bg but no related chunk",
1619                           key->objectid, key->offset);
1620                 return -ENOENT;
1621         }
1622
1623         if (em->start != key->objectid || em->len != key->offset) {
1624                 btrfs_err(fs_info,
1625                         "block group %llu len %llu mismatch with chunk %llu len %llu",
1626                         key->objectid, key->offset, em->start, em->len);
1627                 ret = -EUCLEAN;
1628                 goto out_free_em;
1629         }
1630
1631         read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
1632                            sizeof(bg));
1633         flags = btrfs_stack_block_group_flags(&bg) &
1634                 BTRFS_BLOCK_GROUP_TYPE_MASK;
1635
1636         if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1637                 btrfs_err(fs_info,
1638 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1639                           key->objectid, key->offset, flags,
1640                           (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
1641                 ret = -EUCLEAN;
1642         }
1643
1644 out_free_em:
1645         free_extent_map(em);
1646         return ret;
1647 }
1648
1649 static int find_first_block_group(struct btrfs_fs_info *fs_info,
1650                                   struct btrfs_path *path,
1651                                   struct btrfs_key *key)
1652 {
1653         struct btrfs_root *root = fs_info->extent_root;
1654         int ret;
1655         struct btrfs_key found_key;
1656         struct extent_buffer *leaf;
1657         int slot;
1658
1659         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1660         if (ret < 0)
1661                 return ret;
1662
1663         while (1) {
1664                 slot = path->slots[0];
1665                 leaf = path->nodes[0];
1666                 if (slot >= btrfs_header_nritems(leaf)) {
1667                         ret = btrfs_next_leaf(root, path);
1668                         if (ret == 0)
1669                                 continue;
1670                         if (ret < 0)
1671                                 goto out;
1672                         break;
1673                 }
1674                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1675
1676                 if (found_key.objectid >= key->objectid &&
1677                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1678                         ret = read_bg_from_eb(fs_info, &found_key, path);
1679                         break;
1680                 }
1681
1682                 path->slots[0]++;
1683         }
1684 out:
1685         return ret;
1686 }
1687
1688 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1689 {
1690         u64 extra_flags = chunk_to_extended(flags) &
1691                                 BTRFS_EXTENDED_PROFILE_MASK;
1692
1693         write_seqlock(&fs_info->profiles_lock);
1694         if (flags & BTRFS_BLOCK_GROUP_DATA)
1695                 fs_info->avail_data_alloc_bits |= extra_flags;
1696         if (flags & BTRFS_BLOCK_GROUP_METADATA)
1697                 fs_info->avail_metadata_alloc_bits |= extra_flags;
1698         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1699                 fs_info->avail_system_alloc_bits |= extra_flags;
1700         write_sequnlock(&fs_info->profiles_lock);
1701 }
1702
1703 /**
1704  * Map a physical disk address to a list of logical addresses
1705  *
1706  * @fs_info:       the filesystem
1707  * @chunk_start:   logical address of block group
1708  * @bdev:          physical device to resolve, can be NULL to indicate any device
1709  * @physical:      physical address to map to logical addresses
1710  * @logical:       return array of logical addresses which map to @physical
1711  * @naddrs:        length of @logical
1712  * @stripe_len:    size of IO stripe for the given block group
1713  *
1714  * Maps a particular @physical disk address to a list of @logical addresses.
1715  * Used primarily to exclude those portions of a block group that contain super
1716  * block copies.
1717  */
1718 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
1719                      struct block_device *bdev, u64 physical, u64 **logical,
1720                      int *naddrs, int *stripe_len)
1721 {
1722         struct extent_map *em;
1723         struct map_lookup *map;
1724         u64 *buf;
1725         u64 bytenr;
1726         u64 data_stripe_length;
1727         u64 io_stripe_size;
1728         int i, nr = 0;
1729         int ret = 0;
1730
1731         em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1732         if (IS_ERR(em))
1733                 return -EIO;
1734
1735         map = em->map_lookup;
1736         data_stripe_length = em->orig_block_len;
1737         io_stripe_size = map->stripe_len;
1738         chunk_start = em->start;
1739
1740         /* For RAID5/6 adjust to a full IO stripe length */
1741         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
1742                 io_stripe_size = map->stripe_len * nr_data_stripes(map);
1743
1744         buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
1745         if (!buf) {
1746                 ret = -ENOMEM;
1747                 goto out;
1748         }
1749
1750         for (i = 0; i < map->num_stripes; i++) {
1751                 bool already_inserted = false;
1752                 u64 stripe_nr;
1753                 u64 offset;
1754                 int j;
1755
1756                 if (!in_range(physical, map->stripes[i].physical,
1757                               data_stripe_length))
1758                         continue;
1759
1760                 if (bdev && map->stripes[i].dev->bdev != bdev)
1761                         continue;
1762
1763                 stripe_nr = physical - map->stripes[i].physical;
1764                 stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
1765
1766                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1767                         stripe_nr = stripe_nr * map->num_stripes + i;
1768                         stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1769                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1770                         stripe_nr = stripe_nr * map->num_stripes + i;
1771                 }
1772                 /*
1773                  * The remaining case would be for RAID56, multiply by
1774                  * nr_data_stripes().  Alternatively, just use rmap_len below
1775                  * instead of map->stripe_len
1776                  */
1777
1778                 bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
1779
1780                 /* Ensure we don't add duplicate addresses */
1781                 for (j = 0; j < nr; j++) {
1782                         if (buf[j] == bytenr) {
1783                                 already_inserted = true;
1784                                 break;
1785                         }
1786                 }
1787
1788                 if (!already_inserted)
1789                         buf[nr++] = bytenr;
1790         }
1791
1792         *logical = buf;
1793         *naddrs = nr;
1794         *stripe_len = io_stripe_size;
1795 out:
1796         free_extent_map(em);
1797         return ret;
1798 }
1799
1800 static int exclude_super_stripes(struct btrfs_block_group *cache)
1801 {
1802         struct btrfs_fs_info *fs_info = cache->fs_info;
1803         const bool zoned = btrfs_is_zoned(fs_info);
1804         u64 bytenr;
1805         u64 *logical;
1806         int stripe_len;
1807         int i, nr, ret;
1808
1809         if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1810                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
1811                 cache->bytes_super += stripe_len;
1812                 ret = btrfs_add_excluded_extent(fs_info, cache->start,
1813                                                 stripe_len);
1814                 if (ret)
1815                         return ret;
1816         }
1817
1818         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1819                 bytenr = btrfs_sb_offset(i);
1820                 ret = btrfs_rmap_block(fs_info, cache->start, NULL,
1821                                        bytenr, &logical, &nr, &stripe_len);
1822                 if (ret)
1823                         return ret;
1824
1825                 /* Shouldn't have super stripes in sequential zones */
1826                 if (zoned && nr) {
1827                         btrfs_err(fs_info,
1828                         "zoned: block group %llu must not contain super block",
1829                                   cache->start);
1830                         return -EUCLEAN;
1831                 }
1832
1833                 while (nr--) {
1834                         u64 len = min_t(u64, stripe_len,
1835                                 cache->start + cache->length - logical[nr]);
1836
1837                         cache->bytes_super += len;
1838                         ret = btrfs_add_excluded_extent(fs_info, logical[nr],
1839                                                         len);
1840                         if (ret) {
1841                                 kfree(logical);
1842                                 return ret;
1843                         }
1844                 }
1845
1846                 kfree(logical);
1847         }
1848         return 0;
1849 }
1850
1851 static void link_block_group(struct btrfs_block_group *cache)
1852 {
1853         struct btrfs_space_info *space_info = cache->space_info;
1854         int index = btrfs_bg_flags_to_raid_index(cache->flags);
1855
1856         down_write(&space_info->groups_sem);
1857         list_add_tail(&cache->list, &space_info->block_groups[index]);
1858         up_write(&space_info->groups_sem);
1859 }
1860
1861 static struct btrfs_block_group *btrfs_create_block_group_cache(
1862                 struct btrfs_fs_info *fs_info, u64 start)
1863 {
1864         struct btrfs_block_group *cache;
1865
1866         cache = kzalloc(sizeof(*cache), GFP_NOFS);
1867         if (!cache)
1868                 return NULL;
1869
1870         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1871                                         GFP_NOFS);
1872         if (!cache->free_space_ctl) {
1873                 kfree(cache);
1874                 return NULL;
1875         }
1876
1877         cache->start = start;
1878
1879         cache->fs_info = fs_info;
1880         cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
1881
1882         cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1883
1884         refcount_set(&cache->refs, 1);
1885         spin_lock_init(&cache->lock);
1886         init_rwsem(&cache->data_rwsem);
1887         INIT_LIST_HEAD(&cache->list);
1888         INIT_LIST_HEAD(&cache->cluster_list);
1889         INIT_LIST_HEAD(&cache->bg_list);
1890         INIT_LIST_HEAD(&cache->ro_list);
1891         INIT_LIST_HEAD(&cache->discard_list);
1892         INIT_LIST_HEAD(&cache->dirty_list);
1893         INIT_LIST_HEAD(&cache->io_list);
1894         btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
1895         atomic_set(&cache->frozen, 0);
1896         mutex_init(&cache->free_space_lock);
1897         btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1898
1899         return cache;
1900 }
1901
1902 /*
1903  * Iterate all chunks and verify that each of them has the corresponding block
1904  * group
1905  */
1906 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1907 {
1908         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1909         struct extent_map *em;
1910         struct btrfs_block_group *bg;
1911         u64 start = 0;
1912         int ret = 0;
1913
1914         while (1) {
1915                 read_lock(&map_tree->lock);
1916                 /*
1917                  * lookup_extent_mapping will return the first extent map
1918                  * intersecting the range, so setting @len to 1 is enough to
1919                  * get the first chunk.
1920                  */
1921                 em = lookup_extent_mapping(map_tree, start, 1);
1922                 read_unlock(&map_tree->lock);
1923                 if (!em)
1924                         break;
1925
1926                 bg = btrfs_lookup_block_group(fs_info, em->start);
1927                 if (!bg) {
1928                         btrfs_err(fs_info,
1929         "chunk start=%llu len=%llu doesn't have corresponding block group",
1930                                      em->start, em->len);
1931                         ret = -EUCLEAN;
1932                         free_extent_map(em);
1933                         break;
1934                 }
1935                 if (bg->start != em->start || bg->length != em->len ||
1936                     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1937                     (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1938                         btrfs_err(fs_info,
1939 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1940                                 em->start, em->len,
1941                                 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
1942                                 bg->start, bg->length,
1943                                 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1944                         ret = -EUCLEAN;
1945                         free_extent_map(em);
1946                         btrfs_put_block_group(bg);
1947                         break;
1948                 }
1949                 start = em->start + em->len;
1950                 free_extent_map(em);
1951                 btrfs_put_block_group(bg);
1952         }
1953         return ret;
1954 }
1955
1956 static int read_one_block_group(struct btrfs_fs_info *info,
1957                                 struct btrfs_block_group_item *bgi,
1958                                 const struct btrfs_key *key,
1959                                 int need_clear)
1960 {
1961         struct btrfs_block_group *cache;
1962         struct btrfs_space_info *space_info;
1963         const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
1964         int ret;
1965
1966         ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
1967
1968         cache = btrfs_create_block_group_cache(info, key->objectid);
1969         if (!cache)
1970                 return -ENOMEM;
1971
1972         cache->length = key->offset;
1973         cache->used = btrfs_stack_block_group_used(bgi);
1974         cache->flags = btrfs_stack_block_group_flags(bgi);
1975
1976         set_free_space_tree_thresholds(cache);
1977
1978         if (need_clear) {
1979                 /*
1980                  * When we mount with old space cache, we need to
1981                  * set BTRFS_DC_CLEAR and set dirty flag.
1982                  *
1983                  * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
1984                  *    truncate the old free space cache inode and
1985                  *    setup a new one.
1986                  * b) Setting 'dirty flag' makes sure that we flush
1987                  *    the new space cache info onto disk.
1988                  */
1989                 if (btrfs_test_opt(info, SPACE_CACHE))
1990                         cache->disk_cache_state = BTRFS_DC_CLEAR;
1991         }
1992         if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
1993             (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
1994                         btrfs_err(info,
1995 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
1996                                   cache->start);
1997                         ret = -EINVAL;
1998                         goto error;
1999         }
2000
2001         ret = btrfs_load_block_group_zone_info(cache, false);
2002         if (ret) {
2003                 btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2004                           cache->start);
2005                 goto error;
2006         }
2007
2008         /*
2009          * We need to exclude the super stripes now so that the space info has
2010          * super bytes accounted for, otherwise we'll think we have more space
2011          * than we actually do.
2012          */
2013         ret = exclude_super_stripes(cache);
2014         if (ret) {
2015                 /* We may have excluded something, so call this just in case. */
2016                 btrfs_free_excluded_extents(cache);
2017                 goto error;
2018         }
2019
2020         /*
2021          * For zoned filesystem, space after the allocation offset is the only
2022          * free space for a block group. So, we don't need any caching work.
2023          * btrfs_calc_zone_unusable() will set the amount of free space and
2024          * zone_unusable space.
2025          *
2026          * For regular filesystem, check for two cases, either we are full, and
2027          * therefore don't need to bother with the caching work since we won't
2028          * find any space, or we are empty, and we can just add all the space
2029          * in and be done with it.  This saves us _a_lot_ of time, particularly
2030          * in the full case.
2031          */
2032         if (btrfs_is_zoned(info)) {
2033                 btrfs_calc_zone_unusable(cache);
2034         } else if (cache->length == cache->used) {
2035                 cache->last_byte_to_unpin = (u64)-1;
2036                 cache->cached = BTRFS_CACHE_FINISHED;
2037                 btrfs_free_excluded_extents(cache);
2038         } else if (cache->used == 0) {
2039                 cache->last_byte_to_unpin = (u64)-1;
2040                 cache->cached = BTRFS_CACHE_FINISHED;
2041                 add_new_free_space(cache, cache->start,
2042                                    cache->start + cache->length);
2043                 btrfs_free_excluded_extents(cache);
2044         }
2045
2046         ret = btrfs_add_block_group_cache(info, cache);
2047         if (ret) {
2048                 btrfs_remove_free_space_cache(cache);
2049                 goto error;
2050         }
2051         trace_btrfs_add_block_group(info, cache, 0);
2052         btrfs_update_space_info(info, cache->flags, cache->length,
2053                                 cache->used, cache->bytes_super,
2054                                 cache->zone_unusable, &space_info);
2055
2056         cache->space_info = space_info;
2057
2058         link_block_group(cache);
2059
2060         set_avail_alloc_bits(info, cache->flags);
2061         if (btrfs_chunk_readonly(info, cache->start)) {
2062                 inc_block_group_ro(cache, 1);
2063         } else if (cache->used == 0) {
2064                 ASSERT(list_empty(&cache->bg_list));
2065                 if (btrfs_test_opt(info, DISCARD_ASYNC))
2066                         btrfs_discard_queue_work(&info->discard_ctl, cache);
2067                 else
2068                         btrfs_mark_bg_unused(cache);
2069         }
2070         return 0;
2071 error:
2072         btrfs_put_block_group(cache);
2073         return ret;
2074 }
2075
2076 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2077 {
2078         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
2079         struct btrfs_space_info *space_info;
2080         struct rb_node *node;
2081         int ret = 0;
2082
2083         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
2084                 struct extent_map *em;
2085                 struct map_lookup *map;
2086                 struct btrfs_block_group *bg;
2087
2088                 em = rb_entry(node, struct extent_map, rb_node);
2089                 map = em->map_lookup;
2090                 bg = btrfs_create_block_group_cache(fs_info, em->start);
2091                 if (!bg) {
2092                         ret = -ENOMEM;
2093                         break;
2094                 }
2095
2096                 /* Fill dummy cache as FULL */
2097                 bg->length = em->len;
2098                 bg->flags = map->type;
2099                 bg->last_byte_to_unpin = (u64)-1;
2100                 bg->cached = BTRFS_CACHE_FINISHED;
2101                 bg->used = em->len;
2102                 bg->flags = map->type;
2103                 ret = btrfs_add_block_group_cache(fs_info, bg);
2104                 /*
2105                  * We may have some valid block group cache added already, in
2106                  * that case we skip to the next one.
2107                  */
2108                 if (ret == -EEXIST) {
2109                         ret = 0;
2110                         btrfs_put_block_group(bg);
2111                         continue;
2112                 }
2113
2114                 if (ret) {
2115                         btrfs_remove_free_space_cache(bg);
2116                         btrfs_put_block_group(bg);
2117                         break;
2118                 }
2119
2120                 btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
2121                                         0, 0, &space_info);
2122                 bg->space_info = space_info;
2123                 link_block_group(bg);
2124
2125                 set_avail_alloc_bits(fs_info, bg->flags);
2126         }
2127         if (!ret)
2128                 btrfs_init_global_block_rsv(fs_info);
2129         return ret;
2130 }
2131
2132 int btrfs_read_block_groups(struct btrfs_fs_info *info)
2133 {
2134         struct btrfs_path *path;
2135         int ret;
2136         struct btrfs_block_group *cache;
2137         struct btrfs_space_info *space_info;
2138         struct btrfs_key key;
2139         int need_clear = 0;
2140         u64 cache_gen;
2141
2142         /*
2143          * Either no extent root (with ibadroots rescue option) or we have
2144          * unsupported RO options. The fs can never be mounted read-write, so no
2145          * need to waste time searching block group items.
2146          *
2147          * This also allows new extent tree related changes to be RO compat,
2148          * no need for a full incompat flag.
2149          */
2150         if (!info->extent_root || (btrfs_super_compat_ro_flags(info->super_copy) &
2151                       ~BTRFS_FEATURE_COMPAT_RO_SUPP))
2152                 return fill_dummy_bgs(info);
2153
2154         key.objectid = 0;
2155         key.offset = 0;
2156         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2157         path = btrfs_alloc_path();
2158         if (!path)
2159                 return -ENOMEM;
2160
2161         cache_gen = btrfs_super_cache_generation(info->super_copy);
2162         if (btrfs_test_opt(info, SPACE_CACHE) &&
2163             btrfs_super_generation(info->super_copy) != cache_gen)
2164                 need_clear = 1;
2165         if (btrfs_test_opt(info, CLEAR_CACHE))
2166                 need_clear = 1;
2167
2168         while (1) {
2169                 struct btrfs_block_group_item bgi;
2170                 struct extent_buffer *leaf;
2171                 int slot;
2172
2173                 ret = find_first_block_group(info, path, &key);
2174                 if (ret > 0)
2175                         break;
2176                 if (ret != 0)
2177                         goto error;
2178
2179                 leaf = path->nodes[0];
2180                 slot = path->slots[0];
2181
2182                 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2183                                    sizeof(bgi));
2184
2185                 btrfs_item_key_to_cpu(leaf, &key, slot);
2186                 btrfs_release_path(path);
2187                 ret = read_one_block_group(info, &bgi, &key, need_clear);
2188                 if (ret < 0)
2189                         goto error;
2190                 key.objectid += key.offset;
2191                 key.offset = 0;
2192         }
2193         btrfs_release_path(path);
2194
2195         list_for_each_entry(space_info, &info->space_info, list) {
2196                 int i;
2197
2198                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2199                         if (list_empty(&space_info->block_groups[i]))
2200                                 continue;
2201                         cache = list_first_entry(&space_info->block_groups[i],
2202                                                  struct btrfs_block_group,
2203                                                  list);
2204                         btrfs_sysfs_add_block_group_type(cache);
2205                 }
2206
2207                 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2208                       (BTRFS_BLOCK_GROUP_RAID10 |
2209                        BTRFS_BLOCK_GROUP_RAID1_MASK |
2210                        BTRFS_BLOCK_GROUP_RAID56_MASK |
2211                        BTRFS_BLOCK_GROUP_DUP)))
2212                         continue;
2213                 /*
2214                  * Avoid allocating from un-mirrored block group if there are
2215                  * mirrored block groups.
2216                  */
2217                 list_for_each_entry(cache,
2218                                 &space_info->block_groups[BTRFS_RAID_RAID0],
2219                                 list)
2220                         inc_block_group_ro(cache, 1);
2221                 list_for_each_entry(cache,
2222                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
2223                                 list)
2224                         inc_block_group_ro(cache, 1);
2225         }
2226
2227         btrfs_init_global_block_rsv(info);
2228         ret = check_chunk_block_group_mappings(info);
2229 error:
2230         btrfs_free_path(path);
2231         /*
2232          * We've hit some error while reading the extent tree, and have
2233          * rescue=ibadroots mount option.
2234          * Try to fill the tree using dummy block groups so that the user can
2235          * continue to mount and grab their data.
2236          */
2237         if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2238                 ret = fill_dummy_bgs(info);
2239         return ret;
2240 }
2241
2242 /*
2243  * This function, insert_block_group_item(), belongs to the phase 2 of chunk
2244  * allocation.
2245  *
2246  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2247  * phases.
2248  */
2249 static int insert_block_group_item(struct btrfs_trans_handle *trans,
2250                                    struct btrfs_block_group *block_group)
2251 {
2252         struct btrfs_fs_info *fs_info = trans->fs_info;
2253         struct btrfs_block_group_item bgi;
2254         struct btrfs_root *root;
2255         struct btrfs_key key;
2256
2257         spin_lock(&block_group->lock);
2258         btrfs_set_stack_block_group_used(&bgi, block_group->used);
2259         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2260                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2261         btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2262         key.objectid = block_group->start;
2263         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2264         key.offset = block_group->length;
2265         spin_unlock(&block_group->lock);
2266
2267         root = fs_info->extent_root;
2268         return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2269 }
2270
2271 static int insert_dev_extent(struct btrfs_trans_handle *trans,
2272                             struct btrfs_device *device, u64 chunk_offset,
2273                             u64 start, u64 num_bytes)
2274 {
2275         struct btrfs_fs_info *fs_info = device->fs_info;
2276         struct btrfs_root *root = fs_info->dev_root;
2277         struct btrfs_path *path;
2278         struct btrfs_dev_extent *extent;
2279         struct extent_buffer *leaf;
2280         struct btrfs_key key;
2281         int ret;
2282
2283         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2284         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2285         path = btrfs_alloc_path();
2286         if (!path)
2287                 return -ENOMEM;
2288
2289         key.objectid = device->devid;
2290         key.type = BTRFS_DEV_EXTENT_KEY;
2291         key.offset = start;
2292         ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2293         if (ret)
2294                 goto out;
2295
2296         leaf = path->nodes[0];
2297         extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2298         btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2299         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2300                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2301         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2302
2303         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2304         btrfs_mark_buffer_dirty(leaf);
2305 out:
2306         btrfs_free_path(path);
2307         return ret;
2308 }
2309
2310 /*
2311  * This function belongs to phase 2.
2312  *
2313  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2314  * phases.
2315  */
2316 static int insert_dev_extents(struct btrfs_trans_handle *trans,
2317                                    u64 chunk_offset, u64 chunk_size)
2318 {
2319         struct btrfs_fs_info *fs_info = trans->fs_info;
2320         struct btrfs_device *device;
2321         struct extent_map *em;
2322         struct map_lookup *map;
2323         u64 dev_offset;
2324         u64 stripe_size;
2325         int i;
2326         int ret = 0;
2327
2328         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2329         if (IS_ERR(em))
2330                 return PTR_ERR(em);
2331
2332         map = em->map_lookup;
2333         stripe_size = em->orig_block_len;
2334
2335         /*
2336          * Take the device list mutex to prevent races with the final phase of
2337          * a device replace operation that replaces the device object associated
2338          * with the map's stripes, because the device object's id can change
2339          * at any time during that final phase of the device replace operation
2340          * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2341          * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2342          * resulting in persisting a device extent item with such ID.
2343          */
2344         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2345         for (i = 0; i < map->num_stripes; i++) {
2346                 device = map->stripes[i].dev;
2347                 dev_offset = map->stripes[i].physical;
2348
2349                 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2350                                        stripe_size);
2351                 if (ret)
2352                         break;
2353         }
2354         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2355
2356         free_extent_map(em);
2357         return ret;
2358 }
2359
2360 /*
2361  * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2362  * chunk allocation.
2363  *
2364  * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2365  * phases.
2366  */
2367 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2368 {
2369         struct btrfs_fs_info *fs_info = trans->fs_info;
2370         struct btrfs_block_group *block_group;
2371         int ret = 0;
2372
2373         while (!list_empty(&trans->new_bgs)) {
2374                 int index;
2375
2376                 block_group = list_first_entry(&trans->new_bgs,
2377                                                struct btrfs_block_group,
2378                                                bg_list);
2379                 if (ret)
2380                         goto next;
2381
2382                 index = btrfs_bg_flags_to_raid_index(block_group->flags);
2383
2384                 ret = insert_block_group_item(trans, block_group);
2385                 if (ret)
2386                         btrfs_abort_transaction(trans, ret);
2387                 if (!block_group->chunk_item_inserted) {
2388                         mutex_lock(&fs_info->chunk_mutex);
2389                         ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2390                         mutex_unlock(&fs_info->chunk_mutex);
2391                         if (ret)
2392                                 btrfs_abort_transaction(trans, ret);
2393                 }
2394                 ret = insert_dev_extents(trans, block_group->start,
2395                                          block_group->length);
2396                 if (ret)
2397                         btrfs_abort_transaction(trans, ret);
2398                 add_block_group_free_space(trans, block_group);
2399
2400                 /*
2401                  * If we restriped during balance, we may have added a new raid
2402                  * type, so now add the sysfs entries when it is safe to do so.
2403                  * We don't have to worry about locking here as it's handled in
2404                  * btrfs_sysfs_add_block_group_type.
2405                  */
2406                 if (block_group->space_info->block_group_kobjs[index] == NULL)
2407                         btrfs_sysfs_add_block_group_type(block_group);
2408
2409                 /* Already aborted the transaction if it failed. */
2410 next:
2411                 btrfs_delayed_refs_rsv_release(fs_info, 1);
2412                 list_del_init(&block_group->bg_list);
2413         }
2414         btrfs_trans_release_chunk_metadata(trans);
2415 }
2416
2417 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2418                                                  u64 bytes_used, u64 type,
2419                                                  u64 chunk_offset, u64 size)
2420 {
2421         struct btrfs_fs_info *fs_info = trans->fs_info;
2422         struct btrfs_block_group *cache;
2423         int ret;
2424
2425         btrfs_set_log_full_commit(trans);
2426
2427         cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2428         if (!cache)
2429                 return ERR_PTR(-ENOMEM);
2430
2431         cache->length = size;
2432         set_free_space_tree_thresholds(cache);
2433         cache->used = bytes_used;
2434         cache->flags = type;
2435         cache->last_byte_to_unpin = (u64)-1;
2436         cache->cached = BTRFS_CACHE_FINISHED;
2437         if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2438                 cache->needs_free_space = 1;
2439
2440         ret = btrfs_load_block_group_zone_info(cache, true);
2441         if (ret) {
2442                 btrfs_put_block_group(cache);
2443                 return ERR_PTR(ret);
2444         }
2445
2446         ret = exclude_super_stripes(cache);
2447         if (ret) {
2448                 /* We may have excluded something, so call this just in case */
2449                 btrfs_free_excluded_extents(cache);
2450                 btrfs_put_block_group(cache);
2451                 return ERR_PTR(ret);
2452         }
2453
2454         add_new_free_space(cache, chunk_offset, chunk_offset + size);
2455
2456         btrfs_free_excluded_extents(cache);
2457
2458 #ifdef CONFIG_BTRFS_DEBUG
2459         if (btrfs_should_fragment_free_space(cache)) {
2460                 u64 new_bytes_used = size - bytes_used;
2461
2462                 bytes_used += new_bytes_used >> 1;
2463                 fragment_free_space(cache);
2464         }
2465 #endif
2466         /*
2467          * Ensure the corresponding space_info object is created and
2468          * assigned to our block group. We want our bg to be added to the rbtree
2469          * with its ->space_info set.
2470          */
2471         cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2472         ASSERT(cache->space_info);
2473
2474         ret = btrfs_add_block_group_cache(fs_info, cache);
2475         if (ret) {
2476                 btrfs_remove_free_space_cache(cache);
2477                 btrfs_put_block_group(cache);
2478                 return ERR_PTR(ret);
2479         }
2480
2481         /*
2482          * Now that our block group has its ->space_info set and is inserted in
2483          * the rbtree, update the space info's counters.
2484          */
2485         trace_btrfs_add_block_group(fs_info, cache, 1);
2486         btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
2487                                 cache->bytes_super, 0, &cache->space_info);
2488         btrfs_update_global_block_rsv(fs_info);
2489
2490         link_block_group(cache);
2491
2492         list_add_tail(&cache->bg_list, &trans->new_bgs);
2493         trans->delayed_ref_updates++;
2494         btrfs_update_delayed_refs_rsv(trans);
2495
2496         set_avail_alloc_bits(fs_info, type);
2497         return cache;
2498 }
2499
2500 /*
2501  * Mark one block group RO, can be called several times for the same block
2502  * group.
2503  *
2504  * @cache:              the destination block group
2505  * @do_chunk_alloc:     whether need to do chunk pre-allocation, this is to
2506  *                      ensure we still have some free space after marking this
2507  *                      block group RO.
2508  */
2509 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2510                              bool do_chunk_alloc)
2511 {
2512         struct btrfs_fs_info *fs_info = cache->fs_info;
2513         struct btrfs_trans_handle *trans;
2514         u64 alloc_flags;
2515         int ret;
2516         bool dirty_bg_running;
2517
2518         /*
2519          * This can only happen when we are doing read-only scrub on read-only
2520          * mount.
2521          * In that case we should not start a new transaction on read-only fs.
2522          * Thus here we skip all chunk allocations.
2523          */
2524         if (sb_rdonly(fs_info->sb)) {
2525                 mutex_lock(&fs_info->ro_block_group_mutex);
2526                 ret = inc_block_group_ro(cache, 0);
2527                 mutex_unlock(&fs_info->ro_block_group_mutex);
2528                 return ret;
2529         }
2530
2531         do {
2532                 trans = btrfs_join_transaction(fs_info->extent_root);
2533                 if (IS_ERR(trans))
2534                         return PTR_ERR(trans);
2535
2536                 dirty_bg_running = false;
2537
2538                 /*
2539                  * We're not allowed to set block groups readonly after the dirty
2540                  * block group cache has started writing.  If it already started,
2541                  * back off and let this transaction commit.
2542                  */
2543                 mutex_lock(&fs_info->ro_block_group_mutex);
2544                 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2545                         u64 transid = trans->transid;
2546
2547                         mutex_unlock(&fs_info->ro_block_group_mutex);
2548                         btrfs_end_transaction(trans);
2549
2550                         ret = btrfs_wait_for_commit(fs_info, transid);
2551                         if (ret)
2552                                 return ret;
2553                         dirty_bg_running = true;
2554                 }
2555         } while (dirty_bg_running);
2556
2557         if (do_chunk_alloc) {
2558                 /*
2559                  * If we are changing raid levels, try to allocate a
2560                  * corresponding block group with the new raid level.
2561                  */
2562                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2563                 if (alloc_flags != cache->flags) {
2564                         ret = btrfs_chunk_alloc(trans, alloc_flags,
2565                                                 CHUNK_ALLOC_FORCE);
2566                         /*
2567                          * ENOSPC is allowed here, we may have enough space
2568                          * already allocated at the new raid level to carry on
2569                          */
2570                         if (ret == -ENOSPC)
2571                                 ret = 0;
2572                         if (ret < 0)
2573                                 goto out;
2574                 }
2575         }
2576
2577         ret = inc_block_group_ro(cache, 0);
2578         if (!do_chunk_alloc || ret == -ETXTBSY)
2579                 goto unlock_out;
2580         if (!ret)
2581                 goto out;
2582         alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2583         ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2584         if (ret < 0)
2585                 goto out;
2586         ret = inc_block_group_ro(cache, 0);
2587         if (ret == -ETXTBSY)
2588                 goto unlock_out;
2589 out:
2590         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2591                 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
2592                 mutex_lock(&fs_info->chunk_mutex);
2593                 check_system_chunk(trans, alloc_flags);
2594                 mutex_unlock(&fs_info->chunk_mutex);
2595         }
2596 unlock_out:
2597         mutex_unlock(&fs_info->ro_block_group_mutex);
2598
2599         btrfs_end_transaction(trans);
2600         return ret;
2601 }
2602
2603 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
2604 {
2605         struct btrfs_space_info *sinfo = cache->space_info;
2606         u64 num_bytes;
2607
2608         BUG_ON(!cache->ro);
2609
2610         spin_lock(&sinfo->lock);
2611         spin_lock(&cache->lock);
2612         if (!--cache->ro) {
2613                 if (btrfs_is_zoned(cache->fs_info)) {
2614                         /* Migrate zone_unusable bytes back */
2615                         cache->zone_unusable = cache->alloc_offset - cache->used;
2616                         sinfo->bytes_zone_unusable += cache->zone_unusable;
2617                         sinfo->bytes_readonly -= cache->zone_unusable;
2618                 }
2619                 num_bytes = cache->length - cache->reserved -
2620                             cache->pinned - cache->bytes_super -
2621                             cache->zone_unusable - cache->used;
2622                 sinfo->bytes_readonly -= num_bytes;
2623                 list_del_init(&cache->ro_list);
2624         }
2625         spin_unlock(&cache->lock);
2626         spin_unlock(&sinfo->lock);
2627 }
2628
2629 static int update_block_group_item(struct btrfs_trans_handle *trans,
2630                                    struct btrfs_path *path,
2631                                    struct btrfs_block_group *cache)
2632 {
2633         struct btrfs_fs_info *fs_info = trans->fs_info;
2634         int ret;
2635         struct btrfs_root *root = fs_info->extent_root;
2636         unsigned long bi;
2637         struct extent_buffer *leaf;
2638         struct btrfs_block_group_item bgi;
2639         struct btrfs_key key;
2640
2641         key.objectid = cache->start;
2642         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2643         key.offset = cache->length;
2644
2645         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2646         if (ret) {
2647                 if (ret > 0)
2648                         ret = -ENOENT;
2649                 goto fail;
2650         }
2651
2652         leaf = path->nodes[0];
2653         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2654         btrfs_set_stack_block_group_used(&bgi, cache->used);
2655         btrfs_set_stack_block_group_chunk_objectid(&bgi,
2656                         BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2657         btrfs_set_stack_block_group_flags(&bgi, cache->flags);
2658         write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
2659         btrfs_mark_buffer_dirty(leaf);
2660 fail:
2661         btrfs_release_path(path);
2662         return ret;
2663
2664 }
2665
2666 static int cache_save_setup(struct btrfs_block_group *block_group,
2667                             struct btrfs_trans_handle *trans,
2668                             struct btrfs_path *path)
2669 {
2670         struct btrfs_fs_info *fs_info = block_group->fs_info;
2671         struct btrfs_root *root = fs_info->tree_root;
2672         struct inode *inode = NULL;
2673         struct extent_changeset *data_reserved = NULL;
2674         u64 alloc_hint = 0;
2675         int dcs = BTRFS_DC_ERROR;
2676         u64 cache_size = 0;
2677         int retries = 0;
2678         int ret = 0;
2679
2680         if (!btrfs_test_opt(fs_info, SPACE_CACHE))
2681                 return 0;
2682
2683         /*
2684          * If this block group is smaller than 100 megs don't bother caching the
2685          * block group.
2686          */
2687         if (block_group->length < (100 * SZ_1M)) {
2688                 spin_lock(&block_group->lock);
2689                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2690                 spin_unlock(&block_group->lock);
2691                 return 0;
2692         }
2693
2694         if (TRANS_ABORTED(trans))
2695                 return 0;
2696 again:
2697         inode = lookup_free_space_inode(block_group, path);
2698         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2699                 ret = PTR_ERR(inode);
2700                 btrfs_release_path(path);
2701                 goto out;
2702         }
2703
2704         if (IS_ERR(inode)) {
2705                 BUG_ON(retries);
2706                 retries++;
2707
2708                 if (block_group->ro)
2709                         goto out_free;
2710
2711                 ret = create_free_space_inode(trans, block_group, path);
2712                 if (ret)
2713                         goto out_free;
2714                 goto again;
2715         }
2716
2717         /*
2718          * We want to set the generation to 0, that way if anything goes wrong
2719          * from here on out we know not to trust this cache when we load up next
2720          * time.
2721          */
2722         BTRFS_I(inode)->generation = 0;
2723         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2724         if (ret) {
2725                 /*
2726                  * So theoretically we could recover from this, simply set the
2727                  * super cache generation to 0 so we know to invalidate the
2728                  * cache, but then we'd have to keep track of the block groups
2729                  * that fail this way so we know we _have_ to reset this cache
2730                  * before the next commit or risk reading stale cache.  So to
2731                  * limit our exposure to horrible edge cases lets just abort the
2732                  * transaction, this only happens in really bad situations
2733                  * anyway.
2734                  */
2735                 btrfs_abort_transaction(trans, ret);
2736                 goto out_put;
2737         }
2738         WARN_ON(ret);
2739
2740         /* We've already setup this transaction, go ahead and exit */
2741         if (block_group->cache_generation == trans->transid &&
2742             i_size_read(inode)) {
2743                 dcs = BTRFS_DC_SETUP;
2744                 goto out_put;
2745         }
2746
2747         if (i_size_read(inode) > 0) {
2748                 ret = btrfs_check_trunc_cache_free_space(fs_info,
2749                                         &fs_info->global_block_rsv);
2750                 if (ret)
2751                         goto out_put;
2752
2753                 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2754                 if (ret)
2755                         goto out_put;
2756         }
2757
2758         spin_lock(&block_group->lock);
2759         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2760             !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2761                 /*
2762                  * don't bother trying to write stuff out _if_
2763                  * a) we're not cached,
2764                  * b) we're with nospace_cache mount option,
2765                  * c) we're with v2 space_cache (FREE_SPACE_TREE).
2766                  */
2767                 dcs = BTRFS_DC_WRITTEN;
2768                 spin_unlock(&block_group->lock);
2769                 goto out_put;
2770         }
2771         spin_unlock(&block_group->lock);
2772
2773         /*
2774          * We hit an ENOSPC when setting up the cache in this transaction, just
2775          * skip doing the setup, we've already cleared the cache so we're safe.
2776          */
2777         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2778                 ret = -ENOSPC;
2779                 goto out_put;
2780         }
2781
2782         /*
2783          * Try to preallocate enough space based on how big the block group is.
2784          * Keep in mind this has to include any pinned space which could end up
2785          * taking up quite a bit since it's not folded into the other space
2786          * cache.
2787          */
2788         cache_size = div_u64(block_group->length, SZ_256M);
2789         if (!cache_size)
2790                 cache_size = 1;
2791
2792         cache_size *= 16;
2793         cache_size *= fs_info->sectorsize;
2794
2795         ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
2796                                           cache_size);
2797         if (ret)
2798                 goto out_put;
2799
2800         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
2801                                               cache_size, cache_size,
2802                                               &alloc_hint);
2803         /*
2804          * Our cache requires contiguous chunks so that we don't modify a bunch
2805          * of metadata or split extents when writing the cache out, which means
2806          * we can enospc if we are heavily fragmented in addition to just normal
2807          * out of space conditions.  So if we hit this just skip setting up any
2808          * other block groups for this transaction, maybe we'll unpin enough
2809          * space the next time around.
2810          */
2811         if (!ret)
2812                 dcs = BTRFS_DC_SETUP;
2813         else if (ret == -ENOSPC)
2814                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2815
2816 out_put:
2817         iput(inode);
2818 out_free:
2819         btrfs_release_path(path);
2820 out:
2821         spin_lock(&block_group->lock);
2822         if (!ret && dcs == BTRFS_DC_SETUP)
2823                 block_group->cache_generation = trans->transid;
2824         block_group->disk_cache_state = dcs;
2825         spin_unlock(&block_group->lock);
2826
2827         extent_changeset_free(data_reserved);
2828         return ret;
2829 }
2830
2831 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2832 {
2833         struct btrfs_fs_info *fs_info = trans->fs_info;
2834         struct btrfs_block_group *cache, *tmp;
2835         struct btrfs_transaction *cur_trans = trans->transaction;
2836         struct btrfs_path *path;
2837
2838         if (list_empty(&cur_trans->dirty_bgs) ||
2839             !btrfs_test_opt(fs_info, SPACE_CACHE))
2840                 return 0;
2841
2842         path = btrfs_alloc_path();
2843         if (!path)
2844                 return -ENOMEM;
2845
2846         /* Could add new block groups, use _safe just in case */
2847         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2848                                  dirty_list) {
2849                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2850                         cache_save_setup(cache, trans, path);
2851         }
2852
2853         btrfs_free_path(path);
2854         return 0;
2855 }
2856
2857 /*
2858  * Transaction commit does final block group cache writeback during a critical
2859  * section where nothing is allowed to change the FS.  This is required in
2860  * order for the cache to actually match the block group, but can introduce a
2861  * lot of latency into the commit.
2862  *
2863  * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2864  * There's a chance we'll have to redo some of it if the block group changes
2865  * again during the commit, but it greatly reduces the commit latency by
2866  * getting rid of the easy block groups while we're still allowing others to
2867  * join the commit.
2868  */
2869 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2870 {
2871         struct btrfs_fs_info *fs_info = trans->fs_info;
2872         struct btrfs_block_group *cache;
2873         struct btrfs_transaction *cur_trans = trans->transaction;
2874         int ret = 0;
2875         int should_put;
2876         struct btrfs_path *path = NULL;
2877         LIST_HEAD(dirty);
2878         struct list_head *io = &cur_trans->io_bgs;
2879         int loops = 0;
2880
2881         spin_lock(&cur_trans->dirty_bgs_lock);
2882         if (list_empty(&cur_trans->dirty_bgs)) {
2883                 spin_unlock(&cur_trans->dirty_bgs_lock);
2884                 return 0;
2885         }
2886         list_splice_init(&cur_trans->dirty_bgs, &dirty);
2887         spin_unlock(&cur_trans->dirty_bgs_lock);
2888
2889 again:
2890         /* Make sure all the block groups on our dirty list actually exist */
2891         btrfs_create_pending_block_groups(trans);
2892
2893         if (!path) {
2894                 path = btrfs_alloc_path();
2895                 if (!path) {
2896                         ret = -ENOMEM;
2897                         goto out;
2898                 }
2899         }
2900
2901         /*
2902          * cache_write_mutex is here only to save us from balance or automatic
2903          * removal of empty block groups deleting this block group while we are
2904          * writing out the cache
2905          */
2906         mutex_lock(&trans->transaction->cache_write_mutex);
2907         while (!list_empty(&dirty)) {
2908                 bool drop_reserve = true;
2909
2910                 cache = list_first_entry(&dirty, struct btrfs_block_group,
2911                                          dirty_list);
2912                 /*
2913                  * This can happen if something re-dirties a block group that
2914                  * is already under IO.  Just wait for it to finish and then do
2915                  * it all again
2916                  */
2917                 if (!list_empty(&cache->io_list)) {
2918                         list_del_init(&cache->io_list);
2919                         btrfs_wait_cache_io(trans, cache, path);
2920                         btrfs_put_block_group(cache);
2921                 }
2922
2923
2924                 /*
2925                  * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2926                  * it should update the cache_state.  Don't delete until after
2927                  * we wait.
2928                  *
2929                  * Since we're not running in the commit critical section
2930                  * we need the dirty_bgs_lock to protect from update_block_group
2931                  */
2932                 spin_lock(&cur_trans->dirty_bgs_lock);
2933                 list_del_init(&cache->dirty_list);
2934                 spin_unlock(&cur_trans->dirty_bgs_lock);
2935
2936                 should_put = 1;
2937
2938                 cache_save_setup(cache, trans, path);
2939
2940                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2941                         cache->io_ctl.inode = NULL;
2942                         ret = btrfs_write_out_cache(trans, cache, path);
2943                         if (ret == 0 && cache->io_ctl.inode) {
2944                                 should_put = 0;
2945
2946                                 /*
2947                                  * The cache_write_mutex is protecting the
2948                                  * io_list, also refer to the definition of
2949                                  * btrfs_transaction::io_bgs for more details
2950                                  */
2951                                 list_add_tail(&cache->io_list, io);
2952                         } else {
2953                                 /*
2954                                  * If we failed to write the cache, the
2955                                  * generation will be bad and life goes on
2956                                  */
2957                                 ret = 0;
2958                         }
2959                 }
2960                 if (!ret) {
2961                         ret = update_block_group_item(trans, path, cache);
2962                         /*
2963                          * Our block group might still be attached to the list
2964                          * of new block groups in the transaction handle of some
2965                          * other task (struct btrfs_trans_handle->new_bgs). This
2966                          * means its block group item isn't yet in the extent
2967                          * tree. If this happens ignore the error, as we will
2968                          * try again later in the critical section of the
2969                          * transaction commit.
2970                          */
2971                         if (ret == -ENOENT) {
2972                                 ret = 0;
2973                                 spin_lock(&cur_trans->dirty_bgs_lock);
2974                                 if (list_empty(&cache->dirty_list)) {
2975                                         list_add_tail(&cache->dirty_list,
2976                                                       &cur_trans->dirty_bgs);
2977                                         btrfs_get_block_group(cache);
2978                                         drop_reserve = false;
2979                                 }
2980                                 spin_unlock(&cur_trans->dirty_bgs_lock);
2981                         } else if (ret) {
2982                                 btrfs_abort_transaction(trans, ret);
2983                         }
2984                 }
2985
2986                 /* If it's not on the io list, we need to put the block group */
2987                 if (should_put)
2988                         btrfs_put_block_group(cache);
2989                 if (drop_reserve)
2990                         btrfs_delayed_refs_rsv_release(fs_info, 1);
2991                 /*
2992                  * Avoid blocking other tasks for too long. It might even save
2993                  * us from writing caches for block groups that are going to be
2994                  * removed.
2995                  */
2996                 mutex_unlock(&trans->transaction->cache_write_mutex);
2997                 if (ret)
2998                         goto out;
2999                 mutex_lock(&trans->transaction->cache_write_mutex);
3000         }
3001         mutex_unlock(&trans->transaction->cache_write_mutex);
3002
3003         /*
3004          * Go through delayed refs for all the stuff we've just kicked off
3005          * and then loop back (just once)
3006          */
3007         if (!ret)
3008                 ret = btrfs_run_delayed_refs(trans, 0);
3009         if (!ret && loops == 0) {
3010                 loops++;
3011                 spin_lock(&cur_trans->dirty_bgs_lock);
3012                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3013                 /*
3014                  * dirty_bgs_lock protects us from concurrent block group
3015                  * deletes too (not just cache_write_mutex).
3016                  */
3017                 if (!list_empty(&dirty)) {
3018                         spin_unlock(&cur_trans->dirty_bgs_lock);
3019                         goto again;
3020                 }
3021                 spin_unlock(&cur_trans->dirty_bgs_lock);
3022         }
3023 out:
3024         if (ret < 0) {
3025                 spin_lock(&cur_trans->dirty_bgs_lock);
3026                 list_splice_init(&dirty, &cur_trans->dirty_bgs);
3027                 spin_unlock(&cur_trans->dirty_bgs_lock);
3028                 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3029         }
3030
3031         btrfs_free_path(path);
3032         return ret;
3033 }
3034
3035 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3036 {
3037         struct btrfs_fs_info *fs_info = trans->fs_info;
3038         struct btrfs_block_group *cache;
3039         struct btrfs_transaction *cur_trans = trans->transaction;
3040         int ret = 0;
3041         int should_put;
3042         struct btrfs_path *path;
3043         struct list_head *io = &cur_trans->io_bgs;
3044
3045         path = btrfs_alloc_path();
3046         if (!path)
3047                 return -ENOMEM;
3048
3049         /*
3050          * Even though we are in the critical section of the transaction commit,
3051          * we can still have concurrent tasks adding elements to this
3052          * transaction's list of dirty block groups. These tasks correspond to
3053          * endio free space workers started when writeback finishes for a
3054          * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3055          * allocate new block groups as a result of COWing nodes of the root
3056          * tree when updating the free space inode. The writeback for the space
3057          * caches is triggered by an earlier call to
3058          * btrfs_start_dirty_block_groups() and iterations of the following
3059          * loop.
3060          * Also we want to do the cache_save_setup first and then run the
3061          * delayed refs to make sure we have the best chance at doing this all
3062          * in one shot.
3063          */
3064         spin_lock(&cur_trans->dirty_bgs_lock);
3065         while (!list_empty(&cur_trans->dirty_bgs)) {
3066                 cache = list_first_entry(&cur_trans->dirty_bgs,
3067                                          struct btrfs_block_group,
3068                                          dirty_list);
3069
3070                 /*
3071                  * This can happen if cache_save_setup re-dirties a block group
3072                  * that is already under IO.  Just wait for it to finish and
3073                  * then do it all again
3074                  */
3075                 if (!list_empty(&cache->io_list)) {
3076                         spin_unlock(&cur_trans->dirty_bgs_lock);
3077                         list_del_init(&cache->io_list);
3078                         btrfs_wait_cache_io(trans, cache, path);
3079                         btrfs_put_block_group(cache);
3080                         spin_lock(&cur_trans->dirty_bgs_lock);
3081                 }
3082
3083                 /*
3084                  * Don't remove from the dirty list until after we've waited on
3085                  * any pending IO
3086                  */
3087                 list_del_init(&cache->dirty_list);
3088                 spin_unlock(&cur_trans->dirty_bgs_lock);
3089                 should_put = 1;
3090
3091                 cache_save_setup(cache, trans, path);
3092
3093                 if (!ret)
3094                         ret = btrfs_run_delayed_refs(trans,
3095                                                      (unsigned long) -1);
3096
3097                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3098                         cache->io_ctl.inode = NULL;
3099                         ret = btrfs_write_out_cache(trans, cache, path);
3100                         if (ret == 0 && cache->io_ctl.inode) {
3101                                 should_put = 0;
3102                                 list_add_tail(&cache->io_list, io);
3103                         } else {
3104                                 /*
3105                                  * If we failed to write the cache, the
3106                                  * generation will be bad and life goes on
3107                                  */
3108                                 ret = 0;
3109                         }
3110                 }
3111                 if (!ret) {
3112                         ret = update_block_group_item(trans, path, cache);
3113                         /*
3114                          * One of the free space endio workers might have
3115                          * created a new block group while updating a free space
3116                          * cache's inode (at inode.c:btrfs_finish_ordered_io())
3117                          * and hasn't released its transaction handle yet, in
3118                          * which case the new block group is still attached to
3119                          * its transaction handle and its creation has not
3120                          * finished yet (no block group item in the extent tree
3121                          * yet, etc). If this is the case, wait for all free
3122                          * space endio workers to finish and retry. This is a
3123                          * very rare case so no need for a more efficient and
3124                          * complex approach.
3125                          */
3126                         if (ret == -ENOENT) {
3127                                 wait_event(cur_trans->writer_wait,
3128                                    atomic_read(&cur_trans->num_writers) == 1);
3129                                 ret = update_block_group_item(trans, path, cache);
3130                         }
3131                         if (ret)
3132                                 btrfs_abort_transaction(trans, ret);
3133                 }
3134
3135                 /* If its not on the io list, we need to put the block group */
3136                 if (should_put)
3137                         btrfs_put_block_group(cache);
3138                 btrfs_delayed_refs_rsv_release(fs_info, 1);
3139                 spin_lock(&cur_trans->dirty_bgs_lock);
3140         }
3141         spin_unlock(&cur_trans->dirty_bgs_lock);
3142
3143         /*
3144          * Refer to the definition of io_bgs member for details why it's safe
3145          * to use it without any locking
3146          */
3147         while (!list_empty(io)) {
3148                 cache = list_first_entry(io, struct btrfs_block_group,
3149                                          io_list);
3150                 list_del_init(&cache->io_list);
3151                 btrfs_wait_cache_io(trans, cache, path);
3152                 btrfs_put_block_group(cache);
3153         }
3154
3155         btrfs_free_path(path);
3156         return ret;
3157 }
3158
3159 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3160                              u64 bytenr, u64 num_bytes, int alloc)
3161 {
3162         struct btrfs_fs_info *info = trans->fs_info;
3163         struct btrfs_block_group *cache = NULL;
3164         u64 total = num_bytes;
3165         u64 old_val;
3166         u64 byte_in_group;
3167         int factor;
3168         int ret = 0;
3169
3170         /* Block accounting for super block */
3171         spin_lock(&info->delalloc_root_lock);
3172         old_val = btrfs_super_bytes_used(info->super_copy);
3173         if (alloc)
3174                 old_val += num_bytes;
3175         else
3176                 old_val -= num_bytes;
3177         btrfs_set_super_bytes_used(info->super_copy, old_val);
3178         spin_unlock(&info->delalloc_root_lock);
3179
3180         while (total) {
3181                 cache = btrfs_lookup_block_group(info, bytenr);
3182                 if (!cache) {
3183                         ret = -ENOENT;
3184                         break;
3185                 }
3186                 factor = btrfs_bg_type_to_factor(cache->flags);
3187
3188                 /*
3189                  * If this block group has free space cache written out, we
3190                  * need to make sure to load it if we are removing space.  This
3191                  * is because we need the unpinning stage to actually add the
3192                  * space back to the block group, otherwise we will leak space.
3193                  */
3194                 if (!alloc && !btrfs_block_group_done(cache))
3195                         btrfs_cache_block_group(cache, true);
3196
3197                 byte_in_group = bytenr - cache->start;
3198                 WARN_ON(byte_in_group > cache->length);
3199
3200                 spin_lock(&cache->space_info->lock);
3201                 spin_lock(&cache->lock);
3202
3203                 if (btrfs_test_opt(info, SPACE_CACHE) &&
3204                     cache->disk_cache_state < BTRFS_DC_CLEAR)
3205                         cache->disk_cache_state = BTRFS_DC_CLEAR;
3206
3207                 old_val = cache->used;
3208                 num_bytes = min(total, cache->length - byte_in_group);
3209                 if (alloc) {
3210                         old_val += num_bytes;
3211                         cache->used = old_val;
3212                         cache->reserved -= num_bytes;
3213                         cache->space_info->bytes_reserved -= num_bytes;
3214                         cache->space_info->bytes_used += num_bytes;
3215                         cache->space_info->disk_used += num_bytes * factor;
3216                         spin_unlock(&cache->lock);
3217                         spin_unlock(&cache->space_info->lock);
3218                 } else {
3219                         old_val -= num_bytes;
3220                         cache->used = old_val;
3221                         cache->pinned += num_bytes;
3222                         btrfs_space_info_update_bytes_pinned(info,
3223                                         cache->space_info, num_bytes);
3224                         cache->space_info->bytes_used -= num_bytes;
3225                         cache->space_info->disk_used -= num_bytes * factor;
3226                         spin_unlock(&cache->lock);
3227                         spin_unlock(&cache->space_info->lock);
3228
3229                         set_extent_dirty(&trans->transaction->pinned_extents,
3230                                          bytenr, bytenr + num_bytes - 1,
3231                                          GFP_NOFS | __GFP_NOFAIL);
3232                 }
3233
3234                 spin_lock(&trans->transaction->dirty_bgs_lock);
3235                 if (list_empty(&cache->dirty_list)) {
3236                         list_add_tail(&cache->dirty_list,
3237                                       &trans->transaction->dirty_bgs);
3238                         trans->delayed_ref_updates++;
3239                         btrfs_get_block_group(cache);
3240                 }
3241                 spin_unlock(&trans->transaction->dirty_bgs_lock);
3242
3243                 /*
3244                  * No longer have used bytes in this block group, queue it for
3245                  * deletion. We do this after adding the block group to the
3246                  * dirty list to avoid races between cleaner kthread and space
3247                  * cache writeout.
3248                  */
3249                 if (!alloc && old_val == 0) {
3250                         if (!btrfs_test_opt(info, DISCARD_ASYNC))
3251                                 btrfs_mark_bg_unused(cache);
3252                 }
3253
3254                 btrfs_put_block_group(cache);
3255                 total -= num_bytes;
3256                 bytenr += num_bytes;
3257         }
3258
3259         /* Modified block groups are accounted for in the delayed_refs_rsv. */
3260         btrfs_update_delayed_refs_rsv(trans);
3261         return ret;
3262 }
3263
3264 /**
3265  * btrfs_add_reserved_bytes - update the block_group and space info counters
3266  * @cache:      The cache we are manipulating
3267  * @ram_bytes:  The number of bytes of file content, and will be same to
3268  *              @num_bytes except for the compress path.
3269  * @num_bytes:  The number of bytes in question
3270  * @delalloc:   The blocks are allocated for the delalloc write
3271  *
3272  * This is called by the allocator when it reserves space. If this is a
3273  * reservation and the block group has become read only we cannot make the
3274  * reservation and return -EAGAIN, otherwise this function always succeeds.
3275  */
3276 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3277                              u64 ram_bytes, u64 num_bytes, int delalloc)
3278 {
3279         struct btrfs_space_info *space_info = cache->space_info;
3280         int ret = 0;
3281
3282         spin_lock(&space_info->lock);
3283         spin_lock(&cache->lock);
3284         if (cache->ro) {
3285                 ret = -EAGAIN;
3286         } else {
3287                 cache->reserved += num_bytes;
3288                 space_info->bytes_reserved += num_bytes;
3289                 trace_btrfs_space_reservation(cache->fs_info, "space_info",
3290                                               space_info->flags, num_bytes, 1);
3291                 btrfs_space_info_update_bytes_may_use(cache->fs_info,
3292                                                       space_info, -ram_bytes);
3293                 if (delalloc)
3294                         cache->delalloc_bytes += num_bytes;
3295
3296                 /*
3297                  * Compression can use less space than we reserved, so wake
3298                  * tickets if that happens
3299                  */
3300                 if (num_bytes < ram_bytes)
3301                         btrfs_try_granting_tickets(cache->fs_info, space_info);
3302         }
3303         spin_unlock(&cache->lock);
3304         spin_unlock(&space_info->lock);
3305         return ret;
3306 }
3307
3308 /**
3309  * btrfs_free_reserved_bytes - update the block_group and space info counters
3310  * @cache:      The cache we are manipulating
3311  * @num_bytes:  The number of bytes in question
3312  * @delalloc:   The blocks are allocated for the delalloc write
3313  *
3314  * This is called by somebody who is freeing space that was never actually used
3315  * on disk.  For example if you reserve some space for a new leaf in transaction
3316  * A and before transaction A commits you free that leaf, you call this with
3317  * reserve set to 0 in order to clear the reservation.
3318  */
3319 void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3320                                u64 num_bytes, int delalloc)
3321 {
3322         struct btrfs_space_info *space_info = cache->space_info;
3323
3324         spin_lock(&space_info->lock);
3325         spin_lock(&cache->lock);
3326         if (cache->ro)
3327                 space_info->bytes_readonly += num_bytes;
3328         cache->reserved -= num_bytes;
3329         space_info->bytes_reserved -= num_bytes;
3330         space_info->max_extent_size = 0;
3331
3332         if (delalloc)
3333                 cache->delalloc_bytes -= num_bytes;
3334         spin_unlock(&cache->lock);
3335
3336         btrfs_try_granting_tickets(cache->fs_info, space_info);
3337         spin_unlock(&space_info->lock);
3338 }
3339
3340 static void force_metadata_allocation(struct btrfs_fs_info *info)
3341 {
3342         struct list_head *head = &info->space_info;
3343         struct btrfs_space_info *found;
3344
3345         list_for_each_entry(found, head, list) {
3346                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3347                         found->force_alloc = CHUNK_ALLOC_FORCE;
3348         }
3349 }
3350
3351 static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3352                               struct btrfs_space_info *sinfo, int force)
3353 {
3354         u64 bytes_used = btrfs_space_info_used(sinfo, false);
3355         u64 thresh;
3356
3357         if (force == CHUNK_ALLOC_FORCE)
3358                 return 1;
3359
3360         /*
3361          * in limited mode, we want to have some free space up to
3362          * about 1% of the FS size.
3363          */
3364         if (force == CHUNK_ALLOC_LIMITED) {
3365                 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3366                 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3367
3368                 if (sinfo->total_bytes - bytes_used < thresh)
3369                         return 1;
3370         }
3371
3372         if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3373                 return 0;
3374         return 1;
3375 }
3376
3377 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3378 {
3379         u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3380
3381         return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3382 }
3383
3384 static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
3385 {
3386         struct btrfs_block_group *bg;
3387         int ret;
3388
3389         /*
3390          * Check if we have enough space in the system space info because we
3391          * will need to update device items in the chunk btree and insert a new
3392          * chunk item in the chunk btree as well. This will allocate a new
3393          * system block group if needed.
3394          */
3395         check_system_chunk(trans, flags);
3396
3397         bg = btrfs_create_chunk(trans, flags);
3398         if (IS_ERR(bg)) {
3399                 ret = PTR_ERR(bg);
3400                 goto out;
3401         }
3402
3403         ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3404         /*
3405          * Normally we are not expected to fail with -ENOSPC here, since we have
3406          * previously reserved space in the system space_info and allocated one
3407          * new system chunk if necessary. However there are two exceptions:
3408          *
3409          * 1) We may have enough free space in the system space_info but all the
3410          *    existing system block groups have a profile which can not be used
3411          *    for extent allocation.
3412          *
3413          *    This happens when mounting in degraded mode. For example we have a
3414          *    RAID1 filesystem with 2 devices, lose one device and mount the fs
3415          *    using the other device in degraded mode. If we then allocate a chunk,
3416          *    we may have enough free space in the existing system space_info, but
3417          *    none of the block groups can be used for extent allocation since they
3418          *    have a RAID1 profile, and because we are in degraded mode with a
3419          *    single device, we are forced to allocate a new system chunk with a
3420          *    SINGLE profile. Making check_system_chunk() iterate over all system
3421          *    block groups and check if they have a usable profile and enough space
3422          *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
3423          *    try again after forcing allocation of a new system chunk. Like this
3424          *    we avoid paying the cost of that search in normal circumstances, when
3425          *    we were not mounted in degraded mode;
3426          *
3427          * 2) We had enough free space info the system space_info, and one suitable
3428          *    block group to allocate from when we called check_system_chunk()
3429          *    above. However right after we called it, the only system block group
3430          *    with enough free space got turned into RO mode by a running scrub,
3431          *    and in this case we have to allocate a new one and retry. We only
3432          *    need do this allocate and retry once, since we have a transaction
3433          *    handle and scrub uses the commit root to search for block groups.
3434          */
3435         if (ret == -ENOSPC) {
3436                 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3437                 struct btrfs_block_group *sys_bg;
3438
3439                 sys_bg = btrfs_create_chunk(trans, sys_flags);
3440                 if (IS_ERR(sys_bg)) {
3441                         ret = PTR_ERR(sys_bg);
3442                         btrfs_abort_transaction(trans, ret);
3443                         goto out;
3444                 }
3445
3446                 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3447                 if (ret) {
3448                         btrfs_abort_transaction(trans, ret);
3449                         goto out;
3450                 }
3451
3452                 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3453                 if (ret) {
3454                         btrfs_abort_transaction(trans, ret);
3455                         goto out;
3456                 }
3457         } else if (ret) {
3458                 btrfs_abort_transaction(trans, ret);
3459                 goto out;
3460         }
3461 out:
3462         btrfs_trans_release_chunk_metadata(trans);
3463
3464         return ret;
3465 }
3466
3467 /*
3468  * Chunk allocation is done in 2 phases:
3469  *
3470  * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3471  *    the chunk, the chunk mapping, create its block group and add the items
3472  *    that belong in the chunk btree to it - more specifically, we need to
3473  *    update device items in the chunk btree and add a new chunk item to it.
3474  *
3475  * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3476  *    group item to the extent btree and the device extent items to the devices
3477  *    btree.
3478  *
3479  * This is done to prevent deadlocks. For example when COWing a node from the
3480  * extent btree we are holding a write lock on the node's parent and if we
3481  * trigger chunk allocation and attempt to insert the new block group item
3482  * in the extent btree right away, we could deadlock because the path for the
3483  * insertion can include that parent node. At first glance it seems impossible
3484  * to trigger chunk allocation after starting a transaction since tasks should
3485  * reserve enough transaction units (metadata space), however while that is true
3486  * most of the time, chunk allocation may still be triggered for several reasons:
3487  *
3488  * 1) When reserving metadata, we check if there is enough free space in the
3489  *    metadata space_info and therefore don't trigger allocation of a new chunk.
3490  *    However later when the task actually tries to COW an extent buffer from
3491  *    the extent btree or from the device btree for example, it is forced to
3492  *    allocate a new block group (chunk) because the only one that had enough
3493  *    free space was just turned to RO mode by a running scrub for example (or
3494  *    device replace, block group reclaim thread, etc), so we can not use it
3495  *    for allocating an extent and end up being forced to allocate a new one;
3496  *
3497  * 2) Because we only check that the metadata space_info has enough free bytes,
3498  *    we end up not allocating a new metadata chunk in that case. However if
3499  *    the filesystem was mounted in degraded mode, none of the existing block
3500  *    groups might be suitable for extent allocation due to their incompatible
3501  *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
3502  *    use a RAID1 profile, in degraded mode using a single device). In this case
3503  *    when the task attempts to COW some extent buffer of the extent btree for
3504  *    example, it will trigger allocation of a new metadata block group with a
3505  *    suitable profile (SINGLE profile in the example of the degraded mount of
3506  *    the RAID1 filesystem);
3507  *
3508  * 3) The task has reserved enough transaction units / metadata space, but when
3509  *    it attempts to COW an extent buffer from the extent or device btree for
3510  *    example, it does not find any free extent in any metadata block group,
3511  *    therefore forced to try to allocate a new metadata block group.
3512  *    This is because some other task allocated all available extents in the
3513  *    meanwhile - this typically happens with tasks that don't reserve space
3514  *    properly, either intentionally or as a bug. One example where this is
3515  *    done intentionally is fsync, as it does not reserve any transaction units
3516  *    and ends up allocating a variable number of metadata extents for log
3517  *    tree extent buffers.
3518  *
3519  * We also need this 2 phases setup when adding a device to a filesystem with
3520  * a seed device - we must create new metadata and system chunks without adding
3521  * any of the block group items to the chunk, extent and device btrees. If we
3522  * did not do it this way, we would get ENOSPC when attempting to update those
3523  * btrees, since all the chunks from the seed device are read-only.
3524  *
3525  * Phase 1 does the updates and insertions to the chunk btree because if we had
3526  * it done in phase 2 and have a thundering herd of tasks allocating chunks in
3527  * parallel, we risk having too many system chunks allocated by many tasks if
3528  * many tasks reach phase 1 without the previous ones completing phase 2. In the
3529  * extreme case this leads to exhaustion of the system chunk array in the
3530  * superblock. This is easier to trigger if using a btree node/leaf size of 64K
3531  * and with RAID filesystems (so we have more device items in the chunk btree).
3532  * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
3533  * the system chunk array due to concurrent allocations") provides more details.
3534  *
3535  * Allocation of system chunks does not happen through this function. A task that
3536  * needs to update the chunk btree (the only btree that uses system chunks), must
3537  * preallocate chunk space by calling either check_system_chunk() or
3538  * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
3539  * metadata chunk or when removing a chunk, while the latter is used before doing
3540  * a modification to the chunk btree - use cases for the latter are adding,
3541  * removing and resizing a device as well as relocation of a system chunk.
3542  * See the comment below for more details.
3543  *
3544  * The reservation of system space, done through check_system_chunk(), as well
3545  * as all the updates and insertions into the chunk btree must be done while
3546  * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
3547  * an extent buffer from the chunks btree we never trigger allocation of a new
3548  * system chunk, which would result in a deadlock (trying to lock twice an
3549  * extent buffer of the chunk btree, first time before triggering the chunk
3550  * allocation and the second time during chunk allocation while attempting to
3551  * update the chunks btree). The system chunk array is also updated while holding
3552  * that mutex. The same logic applies to removing chunks - we must reserve system
3553  * space, update the chunk btree and the system chunk array in the superblock
3554  * while holding fs_info->chunk_mutex.
3555  *
3556  * This function, btrfs_chunk_alloc(), belongs to phase 1.
3557  *
3558  * If @force is CHUNK_ALLOC_FORCE:
3559  *    - return 1 if it successfully allocates a chunk,
3560  *    - return errors including -ENOSPC otherwise.
3561  * If @force is NOT CHUNK_ALLOC_FORCE:
3562  *    - return 0 if it doesn't need to allocate a new chunk,
3563  *    - return 1 if it successfully allocates a chunk,
3564  *    - return errors including -ENOSPC otherwise.
3565  */
3566 int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3567                       enum btrfs_chunk_alloc_enum force)
3568 {
3569         struct btrfs_fs_info *fs_info = trans->fs_info;
3570         struct btrfs_space_info *space_info;
3571         bool wait_for_alloc = false;
3572         bool should_alloc = false;
3573         int ret = 0;
3574
3575         /* Don't re-enter if we're already allocating a chunk */
3576         if (trans->allocating_chunk)
3577                 return -ENOSPC;
3578         /*
3579          * Allocation of system chunks can not happen through this path, as we
3580          * could end up in a deadlock if we are allocating a data or metadata
3581          * chunk and there is another task modifying the chunk btree.
3582          *
3583          * This is because while we are holding the chunk mutex, we will attempt
3584          * to add the new chunk item to the chunk btree or update an existing
3585          * device item in the chunk btree, while the other task that is modifying
3586          * the chunk btree is attempting to COW an extent buffer while holding a
3587          * lock on it and on its parent - if the COW operation triggers a system
3588          * chunk allocation, then we can deadlock because we are holding the
3589          * chunk mutex and we may need to access that extent buffer or its parent
3590          * in order to add the chunk item or update a device item.
3591          *
3592          * Tasks that want to modify the chunk tree should reserve system space
3593          * before updating the chunk btree, by calling either
3594          * btrfs_reserve_chunk_metadata() or check_system_chunk().
3595          * It's possible that after a task reserves the space, it still ends up
3596          * here - this happens in the cases described above at do_chunk_alloc().
3597          * The task will have to either retry or fail.
3598          */
3599         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3600                 return -ENOSPC;
3601
3602         space_info = btrfs_find_space_info(fs_info, flags);
3603         ASSERT(space_info);
3604
3605         do {
3606                 spin_lock(&space_info->lock);
3607                 if (force < space_info->force_alloc)
3608                         force = space_info->force_alloc;
3609                 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3610                 if (space_info->full) {
3611                         /* No more free physical space */
3612                         if (should_alloc)
3613                                 ret = -ENOSPC;
3614                         else
3615                                 ret = 0;
3616                         spin_unlock(&space_info->lock);
3617                         return ret;
3618                 } else if (!should_alloc) {
3619                         spin_unlock(&space_info->lock);
3620                         return 0;
3621                 } else if (space_info->chunk_alloc) {
3622                         /*
3623                          * Someone is already allocating, so we need to block
3624                          * until this someone is finished and then loop to
3625                          * recheck if we should continue with our allocation
3626                          * attempt.
3627                          */
3628                         wait_for_alloc = true;
3629                         force = CHUNK_ALLOC_NO_FORCE;
3630                         spin_unlock(&space_info->lock);
3631                         mutex_lock(&fs_info->chunk_mutex);
3632                         mutex_unlock(&fs_info->chunk_mutex);
3633                 } else {
3634                         /* Proceed with allocation */
3635                         space_info->chunk_alloc = 1;
3636                         wait_for_alloc = false;
3637                         spin_unlock(&space_info->lock);
3638                 }
3639
3640                 cond_resched();
3641         } while (wait_for_alloc);
3642
3643         mutex_lock(&fs_info->chunk_mutex);
3644         trans->allocating_chunk = true;
3645
3646         /*
3647          * If we have mixed data/metadata chunks we want to make sure we keep
3648          * allocating mixed chunks instead of individual chunks.
3649          */
3650         if (btrfs_mixed_space_info(space_info))
3651                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3652
3653         /*
3654          * if we're doing a data chunk, go ahead and make sure that
3655          * we keep a reasonable number of metadata chunks allocated in the
3656          * FS as well.
3657          */
3658         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3659                 fs_info->data_chunk_allocations++;
3660                 if (!(fs_info->data_chunk_allocations %
3661                       fs_info->metadata_ratio))
3662                         force_metadata_allocation(fs_info);
3663         }
3664
3665         ret = do_chunk_alloc(trans, flags);
3666         trans->allocating_chunk = false;
3667
3668         spin_lock(&space_info->lock);
3669         if (ret < 0) {
3670                 if (ret == -ENOSPC)
3671                         space_info->full = 1;
3672                 else
3673                         goto out;
3674         } else {
3675                 ret = 1;
3676                 space_info->max_extent_size = 0;
3677         }
3678
3679         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3680 out:
3681         space_info->chunk_alloc = 0;
3682         spin_unlock(&space_info->lock);
3683         mutex_unlock(&fs_info->chunk_mutex);
3684
3685         return ret;
3686 }
3687
3688 /* Device count for @type: the profile's devs_max or, when 0, rw_devices. */
3689 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3690 {
3691         const u64 devs_max =
3692                 btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
3693
3694         if (devs_max)
3695                 return devs_max;
3696         return fs_info->fs_devices->rw_devices;
3697 }
3698
3699 /*
3700  * Reserve @bytes in the chunk block reserve, first trying to create a new
3701  * system chunk when the SYSTEM space info has less than @bytes left.
3702  * @type is only printed in the ENOSPC_DEBUG message.  Failures stay silent.
3703  */
3704 static void reserve_chunk_space(struct btrfs_trans_handle *trans,
3705                                 u64 bytes,
3706                                 u64 type)
3707 {
3708         struct btrfs_fs_info *fs_info = trans->fs_info;
3709         struct btrfs_space_info *sys_info;
3710         u64 avail;
3711
3712         /*
3713          * The chunk mutex serializes the reservation made in the chunk block
3714          * reserve and covers the system chunk we may allocate below.
3715          */
3716         lockdep_assert_held(&fs_info->chunk_mutex);
3717
3718         sys_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3719         spin_lock(&sys_info->lock);
3720         avail = sys_info->total_bytes - btrfs_space_info_used(sys_info, true);
3721         spin_unlock(&sys_info->lock);
3722
3723         if (avail < bytes) {
3724                 struct btrfs_block_group *sys_bg;
3725                 u64 sys_profile;
3726
3727                 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
3728                         btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
3729                                    avail, bytes, type);
3730                         btrfs_dump_space_info(fs_info, sys_info, 0, 0);
3731                 }
3732
3733                 /*
3734                  * A failure to create the system chunk ends the attempt.  The
3735                  * caller may still get by without it, as not every node/leaf
3736                  * on the visited chunk tree paths needs COW (some were already
3737                  * COWed or created in the current transaction).
3738                  */
3739                 sys_profile = btrfs_system_alloc_profile(fs_info);
3740                 sys_bg = btrfs_create_chunk(trans, sys_profile);
3741                 if (IS_ERR(sys_bg))
3742                         return;
3743
3744                 /*
3745                  * The return value is deliberately dropped.  If adding the
3746                  * chunk item fails now it is retried at phase 2 of chunk
3747                  * allocation, in btrfs_create_pending_block_groups().  ENOSPC
3748                  * is possible here for the reasons given at do_chunk_alloc(),
3749                  * e.g. a scrub just turned the new system block group RO or a
3750                  * running discard temporarily removed its free space entries.
3751                  */
3752                 btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3753         }
3754
3755         /* Account the bytes in the handle only once they are reserved. */
3756         if (btrfs_block_rsv_add(fs_info->chunk_root, &fs_info->chunk_block_rsv,
3757                                 bytes, BTRFS_RESERVE_NO_FLUSH))
3758                 return;
3759         trans->chunk_bytes_reserved += bytes;
3760 }
3761
3762 /*
3763  * Reserve system space for the chunk tree updates needed to allocate or
3764  * remove a chunk of @type.
3765  * The caller must be holding fs_info->chunk_mutex.
3766  */
3767 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3768 {
3769         struct btrfs_fs_info *fs_info = trans->fs_info;
3770         const u64 nr_devs = get_profile_num_devs(fs_info, type);
3771         /* One device item to update for each of the nr_devs devices... */
3772         const u64 dev_items = btrfs_calc_metadata_size(fs_info, nr_devs);
3773         /* ...plus the single chunk item that gets added or removed. */
3774         const u64 chunk_item = btrfs_calc_insert_metadata_size(fs_info, 1);
3775
3776         reserve_chunk_space(trans, dev_items + chunk_item, type);
3777 }
3778
3779 /*
3780  * Reserve system space, if needed, ahead of a single modification of the
3781  * chunk btree.
3782  *
3783  * @trans:              A transaction handle.
3784  * @is_item_insertion:  True when the modification inserts a new item in the
3785  *                      chunk btree, false when it deletes or updates an
3786  *                      existing one.
3787  *
3788  * For updates of the chunk btree that happen outside block group allocation
3789  * and removal.  Reserving first avoids a deadlock with a concurrent task that
3790  * is allocating a data or metadata block group and so updates the chunk btree
3791  * while holding the chunk mutex.  Once the chunk btree update is done, call
3792  * btrfs_trans_release_chunk_metadata().
3793  */
3794 void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
3795                                   bool is_item_insertion)
3796 {
3797         struct btrfs_fs_info *fs_info = trans->fs_info;
3798         u64 bytes;
3799
3800         /* Size the reservation for one item, by kind of modification. */
3801         bytes = is_item_insertion ?
3802                 btrfs_calc_insert_metadata_size(fs_info, 1) :
3803                 btrfs_calc_metadata_size(fs_info, 1);
3804
3805         /* reserve_chunk_space() asserts that the chunk mutex is held. */
3806         mutex_lock(&fs_info->chunk_mutex);
3807         reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
3808         mutex_unlock(&fs_info->chunk_mutex);
3809 }
3810
     /*
      * Release the inode attached to every block group that has ->iref set.
      *
      * Block groups are scanned starting at the offset held in the local 'last'.
      * After an inode is dropped, the next scan starts right past that block
      * group.  When a scan runs out of block groups it restarts from offset 0,
      * and the function returns once a scan that started at 0 finds nothing.
      */
3811 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3812 {
3813         struct btrfs_block_group *block_group;
3814         u64 last = 0;
3815
3816         while (1) {
3817                 struct inode *inode;
3818
3819                 block_group = btrfs_lookup_first_block_group(info, last);
                     /*
                      * Look for the next block group with ->iref set, waiting for
                      * caching to finish on each candidate first.  On 'break' the
                      * block group's spinlock is still held.
                      */
3820                 while (block_group) {
3821                         btrfs_wait_block_group_cache_done(block_group);
3822                         spin_lock(&block_group->lock);
3823                         if (block_group->iref)
3824                                 break;
3825                         spin_unlock(&block_group->lock);
3826                         block_group = btrfs_next_block_group(block_group);
3827                 }
                     /*
                      * Ran out of block groups: finished if this scan began at
                      * offset 0, otherwise do one more scan from the start.
                      */
3828                 if (!block_group) {
3829                         if (last == 0)
3830                                 break;
3831                         last = 0;
3832                         continue;
3833                 }
3834
                     /*
                      * Detach the inode while holding the lock taken in the loop
                      * above, and only call iput() after the lock is released.
                      */
3835                 inode = block_group->inode;
3836                 block_group->iref = 0;
3837                 block_group->inode = NULL;
3838                 spin_unlock(&block_group->lock);
3839                 ASSERT(block_group->io_ctl.inode == NULL);
3840                 iput(inode);
                     /* The next lookup starts at the end of this block group. */
3841                 last = block_group->start + block_group->length;
                     /*
                      * NOTE(review): presumably drops the reference obtained by
                      * the lookup/next helpers above - confirm against them.
                      */
3842                 btrfs_put_block_group(block_group);
3843         }
3844 }
3845
3846 /*
      * Tear down the in-memory block group and space info state of @info.
      * Always returns 0.
      *
3847  * Must be called only after stopping all workers, since we could have block
3848  * group caching kthreads running, and therefore they could race with us if we
3849  * freed the block groups before stopping them.
3850  */
3851 int btrfs_free_block_groups(struct btrfs_fs_info *info)
3852 {
3853         struct btrfs_block_group *block_group;
3854         struct btrfs_space_info *space_info;
3855         struct btrfs_caching_control *caching_ctl;
3856         struct rb_node *n;
3857
             /* Unlink and put every caching control that is still queued. */
3858         spin_lock(&info->block_group_cache_lock);
3859         while (!list_empty(&info->caching_block_groups)) {
3860                 caching_ctl = list_entry(info->caching_block_groups.next,
3861                                          struct btrfs_caching_control, list);
3862                 list_del(&caching_ctl->list);
3863                 btrfs_put_caching_control(caching_ctl);
3864         }
3865         spin_unlock(&info->block_group_cache_lock);
3866
             /*
              * Empty the unused and then the reclaim list, putting each block
              * group once per list entry.  Both walks take unused_bgs_lock.
              */
3867         spin_lock(&info->unused_bgs_lock);
3868         while (!list_empty(&info->unused_bgs)) {
3869                 block_group = list_first_entry(&info->unused_bgs,
3870                                                struct btrfs_block_group,
3871                                                bg_list);
3872                 list_del_init(&block_group->bg_list);
3873                 btrfs_put_block_group(block_group);
3874         }
3875         spin_unlock(&info->unused_bgs_lock);
3876
3877         spin_lock(&info->unused_bgs_lock);
3878         while (!list_empty(&info->reclaim_bgs)) {
3879                 block_group = list_first_entry(&info->reclaim_bgs,
3880                                                struct btrfs_block_group,
3881                                                bg_list);
3882                 list_del_init(&block_group->bg_list);
3883                 btrfs_put_block_group(block_group);
3884         }
3885         spin_unlock(&info->unused_bgs_lock);
3886
             /*
              * Remove block groups from the rbtree one at a time, last node
              * first.  block_group_cache_lock is dropped while a removed block
              * group is torn down and retaken before the next rb_last().
              */
3887         spin_lock(&info->block_group_cache_lock);
3888         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
3889                 block_group = rb_entry(n, struct btrfs_block_group,
3890                                        cache_node);
3891                 rb_erase(&block_group->cache_node,
3892                          &info->block_group_cache_tree);
3893                 RB_CLEAR_NODE(&block_group->cache_node);
3894                 spin_unlock(&info->block_group_cache_lock);
3895
                     /* Unlink from the owning space info's list of block groups. */
3896                 down_write(&block_group->space_info->groups_sem);
3897                 list_del(&block_group->list);
3898                 up_write(&block_group->space_info->groups_sem);
3899
3900                 /*
3901                  * We haven't cached this block group, which means we could
3902                  * possibly have excluded extents on this block group.
3903                  */
3904                 if (block_group->cached == BTRFS_CACHE_NO ||
3905                     block_group->cached == BTRFS_CACHE_ERROR)
3906                         btrfs_free_excluded_extents(block_group);
3907
3908                 btrfs_remove_free_space_cache(block_group);
                     /*
                      * At this point the block group must be idle and off every
                      * list, and the put below must be dropping the only
                      * remaining reference (refs == 1).
                      */
3909                 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3910                 ASSERT(list_empty(&block_group->dirty_list));
3911                 ASSERT(list_empty(&block_group->io_list));
3912                 ASSERT(list_empty(&block_group->bg_list));
3913                 ASSERT(refcount_read(&block_group->refs) == 1);
3914                 ASSERT(block_group->swap_extents == 0);
3915                 btrfs_put_block_group(block_group);
3916
3917                 spin_lock(&info->block_group_cache_lock);
3918         }
3919         spin_unlock(&info->block_group_cache_lock);
3920
             /*
              * NOTE(review): presumably released before the checks below so the
              * global reserve's bytes are not reported as leftovers - confirm.
              */
3921         btrfs_release_global_block_rsv(info);
3922
             /* Unlink every space info, warning about counters left non-zero. */
3923         while (!list_empty(&info->space_info)) {
3924                 space_info = list_entry(info->space_info.next,
3925                                         struct btrfs_space_info,
3926                                         list);
3927
3928                 /*
3929                  * Do not hide this behind enospc_debug, this is actually
3930                  * important and indicates a real bug if this happens.
3931                  */
3932                 if (WARN_ON(space_info->bytes_pinned > 0 ||
3933                             space_info->bytes_may_use > 0))
3934                         btrfs_dump_space_info(info, space_info, 0, 0);
3935
3936                 /*
3937                  * If there was a failure to cleanup a log tree, very likely due
3938                  * to an IO failure on a writeback attempt of one or more of its
3939                  * extent buffers, we could not do proper (and cheap) unaccounting
3940                  * of their reserved space, so don't warn on bytes_reserved > 0 in
3941                  * that case.
3942                  */
3943                 if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
3944                     !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
3945                         if (WARN_ON(space_info->bytes_reserved > 0))
3946                                 btrfs_dump_space_info(info, space_info, 0, 0);
3947                 }
3948
3949                 WARN_ON(space_info->reclaim_size > 0);
3950                 list_del(&space_info->list);
3951                 btrfs_sysfs_remove_space_info(space_info);
3952         }
3953         return 0;
3954 }
3955
     /*
      * Take one freeze count on @cache by atomically incrementing ->frozen.
      * btrfs_unfreeze_block_group() decrements the same counter.
      */
3956 void btrfs_freeze_block_group(struct btrfs_block_group *cache)
3957 {
3958         atomic_inc(&cache->frozen);
3959 }
3960
3961 /*
3962  * Drop one freeze count.  The last unfreeze of a removed block group also
3963  * removes its extent mapping and frees its leftover free space entries.
3964  */
3965 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
3966 {
3967         struct btrfs_fs_info *fs_info = block_group->fs_info;
3968         struct extent_map_tree *map_tree;
3969         struct extent_map *em;
3970         bool last_on_removed;
3971
3972         spin_lock(&block_group->lock);
3973         last_on_removed = atomic_dec_and_test(&block_group->frozen) &&
3974                           block_group->removed;
3975         spin_unlock(&block_group->lock);
3976
3977         if (!last_on_removed)
3978                 return;
3979
3980         map_tree = &fs_info->mapping_tree;
3981         write_lock(&map_tree->lock);
3982         em = lookup_extent_mapping(map_tree, block_group->start, 1);
3983         BUG_ON(!em); /* logic error, can't happen */
3984         remove_extent_mapping(map_tree, em);
3985         write_unlock(&map_tree->lock);
3986
3987         /* Two puts: the lookup's reference and the one the tree held. */
3988         free_extent_map(em);
3989         free_extent_map(em);
3990
3991         /* Free entries left by us or by tasks trimming this block group. */
3992         __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3993 }
3994
3995 /* Bump bg->swap_extents and return true, or return false if bg->ro is set. */
3996 bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
3997 {
3998         bool counted;
3999
4000         spin_lock(&bg->lock);
4001         counted = !bg->ro;
4002         if (counted)
4003                 bg->swap_extents++;
4004         spin_unlock(&bg->lock);
4005
4006         return counted;
4007 }
4008
     /*
      * Subtract @amount from bg->swap_extents while holding bg->lock.  Asserts
      * that bg->ro is not set and that at least @amount is currently counted.
      */
4009 void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4010 {
4011         spin_lock(&bg->lock);
4012         ASSERT(!bg->ro);
4013         ASSERT(bg->swap_extents >= amount);
4014         bg->swap_extents -= amount;
4015         spin_unlock(&bg->lock);
4016 }