1 // SPDX-License-Identifier: GPL-2.0
5 #include "space-info.h"
8 #include "free-space-cache.h"
9 #include "ordered-data.h"
10 #include "transaction.h"
11 #include "block-group.h"
14 * HOW DOES SPACE RESERVATION WORK
16 * If you want to know about delalloc specifically, there is a separate comment
17 * for that with the delalloc code. This comment is about how the whole system
22 * 1) space_info. This is the ultimate arbiter of how much space we can use.
23 * There's a description of the bytes_ fields with the struct declaration,
24 * refer to that for specifics on each field. Suffice it to say that for
25 * reservations we care about total_bytes - SUM(space_info->bytes_) when
26 * determining if there is space to make an allocation. There is a space_info
27 * for METADATA, SYSTEM, and DATA areas.
29 * 2) block_rsv's. These are basically buckets for every different type of
30 * metadata reservation we have. You can see the comment in the block_rsv
31 * code on the rules for each type, but generally block_rsv->reserved is how
32 * much space is accounted for in space_info->bytes_may_use.
34 * 3) btrfs_calc*_size. These are the worst case calculations we use, based
35 * on the number of items we will want to modify. We have one for changing
36 * items, and one for inserting new items. Generally we use these helpers to
37 * determine the size of the block reserves, and then use the actual bytes
38 * values to adjust the space_info counters.
40 * MAKING RESERVATIONS, THE NORMAL CASE
42 * We call into either btrfs_reserve_data_bytes() or
43 * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
44 * num_bytes we want to reserve.
47 * space_info->bytes_may_use += num_bytes
50 * Call btrfs_add_reserved_bytes() which does
51 * space_info->bytes_may_use -= num_bytes
52 * space_info->bytes_reserved += extent_bytes
55 * Call btrfs_update_block_group() which does
56 * space_info->bytes_reserved -= extent_bytes
57 * space_info->bytes_used += extent_bytes
59 * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
61 * Assume we are unable to simply make the reservation because we do not have
65 * create a reserve_ticket with ->bytes set to our reservation, add it to
66 * the tail of space_info->tickets, kick async flush thread
68 * ->handle_reserve_ticket
69 * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
72 * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
73 * Flushes various things attempting to free up space.
75 * -> btrfs_try_granting_tickets()
76 * This is called by anything that either subtracts space from
77 * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
78 * space_info->total_bytes. This loops through the ->priority_tickets and
79 * then the ->tickets list checking to see if the reservation can be
80 * completed. If it can, the space is added to space_info->bytes_may_use and
81 * the ticket is woken up.
84 * Check if ->bytes == 0; if it is, we got our reservation and we can carry
85 * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
88 * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
90 * Same as the above, except we add ourselves to the
91 * space_info->priority_tickets, and we do not use ticket->wait, we simply
92 * call flush_space() ourselves for the states that are safe for us to call
93 * without deadlocking and hope for the best.
97 * Generally speaking we will have two cases for each state, a "nice" state
98 * and an "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
99 * reduce the locking overhead on the various trees, and even to keep from
100 * doing any work at all in the case of delayed refs. Each of these delayed
101 * things, however, holds reservations, and so letting them run allows us to
102 * reclaim space so we can make new reservations.
104 * FLUSH_DELAYED_ITEMS
105 * Every inode has a delayed item to update the inode. Take a simple write
106 * for example, we would update the inode item at write time to update the
107 * mtime, and then again at finish_ordered_io() time in order to update the
108 * isize or bytes. We keep these delayed items to coalesce these operations
109 * into a single operation done on demand. These are an easy way to reclaim
113 * Look at the delalloc comment to get an idea of how much space is reserved
114 * for delayed allocation. We can reclaim some of this space simply by
115 * running delalloc, but usually we need to wait for ordered extents to
116 * reclaim the bulk of this space.
119 * We have a block reserve for the outstanding delayed refs space, and every
120 * delayed ref operation holds a reservation. Running these is a quick way
121 * to reclaim space, but we want to hold this until the end because COW can
122 * churn a lot and we can avoid making some extent tree modifications if we
123 * are able to delay for as long as possible.
126 * We will skip this the first time through space reservation, because of
127 * overcommit, and because we don't want to have a lot of useless metadata space when
128 * our worst case reservations will likely never come true.
131 * If we're freeing inodes we're likely freeing checksums, file extent
132 * items, and extent tree items. Loads of space could be freed up by these
133 * operations, however they won't be usable until the transaction commits.
136 * may_commit_transaction() is the ultimate arbiter on whether we commit the
137 * transaction or not. In order to avoid constantly churning we do all the
138 * above flushing first and then commit the transaction as the last resort.
139 * However we need to take into account things like pinned space that would
140 * be freed, plus any delayed work we may not have gotten rid of in the case
144 * For use by the preemptive flusher. We use this to bypass the ticketing
145 * checks in may_commit_transaction, as we have more information about the
146 * overall state of the system and may want to commit the transaction ahead
147 * of actual ENOSPC conditions.
151 * Because we hold so many reservations for metadata we will allow you to
152 * reserve more space than is currently free in the currently allocated
153 * metadata space. This only happens with metadata, data does not allow
156 * You can see the current logic for when we allow overcommit in
157 * btrfs_can_overcommit(), but it only applies to unallocated space. If there
158 * is no unallocated space to be had, all reservations are kept within the
159 * free space in the allocated metadata chunks.
161 * Because of overcommitting, you generally want to use the
162 * btrfs_can_overcommit() logic for metadata allocations, as it does the right
163 * thing with or without extra unallocated space.
166 u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
167 bool may_use_included)
170 return s_info->bytes_used + s_info->bytes_reserved +
171 s_info->bytes_pinned + s_info->bytes_readonly +
172 (may_use_included ? s_info->bytes_may_use : 0);
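
/*
 * Illustrative sketch, not part of the original file: callers typically derive
 * the remaining free space from the helper above, under space_info->lock, when
 * deciding whether a reservation fits. The helper name below is hypothetical.
 */
static inline u64 example_space_info_free(struct btrfs_space_info *sinfo)
{
	u64 used = btrfs_space_info_used(sinfo, true);

	/* Guard against a transient over-reservation making used > total. */
	return (used >= sinfo->total_bytes) ? 0 : sinfo->total_bytes - used;
}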
176 * after adding space to the filesystem, we need to clear the full flags
177 * on all the space infos.
179 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
181 struct list_head *head = &info->space_info;
182 struct btrfs_space_info *found;
184 list_for_each_entry(found, head, list)
188 static int create_space_info(struct btrfs_fs_info *info, u64 flags)
191 struct btrfs_space_info *space_info;
195 space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
199 ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
206 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
207 INIT_LIST_HEAD(&space_info->block_groups[i]);
208 init_rwsem(&space_info->groups_sem);
209 spin_lock_init(&space_info->lock);
210 space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
211 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
212 INIT_LIST_HEAD(&space_info->ro_bgs);
213 INIT_LIST_HEAD(&space_info->tickets);
214 INIT_LIST_HEAD(&space_info->priority_tickets);
215 space_info->clamp = 1;
217 ret = btrfs_sysfs_add_space_info_type(info, space_info);
221 list_add(&space_info->list, &info->space_info);
222 if (flags & BTRFS_BLOCK_GROUP_DATA)
223 info->data_sinfo = space_info;
228 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
230 struct btrfs_super_block *disk_super;
236 disk_super = fs_info->super_copy;
237 if (!btrfs_super_root(disk_super))
240 features = btrfs_super_incompat_flags(disk_super);
241 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
244 flags = BTRFS_BLOCK_GROUP_SYSTEM;
245 ret = create_space_info(fs_info, flags);
250 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
251 ret = create_space_info(fs_info, flags);
253 flags = BTRFS_BLOCK_GROUP_METADATA;
254 ret = create_space_info(fs_info, flags);
258 flags = BTRFS_BLOCK_GROUP_DATA;
259 ret = create_space_info(fs_info, flags);
265 void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
266 u64 total_bytes, u64 bytes_used,
268 struct btrfs_space_info **space_info)
270 struct btrfs_space_info *found;
273 factor = btrfs_bg_type_to_factor(flags);
275 found = btrfs_find_space_info(info, flags);
277 spin_lock(&found->lock);
278 found->total_bytes += total_bytes;
279 found->disk_total += total_bytes * factor;
280 found->bytes_used += bytes_used;
281 found->disk_used += bytes_used * factor;
282 found->bytes_readonly += bytes_readonly;
285 btrfs_try_granting_tickets(info, found);
286 spin_unlock(&found->lock);
290 struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
293 struct list_head *head = &info->space_info;
294 struct btrfs_space_info *found;
296 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
298 list_for_each_entry(found, head, list) {
299 if (found->flags & flags)
305 static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
306 struct btrfs_space_info *space_info,
307 enum btrfs_reserve_flush_enum flush)
313 if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
314 profile = btrfs_system_alloc_profile(fs_info);
316 profile = btrfs_metadata_alloc_profile(fs_info);
318 avail = atomic64_read(&fs_info->free_chunk_space);
321 * If we have dup, raid1 or raid10 then only half of the free
322 * space is actually usable. For raid56, the space info used
323 * doesn't include the parity drive, so we don't have to
326 factor = btrfs_bg_type_to_factor(profile);
327 avail = div_u64(avail, factor);
330 * If we aren't flushing all things, let us overcommit up to
331 * 1/2 of the space. If we can flush, don't let us overcommit
332 * too much, let it overcommit up to 1/8 of the space.
334 if (flush == BTRFS_RESERVE_FLUSH_ALL)
341 int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
342 struct btrfs_space_info *space_info, u64 bytes,
343 enum btrfs_reserve_flush_enum flush)
348 /* Don't overcommit when in mixed mode */
349 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
352 used = btrfs_space_info_used(space_info, true);
353 avail = calc_available_free_space(fs_info, space_info, flush);
355 if (used + bytes < space_info->total_bytes + avail)
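/*
 * Rough illustration of the check above (numbers are only an example): with
 * 8GiB of unallocated disk in a single-device profile, calc_available_free_space()
 * allows roughly 1GiB of overcommit for a BTRFS_RESERVE_FLUSH_ALL reservation
 * (1/8 of the unallocated space) and roughly 4GiB for the other flush levels
 * (1/2), on top of the free space in the already allocated metadata chunks.
 */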
360 static void remove_ticket(struct btrfs_space_info *space_info,
361 struct reserve_ticket *ticket)
363 if (!list_empty(&ticket->list)) {
364 list_del_init(&ticket->list);
365 ASSERT(space_info->reclaim_size >= ticket->bytes);
366 space_info->reclaim_size -= ticket->bytes;
371 * This is for space we already have accounted in space_info->bytes_may_use, so
372 * basically when we're returning space from block_rsv's.
374 void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
375 struct btrfs_space_info *space_info)
377 struct list_head *head;
378 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
380 lockdep_assert_held(&space_info->lock);
382 head = &space_info->priority_tickets;
384 while (!list_empty(head)) {
385 struct reserve_ticket *ticket;
386 u64 used = btrfs_space_info_used(space_info, true);
388 ticket = list_first_entry(head, struct reserve_ticket, list);
390 /* Check and see if our ticket can be satisfied now. */
391 if ((used + ticket->bytes <= space_info->total_bytes) ||
392 btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
394 btrfs_space_info_update_bytes_may_use(fs_info,
397 remove_ticket(space_info, ticket);
399 space_info->tickets_id++;
400 wake_up(&ticket->wait);
406 if (head == &space_info->priority_tickets) {
407 head = &space_info->tickets;
408 flush = BTRFS_RESERVE_FLUSH_ALL;
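
/*
 * Illustrative sketch, not part of the original file: the pattern a caller
 * typically follows when handing reserved space back and giving queued
 * tickets a chance at it. The helper name and byte count are hypothetical.
 */
static void example_return_space(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 u64 num_bytes)
{
	spin_lock(&space_info->lock);
	/* Hand the bytes back to the pool... */
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
	/* ...and let any waiting ticket claim them while the lock is held. */
	btrfs_try_granting_tickets(fs_info, space_info);
	spin_unlock(&space_info->lock);
}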
413 #define DUMP_BLOCK_RSV(fs_info, rsv_name) \
415 struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
416 spin_lock(&__rsv->lock); \
417 btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
418 __rsv->size, __rsv->reserved); \
419 spin_unlock(&__rsv->lock); \
422 static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
423 struct btrfs_space_info *info)
425 lockdep_assert_held(&info->lock);
427 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
429 info->total_bytes - btrfs_space_info_used(info, true),
430 info->full ? "" : "not ");
432 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
433 info->total_bytes, info->bytes_used, info->bytes_pinned,
434 info->bytes_reserved, info->bytes_may_use,
435 info->bytes_readonly);
437 DUMP_BLOCK_RSV(fs_info, global_block_rsv);
438 DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
439 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
440 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
441 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
445 void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
446 struct btrfs_space_info *info, u64 bytes,
447 int dump_block_groups)
449 struct btrfs_block_group *cache;
452 spin_lock(&info->lock);
453 __btrfs_dump_space_info(fs_info, info);
454 spin_unlock(&info->lock);
456 if (!dump_block_groups)
459 down_read(&info->groups_sem);
461 list_for_each_entry(cache, &info->block_groups[index], list) {
462 spin_lock(&cache->lock);
464 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
465 cache->start, cache->length, cache->used, cache->pinned,
466 cache->reserved, cache->ro ? "[readonly]" : "");
467 spin_unlock(&cache->lock);
468 btrfs_dump_free_space(cache, bytes);
470 if (++index < BTRFS_NR_RAID_TYPES)
472 up_read(&info->groups_sem);
475 static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
481 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
482 nr = div64_u64(to_reclaim, bytes);
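/*
 * Rough worked example, not from the original file: with the default 16KiB
 * nodesize, btrfs_calc_insert_metadata_size(fs_info, 1) is
 * 16KiB * BTRFS_MAX_LEVEL * 2 = 256KiB, so a request to reclaim 1MiB maps to
 * roughly 4 items here.
 */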
488 #define EXTENT_SIZE_PER_ITEM SZ_256K
491 * shrink metadata reservation for delalloc
493 static void shrink_delalloc(struct btrfs_fs_info *fs_info,
494 struct btrfs_space_info *space_info,
495 u64 to_reclaim, bool wait_ordered)
497 struct btrfs_trans_handle *trans;
504 /* Calculate the number of pages we need to flush for space reservation */
505 if (to_reclaim == U64_MAX) {
509 * to_reclaim is set to however much metadata we need to
510 * reclaim, but reclaiming that much data doesn't really track
511 * exactly, so increase the amount to reclaim by 2x in order to
512 * make sure we're flushing enough delalloc to hopefully reclaim
513 * some metadata reservations.
515 items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
516 to_reclaim = items * EXTENT_SIZE_PER_ITEM;
519 trans = (struct btrfs_trans_handle *)current->journal_info;
521 delalloc_bytes = percpu_counter_sum_positive(
522 &fs_info->delalloc_bytes);
523 ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
524 if (delalloc_bytes == 0 && ordered_bytes == 0)
528 * If we are doing more ordered than delalloc we need to just wait on
529 * ordered extents, otherwise we'll waste time trying to flush delalloc
530 * that likely won't give us the space back we need.
532 if (ordered_bytes > delalloc_bytes)
536 while ((delalloc_bytes || ordered_bytes) && loops < 3) {
537 u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
538 long nr_pages = min_t(u64, temp, LONG_MAX);
540 btrfs_start_delalloc_roots(fs_info, nr_pages, true);
543 if (wait_ordered && !trans) {
544 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
546 time_left = schedule_timeout_killable(1);
551 spin_lock(&space_info->lock);
552 if (list_empty(&space_info->tickets) &&
553 list_empty(&space_info->priority_tickets)) {
554 spin_unlock(&space_info->lock);
557 spin_unlock(&space_info->lock);
559 delalloc_bytes = percpu_counter_sum_positive(
560 &fs_info->delalloc_bytes);
561 ordered_bytes = percpu_counter_sum_positive(
562 &fs_info->ordered_bytes);
567 * Possibly commit the transaction if it's ok to
569 * @fs_info: the filesystem
570 * @space_info: space_info we are checking for commit, either data or metadata
572 * This will check to make sure that committing the transaction will actually
573 * get us somewhere and then commit the transaction if it does. Otherwise it
574 * will return -ENOSPC.
576 static int may_commit_transaction(struct btrfs_fs_info *fs_info,
577 struct btrfs_space_info *space_info)
579 struct reserve_ticket *ticket = NULL;
580 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
581 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
582 struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
583 struct btrfs_trans_handle *trans;
584 u64 reclaim_bytes = 0;
585 u64 bytes_needed = 0;
586 u64 cur_free_bytes = 0;
588 trans = (struct btrfs_trans_handle *)current->journal_info;
592 spin_lock(&space_info->lock);
593 cur_free_bytes = btrfs_space_info_used(space_info, true);
594 if (cur_free_bytes < space_info->total_bytes)
595 cur_free_bytes = space_info->total_bytes - cur_free_bytes;
599 if (!list_empty(&space_info->priority_tickets))
600 ticket = list_first_entry(&space_info->priority_tickets,
601 struct reserve_ticket, list);
602 else if (!list_empty(&space_info->tickets))
603 ticket = list_first_entry(&space_info->tickets,
604 struct reserve_ticket, list);
606 bytes_needed = ticket->bytes;
608 if (bytes_needed > cur_free_bytes)
609 bytes_needed -= cur_free_bytes;
612 spin_unlock(&space_info->lock);
617 trans = btrfs_join_transaction(fs_info->extent_root);
619 return PTR_ERR(trans);
622 * See if there is enough pinned space to make this reservation, or if
623 * we have block groups that are going to be freed, allowing us to
624 * possibly do a chunk allocation the next loop through.
626 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
627 __percpu_counter_compare(&space_info->total_bytes_pinned,
629 BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
633 * See if there is some space in the delayed insertion reserve for this
634 * reservation. If the space_info's don't match (like for DATA or
635 * SYSTEM) then just go enospc, reclaiming this space won't recover any
636 * space to satisfy those reservations.
638 if (space_info != delayed_rsv->space_info)
641 spin_lock(&delayed_rsv->lock);
642 reclaim_bytes += delayed_rsv->reserved;
643 spin_unlock(&delayed_rsv->lock);
645 spin_lock(&delayed_refs_rsv->lock);
646 reclaim_bytes += delayed_refs_rsv->reserved;
647 spin_unlock(&delayed_refs_rsv->lock);
649 spin_lock(&trans_rsv->lock);
650 reclaim_bytes += trans_rsv->reserved;
651 spin_unlock(&trans_rsv->lock);
653 if (reclaim_bytes >= bytes_needed)
655 bytes_needed -= reclaim_bytes;
657 if (__percpu_counter_compare(&space_info->total_bytes_pinned,
659 BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
663 return btrfs_commit_transaction(trans);
665 btrfs_end_transaction(trans);
670 * Try to flush some data based on policy set by @state. This is only advisory
671 * and may fail for various reasons. The caller is supposed to examine the
672 * state of @space_info to detect the outcome.
674 static void flush_space(struct btrfs_fs_info *fs_info,
675 struct btrfs_space_info *space_info, u64 num_bytes,
676 enum btrfs_flush_state state, bool for_preempt)
678 struct btrfs_root *root = fs_info->extent_root;
679 struct btrfs_trans_handle *trans;
684 case FLUSH_DELAYED_ITEMS_NR:
685 case FLUSH_DELAYED_ITEMS:
686 if (state == FLUSH_DELAYED_ITEMS_NR)
687 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
691 trans = btrfs_join_transaction(root);
693 ret = PTR_ERR(trans);
696 ret = btrfs_run_delayed_items_nr(trans, nr);
697 btrfs_end_transaction(trans);
700 case FLUSH_DELALLOC_WAIT:
701 shrink_delalloc(fs_info, space_info, num_bytes,
702 state == FLUSH_DELALLOC_WAIT);
704 case FLUSH_DELAYED_REFS_NR:
705 case FLUSH_DELAYED_REFS:
706 trans = btrfs_join_transaction(root);
708 ret = PTR_ERR(trans);
711 if (state == FLUSH_DELAYED_REFS_NR)
712 nr = calc_reclaim_items_nr(fs_info, num_bytes);
715 btrfs_run_delayed_refs(trans, nr);
716 btrfs_end_transaction(trans);
719 case ALLOC_CHUNK_FORCE:
720 trans = btrfs_join_transaction(root);
722 ret = PTR_ERR(trans);
725 ret = btrfs_chunk_alloc(trans,
726 btrfs_get_alloc_profile(fs_info, space_info->flags),
727 (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
729 btrfs_end_transaction(trans);
730 if (ret > 0 || ret == -ENOSPC)
733 case RUN_DELAYED_IPUTS:
735 * If we have pending delayed iputs then we could free up a
736 * bunch of pinned space, so make sure we run the iputs before
737 * we do our pinned bytes check below.
739 btrfs_run_delayed_iputs(fs_info);
740 btrfs_wait_on_delayed_iputs(fs_info);
743 ret = may_commit_transaction(fs_info, space_info);
745 case FORCE_COMMIT_TRANS:
746 trans = btrfs_join_transaction(root);
748 ret = PTR_ERR(trans);
751 ret = btrfs_commit_transaction(trans);
758 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
764 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
765 struct btrfs_space_info *space_info)
769 u64 to_reclaim = space_info->reclaim_size;
771 lockdep_assert_held(&space_info->lock);
773 avail = calc_available_free_space(fs_info, space_info,
774 BTRFS_RESERVE_FLUSH_ALL);
775 used = btrfs_space_info_used(space_info, true);
778 * We may be flushing because suddenly we have less space than we had
779 * before, and now we're well over-committed based on our current free
780 * space. If that's the case add in our overage so we make sure to put
781 * appropriate pressure on the flushing state machine.
783 if (space_info->total_bytes + avail < used)
784 to_reclaim += used - (space_info->total_bytes + avail);
789 static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
790 struct btrfs_space_info *space_info)
792 u64 ordered, delalloc;
793 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
796 /* If we're just plain full then async reclaim just slows us down. */
797 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
801 * We have tickets queued, bail so we don't compete with the async
804 if (space_info->reclaim_size)
808 * If we have over half of the free space occupied by reservations or
809 * pinned then we want to start flushing.
811 * We do not do the traditional thing here, which is to say
813 * if (used >= ((total_bytes + avail) / 2))
816 * because this doesn't quite work how we want. If we had more than 50%
817 * of the space_info used by bytes_used and we had 0 available we'd just
818 * constantly run the background flusher. Instead we want it to kick in
819 * if our reclaimable space exceeds our clamped free space.
821 * Our clamping range is 2^1 -> 2^8. Practically speaking that means
824 * Amount of RAM        Minimum threshold       Maximum threshold
827 * 128GiB               512MiB                  64GiB
832 * These are the range our thresholds will fall in, corresponding to how
833 * much delalloc we need for the background flusher to kick in.
836 thresh = calc_available_free_space(fs_info, space_info,
837 BTRFS_RESERVE_FLUSH_ALL);
838 thresh += (space_info->total_bytes - space_info->bytes_used -
839 space_info->bytes_reserved - space_info->bytes_readonly);
840 thresh >>= space_info->clamp;
842 used = space_info->bytes_pinned;
845 * If we have more ordered bytes than delalloc bytes then we're either
846 * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
847 * around. Preemptive flushing is only useful in that it can free up
848 * space before tickets need to wait for things to finish. In the case
849 * of ordered extents, preemptively waiting on ordered extents gets us
850 * nothing, if our reservations are tied up in ordered extents we'll
851 * simply have to slow down writers by forcing them to wait on ordered
854 * In the case that ordered is larger than delalloc, only include the
855 * block reserves that we would actually be able to directly reclaim
856 * from. In this case if we're heavy on metadata operations this will
857 * clearly be heavy enough to warrant preemptive flushing. In the case
858 * of heavy DIO or ordered reservations, preemptive flushing will just
859 * waste time and cause us to slow down.
861 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
862 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
863 if (ordered >= delalloc)
864 used += fs_info->delayed_refs_rsv.reserved +
865 fs_info->delayed_block_rsv.reserved;
867 used += space_info->bytes_may_use;
869 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
870 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
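
/*
 * Worked example for the clamp above, illustrative only: if the available
 * free space plus the unused part of total_bytes comes to 64GiB and ->clamp
 * is 3, the threshold is 8GiB; preemptive flushing then kicks in once pinned
 * space (plus the relevant block reserves or bytes_may_use) crosses that mark.
 */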
873 static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
874 struct btrfs_space_info *space_info,
875 struct reserve_ticket *ticket)
877 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
880 if (global_rsv->space_info != space_info)
883 spin_lock(&global_rsv->lock);
884 min_bytes = div_factor(global_rsv->size, 1);
885 if (global_rsv->reserved < min_bytes + ticket->bytes) {
886 spin_unlock(&global_rsv->lock);
889 global_rsv->reserved -= ticket->bytes;
890 remove_ticket(space_info, ticket);
892 wake_up(&ticket->wait);
893 space_info->tickets_id++;
894 if (global_rsv->reserved < global_rsv->size)
895 global_rsv->full = 0;
896 spin_unlock(&global_rsv->lock);
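
/*
 * Illustrative numbers for the check above (not from the original file):
 * div_factor(global_rsv->size, 1) keeps 10% of the global reserve off limits,
 * so with a 512MiB global reserve a ticket is only satisfied from it if at
 * least ~51MiB would still remain reserved afterwards.
 */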
902 * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
903 * @fs_info - fs_info for this fs
904 * @space_info - the space info we were flushing
906 * We call this when we've exhausted our flushing ability and haven't made
907 * progress in satisfying tickets. The reservation code handles tickets in
908 * order, so if there is a large ticket first and then smaller ones we could
909 * very well satisfy the smaller tickets. This will attempt to wake up any
910 * tickets in the list to catch this case.
912 * This function returns true if it was able to make progress by clearing out
913 * other tickets, or if it stumbles across a ticket that was smaller than the
916 static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
917 struct btrfs_space_info *space_info)
919 struct reserve_ticket *ticket;
920 u64 tickets_id = space_info->tickets_id;
921 u64 first_ticket_bytes = 0;
923 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
924 btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
925 __btrfs_dump_space_info(fs_info, space_info);
928 while (!list_empty(&space_info->tickets) &&
929 tickets_id == space_info->tickets_id) {
930 ticket = list_first_entry(&space_info->tickets,
931 struct reserve_ticket, list);
934 steal_from_global_rsv(fs_info, space_info, ticket))
938 * may_commit_transaction will avoid committing the transaction
939 * if it doesn't feel like the space reclaimed by the commit
940 * would result in the ticket succeeding. However if we have a
941 * smaller ticket in the queue it may be small enough to be
942 * satisfied by committing the transaction, so if any
943 * subsequent ticket is smaller than the first ticket go ahead
944 * and send us back for another loop through the enospc flushing
947 if (first_ticket_bytes == 0)
948 first_ticket_bytes = ticket->bytes;
949 else if (first_ticket_bytes > ticket->bytes)
952 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
953 btrfs_info(fs_info, "failing ticket with %llu bytes",
956 remove_ticket(space_info, ticket);
957 ticket->error = -ENOSPC;
958 wake_up(&ticket->wait);
961 * We're just throwing tickets away, so more flushing may not
962 * trip over btrfs_try_granting_tickets, so we need to call it
963 * here to see if we can make progress with the next ticket in
966 btrfs_try_granting_tickets(fs_info, space_info);
968 return (tickets_id != space_info->tickets_id);
972 * This is for normal flushers, we can wait all goddamned day if we want to. We
973 * will loop and continuously try to flush as long as we are making progress.
974 * We count progress as clearing off tickets each time we have to loop.
976 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
978 struct btrfs_fs_info *fs_info;
979 struct btrfs_space_info *space_info;
981 enum btrfs_flush_state flush_state;
982 int commit_cycles = 0;
985 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
986 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
988 spin_lock(&space_info->lock);
989 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
991 space_info->flush = 0;
992 spin_unlock(&space_info->lock);
995 last_tickets_id = space_info->tickets_id;
996 spin_unlock(&space_info->lock);
998 flush_state = FLUSH_DELAYED_ITEMS_NR;
1000 flush_space(fs_info, space_info, to_reclaim, flush_state, false);
1001 spin_lock(&space_info->lock);
1002 if (list_empty(&space_info->tickets)) {
1003 space_info->flush = 0;
1004 spin_unlock(&space_info->lock);
1007 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
1009 if (last_tickets_id == space_info->tickets_id) {
1012 last_tickets_id = space_info->tickets_id;
1013 flush_state = FLUSH_DELAYED_ITEMS_NR;
1019 * We don't want to force a chunk allocation until we've tried
1020 * pretty hard to reclaim space. Think of the case where we
1021 * freed up a bunch of space and so have a lot of pinned space
1022 * to reclaim. We would rather use that than possibly create an
1023 * underutilized metadata chunk. So if this is our first run
1024 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
1025 * commit the transaction. If nothing has changed the next go
1026 * around then we can force a chunk allocation.
1028 if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
1031 if (flush_state > COMMIT_TRANS) {
1033 if (commit_cycles > 2) {
1034 if (maybe_fail_all_tickets(fs_info, space_info)) {
1035 flush_state = FLUSH_DELAYED_ITEMS_NR;
1038 space_info->flush = 0;
1041 flush_state = FLUSH_DELAYED_ITEMS_NR;
1044 spin_unlock(&space_info->lock);
1045 } while (flush_state <= COMMIT_TRANS);
1049 * This handles pre-flushing of metadata space before we get to the point that
1050 * we need to start blocking threads on tickets. The logic here is different
1051 * from the other flush paths because it doesn't rely on tickets to tell us how
1052 * much we need to flush, instead it attempts to keep us below the 80% full
1053 * watermark of space by flushing whichever reservation pool is currently the
1056 static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
1058 struct btrfs_fs_info *fs_info;
1059 struct btrfs_space_info *space_info;
1060 struct btrfs_block_rsv *delayed_block_rsv;
1061 struct btrfs_block_rsv *delayed_refs_rsv;
1062 struct btrfs_block_rsv *global_rsv;
1063 struct btrfs_block_rsv *trans_rsv;
1066 fs_info = container_of(work, struct btrfs_fs_info,
1067 preempt_reclaim_work);
1068 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
1069 delayed_block_rsv = &fs_info->delayed_block_rsv;
1070 delayed_refs_rsv = &fs_info->delayed_refs_rsv;
1071 global_rsv = &fs_info->global_block_rsv;
1072 trans_rsv = &fs_info->trans_block_rsv;
1074 spin_lock(&space_info->lock);
1075 while (need_preemptive_reclaim(fs_info, space_info)) {
1076 enum btrfs_flush_state flush;
1077 u64 delalloc_size = 0;
1078 u64 to_reclaim, block_rsv_size;
1079 u64 global_rsv_size = global_rsv->reserved;
1084 * We don't have a precise counter for the metadata being
1085 * reserved for delalloc, so we'll approximate it by subtracting
1086 * out the block rsv's space from the bytes_may_use. If that
1087 * amount is higher than the individual reserves, then we can
1088 * assume it's tied up in delalloc reservations.
1090 block_rsv_size = global_rsv_size +
1091 delayed_block_rsv->reserved +
1092 delayed_refs_rsv->reserved +
1093 trans_rsv->reserved;
1094 if (block_rsv_size < space_info->bytes_may_use)
1095 delalloc_size = space_info->bytes_may_use - block_rsv_size;
1096 spin_unlock(&space_info->lock);
1099 * We don't want to include the global_rsv in our calculation,
1100 * because that's space we can't touch. Subtract it from the
1101 * block_rsv_size for the next checks.
1103 block_rsv_size -= global_rsv_size;
1106 * We really want to avoid flushing delalloc too much, as it
1107 * could result in poor allocation patterns, so only flush it if
1108 * it's larger than the rest of the pools combined.
1110 if (delalloc_size > block_rsv_size) {
1111 to_reclaim = delalloc_size;
1112 flush = FLUSH_DELALLOC;
1113 } else if (space_info->bytes_pinned >
1114 (delayed_block_rsv->reserved +
1115 delayed_refs_rsv->reserved)) {
1116 to_reclaim = space_info->bytes_pinned;
1117 flush = FORCE_COMMIT_TRANS;
1118 } else if (delayed_block_rsv->reserved >
1119 delayed_refs_rsv->reserved) {
1120 to_reclaim = delayed_block_rsv->reserved;
1121 flush = FLUSH_DELAYED_ITEMS_NR;
1123 to_reclaim = delayed_refs_rsv->reserved;
1124 flush = FLUSH_DELAYED_REFS_NR;
1128 * We don't want to reclaim everything, just a portion, so scale
1129 * down the to_reclaim by 1/4. If it takes us down to 0,
1130 * reclaim 1 item's worth.
1134 to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
1135 flush_space(fs_info, space_info, to_reclaim, flush, true);
1137 spin_lock(&space_info->lock);
1140 /* We only went through once, back off our clamping. */
1141 if (loops == 1 && !space_info->reclaim_size)
1142 space_info->clamp = max(1, space_info->clamp - 1);
1143 trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
1144 spin_unlock(&space_info->lock);
1148 * FLUSH_DELALLOC_WAIT:
1149 * Space is freed from flushing delalloc in one of two ways.
1151 * 1) compression is on and we allocate less space than we reserved
1152 * 2) we are overwriting existing space
1154 * For #1 that extra space is reclaimed as soon as the delalloc pages are
1155 * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
1156 * length to ->bytes_reserved, and subtracts the reserved space from
1159 * For #2 this is trickier. Once the ordered extent runs we will drop the
1160 * extent in the range we are overwriting, which creates a delayed ref for
1161 * that freed extent. This however is not reclaimed until the transaction
1162 * commits, thus the next stages.
1165 * If we are freeing inodes, we want to make sure all delayed iputs have
1166 * completed, because they could have been on an inode with i_nlink == 0, and
1167 * thus have been truncated and freed up space. But again this space is not
1168 * immediately re-usable, it comes in the form of a delayed ref, which must be
1169 * run and then the transaction must be committed.
1171 * FLUSH_DELAYED_REFS
1172 * The above two cases generate delayed refs that will affect
1173 * ->total_bytes_pinned. However this counter can be inconsistent with
1174 * reality if there are outstanding delayed refs. This is because we adjust
1175 * the counter based solely on the current set of delayed refs and disregard
1176 * any on-disk state which might include more refs. So for example, if we
1177 * have an extent with 2 references, but we only drop 1, we'll see that there
1178 * is a negative delayed ref count for the extent and assume that the space
1179 * will be freed, and thus increase ->total_bytes_pinned.
1181 * Running the delayed refs gives us the actual real view of what will be
1182 * freed at the transaction commit time. This stage will not actually free
1183 * space for us, it just makes sure that may_commit_transaction() has all of
1184 * the information it needs to make the right decision.
1187 * This is where we reclaim all of the pinned space generated by the previous
1188 * two stages. We will not commit the transaction if we don't think we're
1189 * likely to satisfy our request, which means if our current free space +
1190 * total_bytes_pinned < reservation we will not commit. This is why the
1191 * previous states are actually important, to make sure we know for sure
1192 * whether committing the transaction will allow us to make progress.
1195 * For data we start with alloc chunk force, however we could have been full
1196 * before, and then the transaction commit could have freed new block groups,
1197 * so if we now have space to allocate do the force chunk allocation.
1199 static const enum btrfs_flush_state data_flush_states[] = {
1200 FLUSH_DELALLOC_WAIT,
1207 static void btrfs_async_reclaim_data_space(struct work_struct *work)
1209 struct btrfs_fs_info *fs_info;
1210 struct btrfs_space_info *space_info;
1211 u64 last_tickets_id;
1212 enum btrfs_flush_state flush_state = 0;
1214 fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
1215 space_info = fs_info->data_sinfo;
1217 spin_lock(&space_info->lock);
1218 if (list_empty(&space_info->tickets)) {
1219 space_info->flush = 0;
1220 spin_unlock(&space_info->lock);
1223 last_tickets_id = space_info->tickets_id;
1224 spin_unlock(&space_info->lock);
1226 while (!space_info->full) {
1227 flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
1228 spin_lock(&space_info->lock);
1229 if (list_empty(&space_info->tickets)) {
1230 space_info->flush = 0;
1231 spin_unlock(&space_info->lock);
1234 last_tickets_id = space_info->tickets_id;
1235 spin_unlock(&space_info->lock);
1238 while (flush_state < ARRAY_SIZE(data_flush_states)) {
1239 flush_space(fs_info, space_info, U64_MAX,
1240 data_flush_states[flush_state], false);
1241 spin_lock(&space_info->lock);
1242 if (list_empty(&space_info->tickets)) {
1243 space_info->flush = 0;
1244 spin_unlock(&space_info->lock);
1248 if (last_tickets_id == space_info->tickets_id) {
1251 last_tickets_id = space_info->tickets_id;
1255 if (flush_state >= ARRAY_SIZE(data_flush_states)) {
1256 if (space_info->full) {
1257 if (maybe_fail_all_tickets(fs_info, space_info))
1260 space_info->flush = 0;
1265 spin_unlock(&space_info->lock);
1269 void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
1271 INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
1272 INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
1273 INIT_WORK(&fs_info->preempt_reclaim_work,
1274 btrfs_preempt_reclaim_metadata_space);
1277 static const enum btrfs_flush_state priority_flush_states[] = {
1278 FLUSH_DELAYED_ITEMS_NR,
1279 FLUSH_DELAYED_ITEMS,
1283 static const enum btrfs_flush_state evict_flush_states[] = {
1284 FLUSH_DELAYED_ITEMS_NR,
1285 FLUSH_DELAYED_ITEMS,
1286 FLUSH_DELAYED_REFS_NR,
1289 FLUSH_DELALLOC_WAIT,
1294 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
1295 struct btrfs_space_info *space_info,
1296 struct reserve_ticket *ticket,
1297 const enum btrfs_flush_state *states,
1303 spin_lock(&space_info->lock);
1304 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
1306 spin_unlock(&space_info->lock);
1309 spin_unlock(&space_info->lock);
1313 flush_space(fs_info, space_info, to_reclaim, states[flush_state],
1316 spin_lock(&space_info->lock);
1317 if (ticket->bytes == 0) {
1318 spin_unlock(&space_info->lock);
1321 spin_unlock(&space_info->lock);
1322 } while (flush_state < states_nr);
1325 static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
1326 struct btrfs_space_info *space_info,
1327 struct reserve_ticket *ticket)
1329 while (!space_info->full) {
1330 flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
1331 spin_lock(&space_info->lock);
1332 if (ticket->bytes == 0) {
1333 spin_unlock(&space_info->lock);
1336 spin_unlock(&space_info->lock);
1340 static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
1341 struct btrfs_space_info *space_info,
1342 struct reserve_ticket *ticket)
1348 spin_lock(&space_info->lock);
1349 while (ticket->bytes > 0 && ticket->error == 0) {
1350 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
1353 * Delete us from the list. After we unlock the space
1354 * info, we don't want the async reclaim job to reserve
1355 * space for this ticket. If that would happen, then the
1356 * ticket's task would not know that space was reserved
1357 * despite getting an error, resulting in a space leak
1358 * (bytes_may_use counter of our space_info).
1360 remove_ticket(space_info, ticket);
1361 ticket->error = -EINTR;
1364 spin_unlock(&space_info->lock);
1368 finish_wait(&ticket->wait, &wait);
1369 spin_lock(&space_info->lock);
1371 spin_unlock(&space_info->lock);
1375 * Do the appropriate flushing and waiting for a ticket
1377 * @fs_info: the filesystem
1378 * @space_info: space info for the reservation
1379 * @ticket: ticket for the reservation
1380 * @start_ns: timestamp when the reservation started
1381 * @orig_bytes: amount of bytes originally reserved
1382 * @flush: how much we can flush
1384 * This does the work of figuring out how to flush for the ticket, waiting for
1385 * the reservation, and returning the appropriate error if there is one.
1387 static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
1388 struct btrfs_space_info *space_info,
1389 struct reserve_ticket *ticket,
1390 u64 start_ns, u64 orig_bytes,
1391 enum btrfs_reserve_flush_enum flush)
1396 case BTRFS_RESERVE_FLUSH_DATA:
1397 case BTRFS_RESERVE_FLUSH_ALL:
1398 case BTRFS_RESERVE_FLUSH_ALL_STEAL:
1399 wait_reserve_ticket(fs_info, space_info, ticket);
1401 case BTRFS_RESERVE_FLUSH_LIMIT:
1402 priority_reclaim_metadata_space(fs_info, space_info, ticket,
1403 priority_flush_states,
1404 ARRAY_SIZE(priority_flush_states));
1406 case BTRFS_RESERVE_FLUSH_EVICT:
1407 priority_reclaim_metadata_space(fs_info, space_info, ticket,
1409 ARRAY_SIZE(evict_flush_states));
1411 case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
1412 priority_reclaim_data_space(fs_info, space_info, ticket);
1419 spin_lock(&space_info->lock);
1420 ret = ticket->error;
1421 if (ticket->bytes || ticket->error) {
1423 * We were a priority ticket, so we need to delete ourselves
1424 * from the list. Because we could have other priority tickets
1425 * behind us that require less space, run
1426 * btrfs_try_granting_tickets() to see if their reservations can
1429 if (!list_empty(&ticket->list)) {
1430 remove_ticket(space_info, ticket);
1431 btrfs_try_granting_tickets(fs_info, space_info);
1437 spin_unlock(&space_info->lock);
1438 ASSERT(list_empty(&ticket->list));
1440 * Check that we can't have an error set if the reservation succeeded,
1441 * as that would confuse tasks and lead them to error out without
1442 * releasing reserved space (if an error happens the expectation is that
1443 * space wasn't reserved at all).
1445 ASSERT(!(ticket->bytes == 0 && ticket->error));
1446 trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
1447 start_ns, flush, ticket->error);
1452 * This returns true if this flush state will go through the ordinary flushing
1455 static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
1457 return (flush == BTRFS_RESERVE_FLUSH_ALL) ||
1458 (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
1461 static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
1462 struct btrfs_space_info *space_info)
1464 u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
1465 u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
1468 * If we're heavy on ordered operations then clamping won't help us. We
1469 * need to clamp specifically to keep up with dirtying buffered
1470 * writers, because there's not a 1:1 correlation of writing delalloc
1471 * and freeing space, like there is with flushing delayed refs or
1472 * delayed nodes. If we're already more ordered than delalloc then
1473 * we're keeping up, otherwise we aren't and should probably clamp.
1475 if (ordered < delalloc)
1476 space_info->clamp = min(space_info->clamp + 1, 8);
1480 * Try to reserve bytes from the block_rsv's space
1482 * @fs_info: the filesystem
1483 * @space_info: space info we want to allocate from
1484 * @orig_bytes: number of bytes we want
1485 * @flush: whether or not we can flush to make our reservation
1487 * This will reserve orig_bytes number of bytes from the space info associated
1488 * with the block_rsv. If there is not enough space it will make an attempt to
1489 * flush out space to make room. It will do this by flushing delalloc if
1490 * possible or committing the transaction. If flush is 0 then no attempts to
1491 * regain reservations will be made and this will fail if there is not enough
1494 static int __reserve_bytes(struct btrfs_fs_info *fs_info,
1495 struct btrfs_space_info *space_info, u64 orig_bytes,
1496 enum btrfs_reserve_flush_enum flush)
1498 struct work_struct *async_work;
1499 struct reserve_ticket ticket;
1503 bool pending_tickets;
1506 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
1508 if (flush == BTRFS_RESERVE_FLUSH_DATA)
1509 async_work = &fs_info->async_data_reclaim_work;
1511 async_work = &fs_info->async_reclaim_work;
1513 spin_lock(&space_info->lock);
1515 used = btrfs_space_info_used(space_info, true);
1518 * We don't want NO_FLUSH allocations to jump everybody, they can
1519 * generally handle ENOSPC in a different way, so treat them the same as
1520 * normal flushers when it comes to skipping pending tickets.
1522 if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
1523 pending_tickets = !list_empty(&space_info->tickets) ||
1524 !list_empty(&space_info->priority_tickets);
1526 pending_tickets = !list_empty(&space_info->priority_tickets);
1529 * Carry on if we have enough space (short-circuit) OR call
1530 * can_overcommit() to ensure we can overcommit to continue.
1532 if (!pending_tickets &&
1533 ((used + orig_bytes <= space_info->total_bytes) ||
1534 btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
1535 btrfs_space_info_update_bytes_may_use(fs_info, space_info,
1541 * If we couldn't make a reservation then setup our reservation ticket
1542 * and kick the async worker if it's not already running.
1544 * If we are a priority flusher then we just need to add our ticket to
1545 * the list and we will do our own flushing further down.
1547 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
1548 ticket.bytes = orig_bytes;
1550 space_info->reclaim_size += ticket.bytes;
1551 init_waitqueue_head(&ticket.wait);
1552 ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
1553 if (trace_btrfs_reserve_ticket_enabled())
1554 start_ns = ktime_get_ns();
1556 if (flush == BTRFS_RESERVE_FLUSH_ALL ||
1557 flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
1558 flush == BTRFS_RESERVE_FLUSH_DATA) {
1559 list_add_tail(&ticket.list, &space_info->tickets);
1560 if (!space_info->flush) {
1561 space_info->flush = 1;
1562 trace_btrfs_trigger_flush(fs_info,
1566 queue_work(system_unbound_wq, async_work);
1569 list_add_tail(&ticket.list,
1570 &space_info->priority_tickets);
1574 * We were forced to add a reserve ticket, so our preemptive
1575 * flushing is unable to keep up. Clamp down on the threshold
1576 * for the preemptive flushing in order to keep up with the
1579 maybe_clamp_preempt(fs_info, space_info);
1580 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
1583 * We will do the space reservation dance during log replay,
1584 * which means we won't have fs_info->fs_root set, so don't do
1585 * the async reclaim as we will panic.
1587 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
1588 need_preemptive_reclaim(fs_info, space_info) &&
1589 !work_busy(&fs_info->preempt_reclaim_work)) {
1590 trace_btrfs_trigger_flush(fs_info, space_info->flags,
1591 orig_bytes, flush, "preempt");
1592 queue_work(system_unbound_wq,
1593 &fs_info->preempt_reclaim_work);
1596 spin_unlock(&space_info->lock);
1597 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
1600 return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
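
/*
 * Illustrative usage sketch, not part of the original file: how a caller might
 * reserve metadata space for a single tree item through a block reserve. Real
 * callers typically go through btrfs_block_rsv_add(), which wraps this; the
 * helper name below is hypothetical.
 */
static int example_reserve_one_item(struct btrfs_root *root,
				    struct btrfs_block_rsv *rsv)
{
	u64 num_bytes = btrfs_calc_insert_metadata_size(root->fs_info, 1);

	/* Allow the full flushing state machine before returning ENOSPC. */
	return btrfs_reserve_metadata_bytes(root, rsv, num_bytes,
					    BTRFS_RESERVE_FLUSH_ALL);
}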
1605 * Try to reserve metadata bytes from the block_rsv's space
1607 * @root: the root we're allocating for
1608 * @block_rsv: block_rsv we're allocating for
1609 * @orig_bytes: number of bytes we want
1610 * @flush: whether or not we can flush to make our reservation
1612 * This will reserve orig_bytes number of bytes from the space info associated
1613 * with the block_rsv. If there is not enough space it will make an attempt to
1614 * flush out space to make room. It will do this by flushing delalloc if
1615 * possible or committing the transaction. If flush is 0 then no attempts to
1616 * regain reservations will be made and this will fail if there is not enough
1619 int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
1620 struct btrfs_block_rsv *block_rsv,
1622 enum btrfs_reserve_flush_enum flush)
1624 struct btrfs_fs_info *fs_info = root->fs_info;
1625 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
1628 ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
1629 if (ret == -ENOSPC &&
1630 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
1631 if (block_rsv != global_rsv &&
1632 !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
1635 if (ret == -ENOSPC) {
1636 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1637 block_rsv->space_info->flags,
1640 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1641 btrfs_dump_space_info(fs_info, block_rsv->space_info,
1648 * Try to reserve data bytes for an allocation
1650 * @fs_info: the filesystem
1651 * @bytes: number of bytes we need
1652 * @flush: how we are allowed to flush
1654 * This will reserve bytes from the data space info. If there is not enough
1655 * space then we will attempt to flush space as specified by flush.
1657 int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
1658 enum btrfs_reserve_flush_enum flush)
1660 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
1663 ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
1664 flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
1665 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
1667 ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
1668 if (ret == -ENOSPC) {
1669 trace_btrfs_space_reservation(fs_info, "space_info:enospc",
1670 data_sinfo->flags, bytes, 1);
1671 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
1672 btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);