1 // SPDX-License-Identifier: GPL-2.0
3 #include "tree-mod-log.h"
/*
 * One recorded modification of a tree block. Entries live in the
 * fs_info->tree_mod_log rb-tree, keyed by block logical address and then
 * by sequence number (see tree_mod_log_insert()).
 */
11 struct tree_mod_elem {
/* Which kind of modification this entry records. */
15 enum btrfs_mod_log_op op;
18 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
23 /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
26 /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
27 struct btrfs_disk_key key;
30 /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
36 /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
37 struct tree_mod_root old_root;
41 * Pull a new tree mod seq number for our operation.
/* Atomically increments fs_info->tree_mod_seq and returns the new value. */
43 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
45 return atomic64_inc_return(&fs_info->tree_mod_seq);
49 * This adds a new blocker to the tree mod log's blocker list if the @elem
50 * passed does not already have a sequence number set. So when a caller expects
51 * to record tree modifications, it should ensure to set elem->seq to zero
52 * before calling btrfs_get_tree_mod_seq.
53 * Returns a fresh, unused tree log modification sequence number, even if no new
56 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
57 struct btrfs_seq_list *elem)
/* Serialize against log insertion/pruning while mutating the blocker list. */
59 write_lock(&fs_info->tree_mod_log_lock);
61 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
/*
 * Appending at the tail keeps tree_mod_seq_list ordered by ascending seq,
 * since seq numbers are handed out monotonically under this lock.
 */
62 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
64 write_unlock(&fs_info->tree_mod_log_lock);
/*
 * Drop the blocker @elem and, if it was the oldest one, prune log entries
 * that are no longer protected by any remaining blocker.
 */
69 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
70 struct btrfs_seq_list *elem)
72 struct rb_root *tm_root;
75 struct tree_mod_elem *tm;
76 u64 min_seq = BTRFS_SEQ_LAST;
77 u64 seq_putting = elem->seq;
/* Remove the blocker from the list under the log's write lock. */
82 write_lock(&fs_info->tree_mod_log_lock);
83 list_del(&elem->list);
86 if (!list_empty(&fs_info->tree_mod_seq_list)) {
87 struct btrfs_seq_list *first;
/* The list head is the oldest (lowest seq) remaining blocker. */
89 first = list_first_entry(&fs_info->tree_mod_seq_list,
90 struct btrfs_seq_list, list);
91 if (seq_putting > first->seq) {
93 * Blocker with lower sequence number exists, we cannot
94 * remove anything from the log.
96 write_unlock(&fs_info->tree_mod_log_lock);
103 * Anything that's lower than the lowest existing (read: blocked)
104 * sequence number can be removed from the tree.
106 tm_root = &fs_info->tree_mod_log;
/* Full rb-tree walk; 'next' is cached because the loop erases 'node'. */
107 for (node = rb_first(tm_root); node; node = next) {
108 next = rb_next(node);
109 tm = rb_entry(node, struct tree_mod_elem, node);
/* Entries at or above min_seq are still needed by some blocker. */
110 if (tm->seq >= min_seq)
112 rb_erase(node, tm_root);
115 write_unlock(&fs_info->tree_mod_log_lock);
119 * Key order of the log:
120 * node/leaf start address -> sequence
122 * The 'start address' is the logical address of the *new* root node for root
123 * replace operations, or the logical address of the affected block for all
126 static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
127 struct tree_mod_elem *tm)
129 struct rb_root *tm_root;
130 struct rb_node **new;
131 struct rb_node *parent = NULL;
132 struct tree_mod_elem *cur;
/* Callers must hold the write side of fs_info->tree_mod_log_lock. */
134 lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
/* The element only gets its sequence number at insertion time. */
136 tm->seq = btrfs_inc_tree_mod_seq(fs_info);
138 tm_root = &fs_info->tree_mod_log;
139 new = &tm_root->rb_node;
/* rb-tree descent: compare by logical address first, then by seq. */
141 cur = rb_entry(*new, struct tree_mod_elem, node);
143 if (cur->logical < tm->logical)
144 new = &((*new)->rb_left);
145 else if (cur->logical > tm->logical)
146 new = &((*new)->rb_right);
147 else if (cur->seq < tm->seq)
148 new = &((*new)->rb_left);
149 else if (cur->seq > tm->seq)
150 new = &((*new)->rb_right);
/* Link the new node at the slot found and rebalance. */
155 rb_link_node(&tm->node, parent, new);
156 rb_insert_color(&tm->node, tm_root);
161 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
162 * returns zero with the tree_mod_log_lock acquired. The caller must hold
163 * this until all tree mod log insertions are recorded in the rb tree and then
164 * write unlock fs_info::tree_mod_log_lock.
166 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
167 struct extent_buffer *eb)
/* No blockers registered: nothing needs to be logged. */
170 if (list_empty(&(fs_info)->tree_mod_seq_list))
/* Leaves (level 0) are never tracked by the tree mod log. */
172 if (eb && btrfs_header_level(eb) == 0)
175 write_lock(&fs_info->tree_mod_log_lock);
/* Re-check under the lock: the last blocker may have just been dropped. */
176 if (list_empty(&(fs_info)->tree_mod_seq_list)) {
177 write_unlock(&fs_info->tree_mod_log_lock);
184 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
185 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
186 struct extent_buffer *eb)
/*
 * Lockless fast-path check; callers that proceed must confirm with
 * tree_mod_dont_log() before inserting (see btrfs_tree_mod_log_insert_key).
 */
189 if (list_empty(&(fs_info)->tree_mod_seq_list))
/* Leaves are never logged. */
191 if (eb && btrfs_header_level(eb) == 0)
/*
 * Allocate and pre-fill a log element for one key slot of @eb.
 * Returns NULL on allocation failure (elided check after kzalloc).
 */
197 static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
199 enum btrfs_mod_log_op op,
202 struct tree_mod_elem *tm;
204 tm = kzalloc(sizeof(*tm), flags);
208 tm->logical = eb->start;
/* For anything but an add, snapshot the key data that will be lost. */
209 if (op != BTRFS_MOD_LOG_KEY_ADD) {
210 btrfs_node_key(eb, &tm->key, slot);
211 tm->blockptr = btrfs_node_blockptr(eb, slot);
215 tm->generation = btrfs_node_ptr_generation(eb, slot);
/* Mark the node as not yet linked into the rb-tree. */
216 RB_CLEAR_NODE(&tm->node);
/*
 * Log a single-key modification (@op) of slot @slot in node @eb.
 */
221 int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
222 enum btrfs_mod_log_op op, gfp_t flags)
224 struct tree_mod_elem *tm;
/* Lockless fast path: bail out before allocating if logging is off. */
227 if (!tree_mod_need_log(eb->fs_info, eb))
230 tm = alloc_tree_mod_elem(eb, slot, op, flags);
/* Re-check under the lock; logging may have become unnecessary meanwhile. */
234 if (tree_mod_dont_log(eb->fs_info, eb)) {
/* tree_mod_dont_log() returned 0, so we hold the write lock here. */
239 ret = tree_mod_log_insert(eb->fs_info, tm);
240 write_unlock(&eb->fs_info->tree_mod_log_lock);
/*
 * Log the move of @nr_items key pointers from @src_slot to @dst_slot
 * within node @eb.
 */
247 int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
248 int dst_slot, int src_slot,
251 struct tree_mod_elem *tm = NULL;
252 struct tree_mod_elem **tm_list = NULL;
257 if (!tree_mod_need_log(eb->fs_info, eb))
260 tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
264 tm = kzalloc(sizeof(*tm), GFP_NOFS);
270 tm->logical = eb->start;
272 tm->move.dst_slot = dst_slot;
273 tm->move.nr_items = nr_items;
274 tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
/* Pre-allocate removal records for the slots the move will overwrite. */
276 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
277 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
278 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
285 if (tree_mod_dont_log(eb->fs_info, eb))
290 * When we override something during the move, we log these removals.
291 * This can only happen when we move towards the beginning of the
292 * buffer, i.e. dst_slot < src_slot.
294 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
295 ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
300 ret = tree_mod_log_insert(eb->fs_info, tm);
303 write_unlock(&eb->fs_info->tree_mod_log_lock);
/* Error path: unlink any removal records already in the rb-tree. */
309 for (i = 0; i < nr_items; i++) {
310 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
311 rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
315 write_unlock(&eb->fs_info->tree_mod_log_lock);
/*
 * Insert all @nritems pre-built records into the log; on failure, unwind
 * the ones already linked so no partial state is left in the rb-tree.
 */
322 static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
323 struct tree_mod_elem **tm_list,
/* Insert from the highest slot down. */
329 for (i = nritems - 1; i >= 0; i--) {
330 ret = tree_mod_log_insert(fs_info, tm_list[i]);
/* Undo the insertions done so far (the entries with index > i). */
332 for (j = nritems - 1; j > i; j--)
333 rb_erase(&tm_list[j]->node,
334 &fs_info->tree_mod_log);
/*
 * Log the replacement of root node @old_root by @new_root. With
 * @log_removal set, also log the removal of every key in the old root.
 */
342 int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
343 struct extent_buffer *new_root,
346 struct btrfs_fs_info *fs_info = old_root->fs_info;
347 struct tree_mod_elem *tm = NULL;
348 struct tree_mod_elem **tm_list = NULL;
353 if (!tree_mod_need_log(fs_info, NULL))
/* Only non-leaf roots carry key pointers that need removal records. */
356 if (log_removal && btrfs_header_level(old_root) > 0) {
357 nritems = btrfs_header_nritems(old_root);
358 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
364 for (i = 0; i < nritems; i++) {
365 tm_list[i] = alloc_tree_mod_elem(old_root, i,
366 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
374 tm = kzalloc(sizeof(*tm), GFP_NOFS);
/* Root-replace records are keyed by the *new* root's logical address. */
380 tm->logical = new_root->start;
381 tm->old_root.logical = old_root->start;
382 tm->old_root.level = btrfs_header_level(old_root);
383 tm->generation = btrfs_header_generation(old_root);
384 tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
386 if (tree_mod_dont_log(fs_info, NULL))
/* First log the old root's key removals, then the replacement itself. */
390 ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
392 ret = tree_mod_log_insert(fs_info, tm);
394 write_unlock(&fs_info->tree_mod_log_lock);
/* Cleanup path: release the per-key records that were allocated. */
403 for (i = 0; i < nritems; i++)
/*
 * Look up a log element for block @start with seq >= @min_seq. With
 * @smallest set, pick the matching entry with the lowest seq, otherwise
 * the one with the highest seq.
 */
412 static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
413 u64 start, u64 min_seq,
416 struct rb_root *tm_root;
417 struct rb_node *node;
418 struct tree_mod_elem *cur = NULL;
419 struct tree_mod_elem *found = NULL;
421 read_lock(&fs_info->tree_mod_log_lock);
422 tm_root = &fs_info->tree_mod_log;
423 node = tm_root->rb_node;
/* Descent mirrors the insert ordering: logical address first, then seq. */
425 cur = rb_entry(node, struct tree_mod_elem, node);
426 if (cur->logical < start) {
427 node = node->rb_left;
428 } else if (cur->logical > start) {
429 node = node->rb_right;
430 } else if (cur->seq < min_seq) {
/* Entries below min_seq are too old to be of interest. */
431 node = node->rb_left;
432 } else if (!smallest) {
433 /* We want the node with the highest seq */
435 BUG_ON(found->seq > cur->seq);
437 node = node->rb_left;
438 } else if (cur->seq > min_seq) {
439 /* We want the node with the smallest seq */
441 BUG_ON(found->seq < cur->seq);
443 node = node->rb_right;
449 read_unlock(&fs_info->tree_mod_log_lock);
455 * This returns the element from the log with the smallest time sequence
456 * value that's in the log (the oldest log item). Any element with a time
457 * sequence lower than min_seq will be ignored.
459 static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
460 u64 start, u64 min_seq)
/* smallest=1 selects the oldest (lowest-seq) matching entry. */
462 return __tree_mod_log_search(fs_info, start, min_seq, 1);
466 * This returns the element from the log with the largest time sequence
467 * value that's in the log (the most recent log item). Any element with
468 * a time sequence lower than min_seq will be ignored.
470 static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
471 u64 start, u64 min_seq)
/* smallest=0 selects the newest (highest-seq) matching entry. */
473 return __tree_mod_log_search(fs_info, start, min_seq, 0);
/*
 * Log a bulk copy of @nr_items key pointers from @src to @dst: one
 * removal record per source slot plus one add record per destination slot.
 */
476 int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
477 struct extent_buffer *src,
478 unsigned long dst_offset,
479 unsigned long src_offset,
482 struct btrfs_fs_info *fs_info = dst->fs_info;
484 struct tree_mod_elem **tm_list = NULL;
485 struct tree_mod_elem **tm_list_add, **tm_list_rem;
489 if (!tree_mod_need_log(fs_info, NULL))
/* Copies between two leaves are not tracked by the log. */
492 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
/* One allocation split in half: adds in the front, removals in the back. */
495 tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
500 tm_list_add = tm_list;
501 tm_list_rem = tm_list + nr_items;
502 for (i = 0; i < nr_items; i++) {
503 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
504 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
505 if (!tm_list_rem[i]) {
510 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
511 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
512 if (!tm_list_add[i]) {
518 if (tree_mod_dont_log(fs_info, NULL))
/* For each slot, log the removal from @src and the add into @dst. */
522 for (i = 0; i < nr_items; i++) {
523 ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
526 ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
531 write_unlock(&fs_info->tree_mod_log_lock);
/* Error path: unlink every record already linked, then free them all. */
537 for (i = 0; i < nr_items * 2; i++) {
538 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
539 rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
543 write_unlock(&fs_info->tree_mod_log_lock);
/*
 * Log the freeing of node @eb by recording the removal of every key
 * pointer it holds.
 */
549 int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
551 struct tree_mod_elem **tm_list = NULL;
/* Leaves are not tracked by the tree mod log. */
556 if (btrfs_header_level(eb) == 0)
559 if (!tree_mod_need_log(eb->fs_info, NULL))
562 nritems = btrfs_header_nritems(eb);
563 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
/* One removal record per key slot. */
567 for (i = 0; i < nritems; i++) {
568 tm_list[i] = alloc_tree_mod_elem(eb, i,
569 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
576 if (tree_mod_dont_log(eb->fs_info, eb))
/* Helper inserts all records and unwinds on failure. */
579 ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
580 write_unlock(&eb->fs_info->tree_mod_log_lock);
/* Cleanup path: release the allocated per-key records. */
588 for (i = 0; i < nritems; i++)
596 * Returns the logical address of the oldest predecessor of the given root.
597 * Entries older than time_seq are ignored.
599 static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
602 struct tree_mod_elem *tm;
603 struct tree_mod_elem *found = NULL;
/* Start the walk at the current root's logical address. */
604 u64 root_logical = eb_root->start;
611 * The very last operation that's logged for a root is the replacement
612 * operation (if it is replaced at all). This has the logical address
613 * of the *new* root, making it the very first operation that's logged
/* Look up the oldest record for the current root candidate. */
617 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
622 * If there are no tree operation for the oldest root, we simply
623 * return it. This should only happen if that (old) root is at
630 * If there's an operation that's not a root replacement, we
631 * found the oldest version of our root. Normally, we'll find a
632 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
634 if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
/* Follow the chain of root replacements backwards in time. */
638 root_logical = tm->old_root.logical;
642 /* If there's no old root to return, return what we found instead */
651 * tm is a pointer to the first operation to rewind within eb. Then, all
652 * previous operations will be rewound (until we reach something older than
655 static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
656 struct extent_buffer *eb,
658 struct tree_mod_elem *first_tm)
661 struct rb_node *next;
662 struct tree_mod_elem *tm = first_tm;
/* Size of one key pointer; used for the MOVE_KEYS memmove below. */
665 unsigned long p_size = sizeof(struct btrfs_key_ptr);
667 n = btrfs_header_nritems(eb);
/* Readers may walk the log concurrently; only the read lock is needed. */
668 read_lock(&fs_info->tree_mod_log_lock);
669 while (tm && tm->seq >= time_seq) {
671 * All the operations are recorded with the operator used for
672 * the modification. As we're going backwards, we do the
673 * opposite of each operation here.
676 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
677 BUG_ON(tm->slot < n);
/* Stacked case labels: all three removal flavors restore the key. */
679 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
680 case BTRFS_MOD_LOG_KEY_REMOVE:
/* Undo a removal by putting the saved key/blockptr/generation back. */
681 btrfs_set_node_key(eb, &tm->key, tm->slot);
682 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
683 btrfs_set_node_ptr_generation(eb, tm->slot,
687 case BTRFS_MOD_LOG_KEY_REPLACE:
688 BUG_ON(tm->slot >= n);
/* Undo a replace by restoring the previous key/blockptr/generation. */
689 btrfs_set_node_key(eb, &tm->key, tm->slot);
690 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
691 btrfs_set_node_ptr_generation(eb, tm->slot,
694 case BTRFS_MOD_LOG_KEY_ADD:
695 /* if a move operation is needed it's in the log */
698 case BTRFS_MOD_LOG_MOVE_KEYS:
/* Undo a move by copying the key pointers back to their old slots. */
699 o_dst = btrfs_node_key_ptr_offset(tm->slot);
700 o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
701 memmove_extent_buffer(eb, o_dst, o_src,
702 tm->move.nr_items * p_size);
704 case BTRFS_MOD_LOG_ROOT_REPLACE:
706 * This operation is special. For roots, this must be
707 * handled explicitly before rewinding.
708 * For non-roots, this operation may exist if the node
709 * was a root: root A -> child B; then A gets empty and
710 * B is promoted to the new root. In the mod log, we'll
711 * have a root-replace operation for B, a tree block
712 * that is no root. We simply ignore that operation.
/* Continue with the next record; stop once we leave this block's range. */
716 next = rb_next(&tm->node);
719 tm = rb_entry(next, struct tree_mod_elem, node);
720 if (tm->logical != first_tm->logical)
723 read_unlock(&fs_info->tree_mod_log_lock);
/* Publish the item count as it stood at time_seq. */
724 btrfs_set_header_nritems(eb, n);
728 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
729 * is returned. If rewind operations happen, a fresh buffer is returned. The
730 * returned buffer is always read-locked. If the returned buffer is not the
731 * input buffer, the lock on the input buffer is released and the input buffer
732 * is freed (its refcount is decremented).
734 struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
735 struct btrfs_path *path,
736 struct extent_buffer *eb,
739 struct extent_buffer *eb_rewin;
740 struct tree_mod_elem *tm;
/* Leaves are never logged, so they can never be rewound. */
745 if (btrfs_header_level(eb) == 0)
748 tm = tree_mod_log_search(fs_info, eb->start, time_seq);
752 if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
753 BUG_ON(tm->slot != 0);
/* The block was freed: rebuild it from scratch on a dummy buffer. */
754 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
756 btrfs_tree_read_unlock(eb);
757 free_extent_buffer(eb);
/* Copy the header fields of the original buffer onto the dummy. */
760 btrfs_set_header_bytenr(eb_rewin, eb->start);
761 btrfs_set_header_backref_rev(eb_rewin,
762 btrfs_header_backref_rev(eb));
763 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
764 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
/* Otherwise start from a private copy of the current buffer. */
766 eb_rewin = btrfs_clone_extent_buffer(eb);
768 btrfs_tree_read_unlock(eb);
769 free_extent_buffer(eb);
/* Hand over: drop the caller's lock and reference on the original. */
774 btrfs_tree_read_unlock(eb);
775 free_extent_buffer(eb);
777 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
778 eb_rewin, btrfs_header_level(eb_rewin));
779 btrfs_tree_read_lock(eb_rewin);
/* Replay the logged operations backwards onto the private copy. */
780 tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
781 WARN_ON(btrfs_header_nritems(eb_rewin) >
782 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
788 * Rewind the state of @root's root node to the given @time_seq value.
789 * If there are no changes, the current root->root_node is returned. If anything
790 * changed in between, there's a fresh buffer allocated on which the rewind
791 * operations are done. In any case, the returned buffer is read locked.
792 * Returns NULL on error (with no locks held).
794 struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
796 struct btrfs_fs_info *fs_info = root->fs_info;
797 struct tree_mod_elem *tm;
798 struct extent_buffer *eb = NULL;
799 struct extent_buffer *eb_root;
800 u64 eb_root_owner = 0;
801 struct extent_buffer *old;
802 struct tree_mod_root *old_root = NULL;
803 u64 old_generation = 0;
807 eb_root = btrfs_read_lock_root_node(root);
/* Find the oldest logged ancestor of the current root. */
808 tm = tree_mod_log_oldest_root(eb_root, time_seq);
812 if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
/* The root was replaced: the rewind starts from the old root block. */
813 old_root = &tm->old_root;
814 old_generation = tm->generation;
815 logical = old_root->logical;
816 level = old_root->level;
818 logical = eb_root->start;
819 level = btrfs_header_level(eb_root);
/* Fetch the records to replay against the chosen starting block. */
822 tm = tree_mod_log_search(fs_info, logical, time_seq);
823 if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
824 btrfs_tree_read_unlock(eb_root);
825 free_extent_buffer(eb_root);
/* The old root block still exists on disk: read and clone it. */
826 old = read_tree_block(fs_info, logical, root->root_key.objectid,
828 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
830 free_extent_buffer(old);
832 "failed to read tree block %llu from get_old_root",
835 btrfs_tree_read_lock(old);
836 eb = btrfs_clone_extent_buffer(old);
/* The clone is private; the on-disk buffer can be released. */
837 btrfs_tree_read_unlock(old);
838 free_extent_buffer(old);
840 } else if (old_root) {
/* The old root block was freed: rebuild it on a dummy buffer. */
841 eb_root_owner = btrfs_header_owner(eb_root);
842 btrfs_tree_read_unlock(eb_root);
843 free_extent_buffer(eb_root);
844 eb = alloc_dummy_extent_buffer(fs_info, logical);
/* No replacement happened: just clone the current root. */
846 eb = btrfs_clone_extent_buffer(eb_root);
847 btrfs_tree_read_unlock(eb_root);
848 free_extent_buffer(eb_root);
/* For a replaced root, reconstruct the header from the logged record. */
854 btrfs_set_header_bytenr(eb, eb->start);
855 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
856 btrfs_set_header_owner(eb, eb_root_owner);
857 btrfs_set_header_level(eb, old_root->level);
858 btrfs_set_header_generation(eb, old_generation);
860 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
861 btrfs_header_level(eb));
862 btrfs_tree_read_lock(eb);
/* Replay any logged operations onto the private buffer. */
864 tree_mod_log_rewind(fs_info, eb, time_seq, tm);
866 WARN_ON(btrfs_header_level(eb) != 0);
867 WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
/*
 * Return the level @root's root node had at @time_seq: taken from the
 * oldest root-replace record if one exists, else from the current root.
 */
872 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
874 struct tree_mod_elem *tm;
/* btrfs_root_node() takes a reference on the root's extent buffer. */
876 struct extent_buffer *eb_root = btrfs_root_node(root);
878 tm = tree_mod_log_oldest_root(eb_root, time_seq);
879 if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
880 level = tm->old_root.level;
882 level = btrfs_header_level(eb_root);
/* Drop the reference taken by btrfs_root_node(). */
884 free_extent_buffer(eb_root);