099503a3e2c9527fcd27b369cb0026364131106b
[platform/kernel/linux-rpi.git] / fs / btrfs / tree-mod-log.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "tree-mod-log.h"
4 #include "disk-io.h"
5
/* Snapshot of a root node's identity, recorded by ROOT_REPLACE log entries. */
struct tree_mod_root {
	/* Logical address of the old root node. */
	u64 logical;
	/* Level of the old root node. */
	u8 level;
};
10
/*
 * One logged modification of a tree block. Entries live in
 * fs_info->tree_mod_log, an rb tree keyed by (logical, seq).
 */
struct tree_mod_elem {
	/* Anchor in fs_info->tree_mod_log. */
	struct rb_node node;
	/*
	 * Logical address of the affected block, or of the *new* root for
	 * BTRFS_MOD_LOG_ROOT_REPLACE operations (see tree_mod_log_insert()).
	 */
	u64 logical;
	/* Sequence number, assigned under the log lock at insertion time. */
	u64 seq;
	/* Which kind of modification this entry records. */
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};
39
40 /*
41  * Pull a new tree mod seq number for our operation.
42  */
43 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
44 {
45         return atomic64_inc_return(&fs_info->tree_mod_seq);
46 }
47
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		/*
		 * First registration of this blocker: append at the tail so
		 * the list stays ordered by ascending sequence number (the
		 * head is the oldest blocker, see btrfs_put_tree_mod_seq()).
		 */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
68
/*
 * Drop a blocker previously registered with btrfs_get_tree_mod_seq() and
 * garbage collect all log entries no longer needed by any remaining blocker
 * (i.e. entries with a sequence number below the oldest registered one).
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	/* A blocker that was never registered (seq == 0) needs no work. */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *first;

		/*
		 * The blocker list is ordered by ascending seq, so the head
		 * entry holds the oldest still-registered blocker.
		 */
		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Fetch the successor before erasing the current node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
117
/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	/* Assign the seq under the lock so rb tree order matches seq order. */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		/*
		 * NOTE: the left/right descent is mirrored compared to the
		 * usual convention; __tree_mod_log_search() walks the tree
		 * with the same mirrored comparisons, so lookups stay
		 * consistent with insertion.
		 */
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
159
160 /*
161  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
162  * returns zero with the tree_mod_log_lock acquired. The caller must hold
163  * this until all tree mod log insertions are recorded in the rb tree and then
164  * write unlock fs_info::tree_mod_log_lock.
165  */
166 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
167                                     struct extent_buffer *eb)
168 {
169         smp_mb();
170         if (list_empty(&(fs_info)->tree_mod_seq_list))
171                 return 1;
172         if (eb && btrfs_header_level(eb) == 0)
173                 return 1;
174
175         write_lock(&fs_info->tree_mod_log_lock);
176         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
177                 write_unlock(&fs_info->tree_mod_log_lock);
178                 return 1;
179         }
180
181         return 0;
182 }
183
184 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
185 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
186                                     struct extent_buffer *eb)
187 {
188         smp_mb();
189         if (list_empty(&(fs_info)->tree_mod_seq_list))
190                 return 0;
191         if (eb && btrfs_header_level(eb) == 0)
192                 return 0;
193
194         return 1;
195 }
196
/*
 * Allocate and fill a log element describing operation @op on slot @slot of
 * node @eb. Returns NULL on allocation failure. The element is not yet part
 * of the log rb tree (its rb node is cleared for RB_EMPTY_NODE() checks in
 * error paths).
 */
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op,
						 gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		/*
		 * Record the key and block pointer being overwritten or
		 * removed so a rewind can restore them; a KEY_ADD has no
		 * previous value to preserve.
		 */
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
220
/*
 * Log a single-key operation @op on slot @slot of node @eb.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or an error from tree_mod_log_insert().
 */
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* Cheap lockless check first, before allocating anything. */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* The last blocker may have gone away since the lockless check. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned 0, so we hold the log lock for write. */
	ret = tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
246
/*
 * Log the move of @nr_items key pointers within node @eb from @src_slot to
 * @dst_slot. Keys that get overwritten by the move (possible only when moving
 * towards the beginning of the buffer) are additionally logged as removals so
 * they can be restored on rewind.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or an error from tree_mod_log_insert().
 */
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;

	/*
	 * Pre-allocate removal entries for the slots in [dst_slot, src_slot)
	 * that the move will overwrite, before taking the log lock.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On return 0, the log lock is held for write until we release it. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/*
	 * Unwind: erase any entries already linked into the rb tree (detected
	 * via RB_EMPTY_NODE, cleared by alloc_tree_mod_elem()) and free all.
	 */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
321
322 static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
323                                        struct tree_mod_elem **tm_list,
324                                        int nritems)
325 {
326         int i, j;
327         int ret;
328
329         for (i = nritems - 1; i >= 0; i--) {
330                 ret = tree_mod_log_insert(fs_info, tm_list[i]);
331                 if (ret) {
332                         for (j = nritems - 1; j > i; j--)
333                                 rb_erase(&tm_list[j]->node,
334                                          &fs_info->tree_mod_log);
335                         return ret;
336                 }
337         }
338
339         return 0;
340 }
341
/*
 * Log the replacement of the root node @old_root by @new_root. If
 * @log_removal is set and the old root is not a leaf, removal entries for all
 * of the old root's key pointers are logged as well, so a rewind can rebuild
 * its content.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or an error from the log insertions.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		/* Pre-allocate one removal entry per pointer in the old root. */
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* ROOT_REPLACE entries are keyed by the *new* root's logical address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

	/* On return 0, the log lock is held for write until we release it. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
411
/*
 * Search the log for an entry with logical address @start and a sequence
 * number >= @min_seq. When @smallest is set the entry with the smallest
 * qualifying seq is returned (the oldest relevant item), otherwise the one
 * with the largest seq (the most recent item). Returns NULL if no entry
 * qualifies.
 *
 * The descent mirrors the comparisons in tree_mod_log_insert().
 */
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			/* Too old to be of interest, look at newer entries. */
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on (start, min_seq), can't do better. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
453
454 /*
455  * This returns the element from the log with the smallest time sequence
456  * value that's in the log (the oldest log item). Any element with a time
457  * sequence lower than min_seq will be ignored.
458  */
459 static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
460                                                         u64 start, u64 min_seq)
461 {
462         return __tree_mod_log_search(fs_info, start, min_seq, 1);
463 }
464
465 /*
466  * This returns the element from the log with the largest time sequence
467  * value that's in the log (the most recent log item). Any element with
468  * a time sequence lower than min_seq will be ignored.
469  */
470 static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
471                                                  u64 start, u64 min_seq)
472 {
473         return __tree_mod_log_search(fs_info, start, min_seq, 0);
474 }
475
/*
 * Log the copy of @nr_items key pointers from node @src (starting at
 * @src_offset) into node @dst (starting at @dst_offset). Each copied pointer
 * is logged as a removal from @src plus an addition to @dst.
 *
 * Returns 0 on success or when logging is not needed, -ENOMEM on allocation
 * failure, or an error from tree_mod_log_insert().
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf-to-leaf copies are never logged. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array, first half for additions, second half for removals. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On return 0, the log lock is held for write until we release it. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/*
	 * Unwind: erase any entries already linked into the rb tree (detected
	 * via RB_EMPTY_NODE, cleared by alloc_tree_mod_elem()) and free all.
	 */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
548
/*
 * Log the freeing of node @eb by recording a removal entry for each of its
 * key pointers, so a rewind can restore the node's content.
 *
 * Returns 0 on success or when logging is not needed (leaf, or no blockers),
 * -ENOMEM on allocation failure, or an error from tree_mod_log_free_eb().
 */
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On return 0, the log lock is held for write until we release it. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	/* tree_mod_log_free_eb() erases partially inserted entries itself. */
	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
594
/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 *
 * Walks the chain of ROOT_REPLACE entries backwards (each one records the old
 * root it replaced) until it finds a non-replace operation or runs out of log
 * entries. Returns NULL only when there is nothing logged for @eb_root at all.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		/* Nothing logged at all for the current root. */
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operation for the oldest root, we simply
		 * return it. This should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		/* Follow the chain one replacement further into the past. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
648
649
/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 *
 * @eb must be a private copy (dummy or cloned buffer); its content and item
 * count are modified in place by applying the inverse of each logged
 * operation, newest first.
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	/* Running item count; written back to the header when done. */
	n = btrfs_header_nritems(eb);
	/* Hold the log lock so entries cannot be freed while we walk them. */
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			/* Undo a removal: put the saved pointer back. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			/* Undo a replace: restore the saved old pointer. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			/* Undo a move: copy the pointers back, dst and src
			 * swapped relative to the original operation. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. We simply ignore that operation.
			 */
			break;
		}
		/* Walk towards newer entries; stop at the next block. */
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
726
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 *
 * Returns NULL on allocation failure (input buffer unlocked and freed).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	/* Leaves are never logged, nothing to rewind. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed at this point, so rebuild it from
		 * scratch on a dummy buffer; the rewind below restores the
		 * removed pointers.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Rewind on a private copy of the current content. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* Hand over lock and reference from the input buffer to the copy. */
	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
786
/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	/* Nothing logged: the current root node is already what we want. */
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind on top of the old root. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The old root still exists on disk; read it and rewind a
		 * private clone of it.
		 */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, root->root_key.objectid,
				      0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* Old root was freed; rebuild it on a dummy buffer. */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Same root node, rewind on a private clone of it. */
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		/* Dummy/cloned buffer needs the old root's header restored. */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
871
872 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
873 {
874         struct tree_mod_elem *tm;
875         int level;
876         struct extent_buffer *eb_root = btrfs_root_node(root);
877
878         tm = tree_mod_log_oldest_root(eb_root, time_seq);
879         if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
880                 level = tm->old_root.level;
881         else
882                 level = btrfs_header_level(eb_root);
883
884         free_extent_buffer(eb_root);
885
886         return level;
887 }
888