// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
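
/*
 * Illustrative usage (a sketch, not taken from any specific caller): a reader
 * that wants a stable view of tree state pairs the get/put calls around its
 * work. BTRFS_SEQ_LIST_INIT is assumed here to zero-initialize elem->seq,
 * which is what makes btrfs_get_tree_mod_seq() register a new blocker:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... replay/inspect tree state as of @seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */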

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
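
/*
 * Note on the comparisons above (an observation added for clarity): entries
 * with a larger (logical, seq) key are linked to the left, so an in-order
 * (rb_next) walk visits entries in order of decreasing key. The rewind code
 * below relies on this when it follows rb_next() to replay operations from
 * newest to oldest. Since every insertion pulls a fresh sequence number while
 * holding tree_mod_log_lock, the -EEXIST branch should not trigger in
 * practice.
 */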

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}
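
/*
 * The pattern used by the btrfs_tree_mod_log_insert_*() helpers below (a
 * summary added for clarity, describing the code as-is):
 *
 * 1) Check tree_mod_need_log() without any lock. If logging is not needed,
 *    return early and allocate nothing.
 * 2) Allocate all tree_mod_elem structures outside the lock (GFP_NOFS), so
 *    the write lock is never held across an allocation.
 * 3) Take the lock via tree_mod_dont_log() and re-check, since the last
 *    blocker may have gone away in the meantime. If logging is no longer
 *    needed, allocation failures from step 2 are deliberately not reported.
 * 4) Otherwise insert the elements into the rb tree and drop the lock.
 */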

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(-ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
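
/*
 * Worked example for the move logging above (illustrative numbers, not from
 * any particular workload): moving nr_items = 3 pointers from src_slot = 2
 * down to dst_slot = 0 overwrites slots 0 and 1 (i + dst_slot < src_slot
 * holds for i = 0 and i = 1). The log then ends up with:
 *
 *	KEY_REMOVE_WHILE_MOVING for slot 0
 *	KEY_REMOVE_WHILE_MOVING for slot 1
 *	MOVE_KEYS with slot = 2, move.dst_slot = 0, move.nr_items = 3
 *
 * On rewind the MOVE_KEYS entry is undone first (it has the newest seq),
 * moving the pointers back up, and the two removal entries then restore the
 * keys that the move overwrote.
 */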

static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
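
/*
 * Illustrative note (summarizing the function above): a root replacement is
 * keyed by the *new* root's logical address, with the old root recorded in
 * tm->old_root. If @log_removal is set and the old root is not a leaf, the
 * removal of every key pointer in the old root is logged first, so a rewind
 * can reconstruct the old root's contents and not just its identity.
 */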

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}
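
/*
 * Example (hypothetical sequence numbers): if block 4096 has log entries with
 * seq 10, 12 and 15, then with min_seq = 11:
 *
 *	tree_mod_log_search_oldest(fs_info, 4096, 11) returns the seq 12 entry
 *	tree_mod_log_search(fs_info, 4096, 11)        returns the seq 15 entry
 *
 * The seq 10 entry is ignored by both, as it is older than min_seq.
 */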

int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
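
/*
 * Summary of what the copy above records (a descriptive note, matching the
 * insertion order in the code): for a copy of @nr_items pointers from @src
 * to @dst it logs, in order of increasing sequence number,
 *
 *	1) a MOVE_KEYS on @dst for any pre-existing items at or after
 *	   @dst_offset that the copy shifts out of the way,
 *	2) a KEY_REMOVE on @src and a KEY_ADD on @dst for each copied item,
 *	3) a MOVE_KEYS on @src for the items that shift down once the copied
 *	   range is removed from @src.
 *
 * Rewinding in reverse sequence order then undoes the copy step by step.
 */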

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
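
/*
 * Illustrative walk (hypothetical roots): suppose root C replaced root B,
 * which earlier replaced root A. The log then has a ROOT_REPLACE entry keyed
 * by C's logical address pointing back at B, and one keyed by B's address
 * pointing back at A. Starting from eb_root = C, the loop above follows
 * old_root.logical from C to B to A and returns the last ROOT_REPLACE entry
 * it saw, whose old_root describes A, the oldest predecessor within
 * @time_seq.
 */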

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
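
/*
 * Worked example for the rewind loop above (hypothetical entries): assume eb
 * currently has n = 2 items and the log holds, for this block, a seq 20
 * KEY_ADD at slot 2 and a seq 22 KEY_REMOVE at slot 0, with time_seq = 20.
 * The walk starts at the newest entry with seq >= time_seq and replays
 * opposites:
 *
 *	seq 22 KEY_REMOVE -> re-insert the saved key at slot 0, n becomes 3
 *	seq 20 KEY_ADD    -> drop the item added at slot 2, n becomes 2
 *
 * leaving eb looking as it did just before seq 20 was logged.
 */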

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
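
/*
 * A note on the two branches above (added commentary on the code as-is): if
 * the newest relevant entry is KEY_REMOVE_WHILE_FREEING, the block was
 * entirely freed after @time_seq, so the rewind starts from an empty dummy
 * buffer and the log re-inserts every pointer. Otherwise the live buffer is
 * cloned and only the operations newer than @time_seq are undone on the
 * copy; the on-disk buffer itself is never modified.
 */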

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there
 * are no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}
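
/*
 * Usage note (a hedged observation, not a contract stated in this file):
 * because blockers are appended to tree_mod_seq_list with freshly increasing
 * sequence numbers under tree_mod_log_lock, the first list entry is always
 * the oldest user and this lookup is O(1). Callers can compare a sequence
 * number of interest against the returned value to decide whether log
 * entries at or above it may still be needed by an active user.
 */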