1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2007,2008 Oracle. All rights reserved.
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
12 #include "transaction.h"
13 #include "print-tree.h"
17 #include "tree-mod-log.h"
/*
 * Forward declarations for the node/leaf balancing helpers defined
 * later in this file.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
33 static const struct btrfs_csums {
36 const char driver[12];
38 [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
39 [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
40 [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
41 [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
42 .driver = "blake2b-256" },
45 int btrfs_super_csum_size(const struct btrfs_super_block *s)
47 u16 t = btrfs_super_csum_type(s);
49 * csum type is validated at mount time
51 return btrfs_csums[t].size;
54 const char *btrfs_super_csum_name(u16 csum_type)
56 /* csum type is validated at mount time */
57 return btrfs_csums[csum_type].name;
61 * Return driver name if defined, otherwise the name that's also a valid driver
64 const char *btrfs_super_csum_driver(u16 csum_type)
66 /* csum type is validated at mount time */
67 return btrfs_csums[csum_type].driver[0] ?
68 btrfs_csums[csum_type].driver :
69 btrfs_csums[csum_type].name;
72 size_t __attribute_const__ btrfs_get_num_csums(void)
74 return ARRAY_SIZE(btrfs_csums);
77 struct btrfs_path *btrfs_alloc_path(void)
79 return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
82 /* this also releases the path */
83 void btrfs_free_path(struct btrfs_path *p)
87 btrfs_release_path(p);
88 kmem_cache_free(btrfs_path_cachep, p);
92 * path release drops references on the extent buffers in the path
93 * and it drops any locks held by this path
95 * It is safe to call this on paths that no locks or extent buffers held.
97 noinline void btrfs_release_path(struct btrfs_path *p)
101 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
106 btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
109 free_extent_buffer(p->nodes[i]);
115 * safely gets a reference on the root node of a tree. A lock
116 * is not taken, so a concurrent writer may put a different node
117 * at the root of the tree. See btrfs_lock_root_node for the
120 * The extent buffer returned by this has a reference taken, so
121 * it won't disappear. It may stop being the root of the tree
122 * at any time because there are no locks held.
124 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
126 struct extent_buffer *eb;
130 eb = rcu_dereference(root->node);
133 * RCU really hurts here, we could free up the root node because
134 * it was COWed but we may not get the new root node yet so do
135 * the inc_not_zero dance and if it doesn't work then
136 * synchronize_rcu and try again.
138 if (atomic_inc_not_zero(&eb->refs)) {
149 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
150 * just get put onto a simple dirty list. Transaction walks this list to make
151 * sure they get properly updated on disk.
153 static void add_root_to_dirty_list(struct btrfs_root *root)
155 struct btrfs_fs_info *fs_info = root->fs_info;
157 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
158 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
161 spin_lock(&fs_info->trans_lock);
162 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
163 /* Want the extent tree to be the last on the list */
164 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
165 list_move_tail(&root->dirty_list,
166 &fs_info->dirty_cowonly_roots);
168 list_move(&root->dirty_list,
169 &fs_info->dirty_cowonly_roots);
171 spin_unlock(&fs_info->trans_lock);
/*
 * NOTE(review): this span is an elided extraction of btrfs_copy_root();
 * local declarations, error-return branches and braces are missing, so it
 * does not compile as-is.  Comments below annotate only the visible logic.
 */
175 * used by snapshot creation to make a copy of a root for a tree with
176 * a given objectid. The buffer with the new root node is returned in
177 * cow_ret, and this func returns zero on success or a negative error code.
179 int btrfs_copy_root(struct btrfs_trans_handle *trans,
180 struct btrfs_root *root,
181 struct extent_buffer *buf,
182 struct extent_buffer **cow_ret, u64 new_root_objectid)
184 struct btrfs_fs_info *fs_info = root->fs_info;
185 struct extent_buffer *cow;
188 struct btrfs_disk_key disk_key;
/* COW of a shareable root must happen inside the running transaction. */
190 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
191 trans->transid != fs_info->running_transaction->transid);
192 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
193 trans->transid != root->last_trans);
195 level = btrfs_header_level(buf);
/* First key: leaves use item keys, internal nodes use key pointers. */
197 btrfs_item_key(buf, &disk_key, 0);
199 btrfs_node_key(buf, &disk_key, 0);
201 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
202 &disk_key, level, buf->start, 0,
203 BTRFS_NESTING_NEW_ROOT);
/* Clone the old root and rewrite the new block's header fields. */
207 copy_extent_buffer_full(cow, buf);
208 btrfs_set_header_bytenr(cow, cow->start);
209 btrfs_set_header_generation(cow, trans->transid);
210 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
211 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
212 BTRFS_HEADER_FLAG_RELOC);
213 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
214 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
216 btrfs_set_header_owner(cow, new_root_objectid);
218 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
220 WARN_ON(btrfs_header_generation(buf) > trans->transid);
/* Reloc trees take a full backref (last arg 1); others a normal ref. */
221 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
222 ret = btrfs_inc_ref(trans, root, cow, 1);
224 ret = btrfs_inc_ref(trans, root, cow, 0);
/* Failure path (guard elided): drop the new block, abort transaction. */
226 btrfs_tree_unlock(cow);
227 free_extent_buffer(cow);
228 btrfs_abort_transaction(trans, ret);
232 btrfs_mark_buffer_dirty(cow);
238 * check if the tree block can be shared by multiple trees
240 int btrfs_block_can_be_shared(struct btrfs_root *root,
241 struct extent_buffer *buf)
244 * Tree blocks not in shareable trees and tree roots are never shared.
245 * If a block was allocated after the last snapshot and the block was
246 * not allocated by tree relocation, we know the block is not shared.
248 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
249 buf != root->node && buf != root->commit_root &&
250 (btrfs_header_generation(buf) <=
251 btrfs_root_last_snapshot(&root->root_item) ||
252 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
/*
 * NOTE(review): elided extraction of update_ref_for_cow(); declarations
 * (ret, owner, flags, new_flags), error checks and braces are missing.
 * The visible logic adjusts extent backrefs when a block is COWed.
 */
258 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct extent_buffer *buf,
261 struct extent_buffer *cow,
264 struct btrfs_fs_info *fs_info = root->fs_info;
272 * Backrefs update rules:
274 * Always use full backrefs for extent pointers in tree block
275 * allocated by tree relocation.
277 * If a shared tree block is no longer referenced by its owner
278 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
279 * use full backrefs for extent pointers in tree block.
281 * If a tree block is been relocating
282 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
283 * use full backrefs for extent pointers in tree block.
284 * The reason for this is some operations (such as drop tree)
285 * are only allowed for blocks use full backrefs.
/* For possibly-shared blocks, fetch the current refcount and flags. */
288 if (btrfs_block_can_be_shared(root, buf)) {
289 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
290 btrfs_header_level(buf), 1,
296 btrfs_handle_fs_error(fs_info, ret, NULL);
301 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
302 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
303 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
308 owner = btrfs_header_owner(buf);
309 BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
310 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
/* Shared block still owned by us without full backrefs: convert it. */
313 if ((owner == root->root_key.objectid ||
314 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
315 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
316 ret = btrfs_inc_ref(trans, root, buf, 1);
320 if (root->root_key.objectid ==
321 BTRFS_TREE_RELOC_OBJECTID) {
322 ret = btrfs_dec_ref(trans, root, buf, 0);
325 ret = btrfs_inc_ref(trans, root, cow, 1);
329 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
332 if (root->root_key.objectid ==
333 BTRFS_TREE_RELOC_OBJECTID)
334 ret = btrfs_inc_ref(trans, root, cow, 1);
336 ret = btrfs_inc_ref(trans, root, cow, 0);
/* Persist any newly required FULL_BACKREF flag on the extent item. */
340 if (new_flags != 0) {
341 int level = btrfs_header_level(buf);
343 ret = btrfs_set_disk_extent_flags(trans, buf,
344 new_flags, level, 0);
/* Not shared: move references from the old buffer to the COW copy. */
349 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
350 if (root->root_key.objectid ==
351 BTRFS_TREE_RELOC_OBJECTID)
352 ret = btrfs_inc_ref(trans, root, cow, 1);
354 ret = btrfs_inc_ref(trans, root, cow, 0);
357 ret = btrfs_dec_ref(trans, root, buf, 1);
361 btrfs_clean_tree_block(buf);
367 static struct extent_buffer *alloc_tree_block_no_bg_flush(
368 struct btrfs_trans_handle *trans,
369 struct btrfs_root *root,
371 const struct btrfs_disk_key *disk_key,
375 enum btrfs_lock_nesting nest)
377 struct btrfs_fs_info *fs_info = root->fs_info;
378 struct extent_buffer *ret;
381 * If we are COWing a node/leaf from the extent, chunk, device or free
382 * space trees, make sure that we do not finish block group creation of
383 * pending block groups. We do this to avoid a deadlock.
384 * COWing can result in allocation of a new chunk, and flushing pending
385 * block groups (btrfs_create_pending_block_groups()) can be triggered
386 * when finishing allocation of a new chunk. Creation of a pending block
387 * group modifies the extent, chunk, device and free space trees,
388 * therefore we could deadlock with ourselves since we are holding a
389 * lock on an extent buffer that btrfs_create_pending_block_groups() may
391 * For similar reasons, we also need to delay flushing pending block
392 * groups when splitting a leaf or node, from one of those trees, since
393 * we are holding a write lock on it and its parent or when inserting a
394 * new root node for one of those trees.
396 if (root == fs_info->extent_root ||
397 root == fs_info->chunk_root ||
398 root == fs_info->dev_root ||
399 root == fs_info->free_space_root)
400 trans->can_flush_pending_bgs = false;
402 ret = btrfs_alloc_tree_block(trans, root, parent_start,
403 root->root_key.objectid, disk_key, level,
404 hint, empty_size, nest);
405 trans->can_flush_pending_bgs = true;
/*
 * NOTE(review): elided extraction of __btrfs_cow_block(); declarations
 * (cow, ret, level, last_ref), IS_ERR checks, else-branches, gotos and
 * braces are missing.  Comments annotate the visible flow only.
 */
411 * does the dirty work in cow of a single block. The parent block (if
412 * supplied) is updated to point to the new cow copy. The new buffer is marked
413 * dirty and returned locked. If you modify the block it needs to be marked
416 * search_start -- an allocation hint for the new block
418 * empty_size -- a hint that you plan on doing more cow. This is the size in
419 * bytes the allocator should try to find free next to the block it returns.
420 * This is just a hint and may be ignored by the allocator.
422 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
423 struct btrfs_root *root,
424 struct extent_buffer *buf,
425 struct extent_buffer *parent, int parent_slot,
426 struct extent_buffer **cow_ret,
427 u64 search_start, u64 empty_size,
428 enum btrfs_lock_nesting nest)
430 struct btrfs_fs_info *fs_info = root->fs_info;
431 struct btrfs_disk_key disk_key;
432 struct extent_buffer *cow;
436 u64 parent_start = 0;
441 btrfs_assert_tree_locked(buf);
/* COW of shareable blocks is only valid in the running transaction. */
443 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
444 trans->transid != fs_info->running_transaction->transid);
445 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
446 trans->transid != root->last_trans);
448 level = btrfs_header_level(buf);
451 btrfs_item_key(buf, &disk_key, 0);
453 btrfs_node_key(buf, &disk_key, 0);
455 if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
456 parent_start = parent->start;
458 cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
459 level, search_start, empty_size, nest);
463 /* cow is set to blocking by btrfs_init_new_buffer */
/* Clone the old buffer into the new one and fix up its header. */
465 copy_extent_buffer_full(cow, buf);
466 btrfs_set_header_bytenr(cow, cow->start);
467 btrfs_set_header_generation(cow, trans->transid);
468 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
469 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
470 BTRFS_HEADER_FLAG_RELOC);
471 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
472 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
474 btrfs_set_header_owner(cow, root->root_key.objectid);
476 write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
478 ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
/* Error path (guard elided): release the COW copy and abort. */
480 btrfs_tree_unlock(cow);
481 free_extent_buffer(cow);
482 btrfs_abort_transaction(trans, ret);
486 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
487 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
489 btrfs_tree_unlock(cow);
490 free_extent_buffer(cow);
491 btrfs_abort_transaction(trans, ret);
/* Case 1: COWing the root node - swap the root pointer under RCU. */
496 if (buf == root->node) {
497 WARN_ON(parent && parent != buf);
498 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
499 btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
500 parent_start = buf->start;
502 atomic_inc(&cow->refs);
503 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
505 rcu_assign_pointer(root->node, cow);
507 btrfs_free_tree_block(trans, root, buf, parent_start,
509 free_extent_buffer(buf);
510 add_root_to_dirty_list(root);
/* Case 2 (else elided): repoint the parent slot at the new block. */
512 WARN_ON(trans->transid != btrfs_header_generation(parent));
513 btrfs_tree_mod_log_insert_key(parent, parent_slot,
514 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
515 btrfs_set_node_blockptr(parent, parent_slot,
517 btrfs_set_node_ptr_generation(parent, parent_slot,
519 btrfs_mark_buffer_dirty(parent);
521 ret = btrfs_tree_mod_log_free_eb(buf);
523 btrfs_tree_unlock(cow);
524 free_extent_buffer(cow);
525 btrfs_abort_transaction(trans, ret);
529 btrfs_free_tree_block(trans, root, buf, parent_start,
/* Release the old buffer; the caller gets the locked COW copy. */
533 btrfs_tree_unlock(buf);
534 free_extent_buffer_stale(buf);
535 btrfs_mark_buffer_dirty(cow);
540 static inline int should_cow_block(struct btrfs_trans_handle *trans,
541 struct btrfs_root *root,
542 struct extent_buffer *buf)
544 if (btrfs_is_testing(root->fs_info))
547 /* Ensure we can see the FORCE_COW bit */
548 smp_mb__before_atomic();
551 * We do not need to cow a block if
552 * 1) this block is not created or changed in this transaction;
553 * 2) this block does not belong to TREE_RELOC tree;
554 * 3) the root is not forced COW.
556 * What is forced COW:
557 * when we create snapshot during committing the transaction,
558 * after we've finished copying src root, we must COW the shared
559 * block to ensure the metadata consistency.
561 if (btrfs_header_generation(buf) == trans->transid &&
562 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
563 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
564 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
565 !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
/*
 * NOTE(review): elided extraction of btrfs_cow_block(); declarations
 * (ret, search_start), the early "no COW needed" return body and braces
 * are missing.  Comments annotate the visible flow only.
 */
571 * cows a single block, see __btrfs_cow_block for the real work.
572 * This version of it has extra checks so that a block isn't COWed more than
573 * once per transaction, as long as it hasn't been written yet
575 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
576 struct btrfs_root *root, struct extent_buffer *buf,
577 struct extent_buffer *parent, int parent_slot,
578 struct extent_buffer **cow_ret,
579 enum btrfs_lock_nesting nest)
581 struct btrfs_fs_info *fs_info = root->fs_info;
/* Sanity checks: never COW on a dying root or outside the running trans. */
585 if (test_bit(BTRFS_ROOT_DELETING, &root->state))
587 "COW'ing blocks on a fs root that's being dropped");
589 if (trans->transaction != fs_info->running_transaction)
590 WARN(1, KERN_CRIT "trans %llu running %llu\n",
592 fs_info->running_transaction->transid);
594 if (trans->transid != fs_info->generation)
595 WARN(1, KERN_CRIT "trans %llu running %llu\n",
596 trans->transid, fs_info->generation);
/* Fast path (body elided): already COWed this transaction, reuse buf. */
598 if (!should_cow_block(trans, root, buf)) {
/* Allocation hint: keep the copy within the same 1GiB chunk region. */
603 search_start = buf->start & ~((u64)SZ_1G - 1);
606 * Before CoWing this block for later modification, check if it's
607 * the subtree root and do the delayed subtree trace if needed.
609 * Also We don't care about the error, as it's handled internally.
611 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
612 ret = __btrfs_cow_block(trans, root, buf, parent,
613 parent_slot, cow_ret, search_start, 0, nest);
615 trace_btrfs_cow_block(root, buf, *cow_ret);
619 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
622 * helper function for defrag to decide if two blocks pointed to by a
623 * node are actually close by
625 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
627 if (blocknr < other && other - (blocknr + blocksize) < 32768)
629 if (blocknr > other && blocknr - (other + blocksize) < 32768)
634 #ifdef __LITTLE_ENDIAN
637 * Compare two keys, on little-endian the disk order is same as CPU order and
638 * we can avoid the conversion.
640 static int comp_keys(const struct btrfs_disk_key *disk_key,
641 const struct btrfs_key *k2)
643 const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
645 return btrfs_comp_cpu_keys(k1, k2);
651 * compare two keys in a memcmp fashion
653 static int comp_keys(const struct btrfs_disk_key *disk,
654 const struct btrfs_key *k2)
658 btrfs_disk_key_to_cpu(&k1, disk);
660 return btrfs_comp_cpu_keys(&k1, k2);
665 * same as comp_keys only with two btrfs_key's
667 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
669 if (k1->objectid > k2->objectid)
671 if (k1->objectid < k2->objectid)
673 if (k1->type > k2->type)
675 if (k1->type < k2->type)
677 if (k1->offset > k2->offset)
679 if (k1->offset < k2->offset)
/*
 * NOTE(review): elided extraction of btrfs_realloc_node(); declarations
 * (i, err, blocknr, last_block, gen checks), early returns and braces
 * are missing.  Comments annotate the visible flow only.
 */
685 * this is used by the defrag code to go through all the
686 * leaves pointed to by a node and reallocate them so that
687 * disk order is close to key order
689 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
690 struct btrfs_root *root, struct extent_buffer *parent,
691 int start_slot, u64 *last_ret,
692 struct btrfs_key *progress)
694 struct btrfs_fs_info *fs_info = root->fs_info;
695 struct extent_buffer *cur;
697 u64 search_start = *last_ret;
705 int progress_passed = 0;
706 struct btrfs_disk_key disk_key;
708 WARN_ON(trans->transaction != fs_info->running_transaction);
709 WARN_ON(trans->transid != fs_info->generation);
711 parent_nritems = btrfs_header_nritems(parent);
712 blocksize = fs_info->nodesize;
713 end_slot = parent_nritems - 1;
/* A node with a single pointer has nothing worth reallocating. */
715 if (parent_nritems <= 1)
718 for (i = start_slot; i <= end_slot; i++) {
/* Skip slots whose key is still before the caller's progress key. */
721 btrfs_node_key(parent, &disk_key, i);
722 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
726 blocknr = btrfs_node_blockptr(parent, i);
728 last_block = blocknr;
/* Leave blocks alone when they already sit near a neighbour on disk. */
731 other = btrfs_node_blockptr(parent, i - 1);
732 close = close_blocks(blocknr, other, blocksize);
734 if (!close && i < end_slot) {
735 other = btrfs_node_blockptr(parent, i + 1);
736 close = close_blocks(blocknr, other, blocksize);
739 last_block = blocknr;
743 cur = btrfs_read_node_slot(parent, i);
746 if (search_start == 0)
747 search_start = last_block;
/* COW the child near search_start to improve on-disk locality. */
749 btrfs_tree_lock(cur);
750 err = __btrfs_cow_block(trans, root, cur, parent, i,
753 (end_slot - i) * blocksize),
756 btrfs_tree_unlock(cur);
757 free_extent_buffer(cur);
760 search_start = cur->start;
761 last_block = cur->start;
762 *last_ret = search_start;
763 btrfs_tree_unlock(cur);
764 free_extent_buffer(cur);
/*
 * NOTE(review): elided extraction of generic_bin_search(); the trailing
 * parameters (max, *slot), low/high declarations, the while-loop frame
 * and the comparison-result branches are missing from this view.
 */
770 * search for key in the extent_buffer. The items start at offset p,
771 * and they are item_size apart. There are 'max' items in p.
773 * the slot in the array is returned via slot, and it points to
774 * the place where you would insert key if it is not found in
777 * slot may point to max if the key is bigger than all of the keys
779 static noinline int generic_bin_search(struct extent_buffer *eb,
780 unsigned long p, int item_size,
781 const struct btrfs_key *key,
787 const int key_size = sizeof(struct btrfs_disk_key);
/* Defensive check (guard elided): report corrupted search bounds. */
790 btrfs_err(eb->fs_info,
791 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
792 __func__, low, high, eb->start,
793 btrfs_header_owner(eb), btrfs_header_level(eb));
799 unsigned long offset;
800 struct btrfs_disk_key *tmp;
801 struct btrfs_disk_key unaligned;
804 mid = (low + high) / 2;
805 offset = p + mid * item_size;
806 oip = offset_in_page(offset);
/*
 * Fast path: when the key does not straddle a page boundary, compare
 * it in place; otherwise (else elided) copy it out to 'unaligned'.
 */
808 if (oip + key_size <= PAGE_SIZE) {
809 const unsigned long idx = get_eb_page_index(offset);
810 char *kaddr = page_address(eb->pages[idx]);
812 oip = get_eb_offset_in_page(eb, offset);
813 tmp = (struct btrfs_disk_key *)(kaddr + oip);
815 read_extent_buffer(eb, &unaligned, offset, key_size);
819 ret = comp_keys(tmp, key);
835 * simple bin_search frontend that does the right thing for
838 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
841 if (btrfs_header_level(eb) == 0)
842 return generic_bin_search(eb,
843 offsetof(struct btrfs_leaf, items),
844 sizeof(struct btrfs_item),
845 key, btrfs_header_nritems(eb),
848 return generic_bin_search(eb,
849 offsetof(struct btrfs_node, ptrs),
850 sizeof(struct btrfs_key_ptr),
851 key, btrfs_header_nritems(eb),
855 static void root_add_used(struct btrfs_root *root, u32 size)
857 spin_lock(&root->accounting_lock);
858 btrfs_set_root_used(&root->root_item,
859 btrfs_root_used(&root->root_item) + size);
860 spin_unlock(&root->accounting_lock);
863 static void root_sub_used(struct btrfs_root *root, u32 size)
865 spin_lock(&root->accounting_lock);
866 btrfs_set_root_used(&root->root_item,
867 btrfs_root_used(&root->root_item) - size);
868 spin_unlock(&root->accounting_lock);
871 /* given a node and slot number, this reads the blocks it points to. The
872 * extent buffer is returned with a reference taken (but unlocked).
874 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
877 int level = btrfs_header_level(parent);
878 struct extent_buffer *eb;
879 struct btrfs_key first_key;
881 if (slot < 0 || slot >= btrfs_header_nritems(parent))
882 return ERR_PTR(-ENOENT);
886 btrfs_node_key_to_cpu(parent, &first_key, slot);
887 eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
888 btrfs_header_owner(parent),
889 btrfs_node_ptr_generation(parent, slot),
890 level - 1, &first_key);
891 if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
892 free_extent_buffer(eb);
/*
 * NOTE(review): elided extraction of balance_level(); declarations
 * (ret, wret, pslot, orig_ptr), error/goto paths, else-branches and
 * braces are missing.  Comments annotate the visible flow only.
 */
900 * node level balancing, used to make sure nodes are in proper order for
901 * item deletion. We balance from the top down, so we have to make sure
902 * that a deletion won't leave an node completely empty later on.
904 static noinline int balance_level(struct btrfs_trans_handle *trans,
905 struct btrfs_root *root,
906 struct btrfs_path *path, int level)
908 struct btrfs_fs_info *fs_info = root->fs_info;
909 struct extent_buffer *right = NULL;
910 struct extent_buffer *mid;
911 struct extent_buffer *left = NULL;
912 struct extent_buffer *parent = NULL;
916 int orig_slot = path->slots[level];
921 mid = path->nodes[level];
923 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
924 WARN_ON(btrfs_header_generation(mid) != trans->transid);
926 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
928 if (level < BTRFS_MAX_LEVEL - 1) {
929 parent = path->nodes[level + 1];
930 pslot = path->slots[level + 1];
934 * deal with the case where there is only one pointer in the root
935 * by promoting the node below to a root
938 struct extent_buffer *child;
940 if (btrfs_header_nritems(mid) != 1)
943 /* promote the child to a root */
944 child = btrfs_read_node_slot(mid, 0);
946 ret = PTR_ERR(child);
947 btrfs_handle_fs_error(fs_info, ret, NULL);
951 btrfs_tree_lock(child);
952 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
955 btrfs_tree_unlock(child);
956 free_extent_buffer(child);
/* Swap the root pointer to the promoted child under RCU. */
960 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
962 rcu_assign_pointer(root->node, child);
964 add_root_to_dirty_list(root);
965 btrfs_tree_unlock(child);
967 path->locks[level] = 0;
968 path->nodes[level] = NULL;
969 btrfs_clean_tree_block(mid);
970 btrfs_tree_unlock(mid);
971 /* once for the path */
972 free_extent_buffer(mid);
974 root_sub_used(root, mid->len);
975 btrfs_free_tree_block(trans, root, mid, 0, 1);
976 /* once for the root ptr */
977 free_extent_buffer_stale(mid);
/* Node still more than 1/4 full: no balancing needed. */
980 if (btrfs_header_nritems(mid) >
981 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
/* Grab and COW the left sibling, if any. */
984 left = btrfs_read_node_slot(parent, pslot - 1);
989 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
990 wret = btrfs_cow_block(trans, root, left,
991 parent, pslot - 1, &left,
992 BTRFS_NESTING_LEFT_COW);
/* Grab and COW the right sibling, if any. */
999 right = btrfs_read_node_slot(parent, pslot + 1);
1004 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1005 wret = btrfs_cow_block(trans, root, right,
1006 parent, pslot + 1, &right,
1007 BTRFS_NESTING_RIGHT_COW);
1014 /* first, try to make some room in the middle buffer */
1016 orig_slot += btrfs_header_nritems(left);
1017 wret = push_node_left(trans, left, mid, 1);
1023 * then try to empty the right most buffer into the middle
1026 wret = push_node_left(trans, mid, right, 1);
1027 if (wret < 0 && wret != -ENOSPC)
/* Right node drained: delete its parent pointer and free the block. */
1029 if (btrfs_header_nritems(right) == 0) {
1030 btrfs_clean_tree_block(right);
1031 btrfs_tree_unlock(right);
1032 del_ptr(root, path, level + 1, pslot + 1);
1033 root_sub_used(root, right->len);
1034 btrfs_free_tree_block(trans, root, right, 0, 1);
1035 free_extent_buffer_stale(right);
1038 struct btrfs_disk_key right_key;
1039 btrfs_node_key(right, &right_key, 0);
1040 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1041 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1043 btrfs_set_node_key(parent, &right_key, pslot + 1);
1044 btrfs_mark_buffer_dirty(parent);
1047 if (btrfs_header_nritems(mid) == 1) {
1049 * we're not allowed to leave a node with one item in the
1050 * tree during a delete. A deletion from lower in the tree
1051 * could try to delete the only pointer in this node.
1052 * So, pull some keys from the left.
1053 * There has to be a left pointer at this point because
1054 * otherwise we would have pulled some pointers from the
1059 btrfs_handle_fs_error(fs_info, ret, NULL);
1062 wret = balance_node_right(trans, mid, left);
1068 wret = push_node_left(trans, left, mid, 1);
/* Middle node drained: remove it from the parent and free it. */
1074 if (btrfs_header_nritems(mid) == 0) {
1075 btrfs_clean_tree_block(mid);
1076 btrfs_tree_unlock(mid);
1077 del_ptr(root, path, level + 1, pslot);
1078 root_sub_used(root, mid->len);
1079 btrfs_free_tree_block(trans, root, mid, 0, 1);
1080 free_extent_buffer_stale(mid);
1083 /* update the parent key to reflect our changes */
1084 struct btrfs_disk_key mid_key;
1085 btrfs_node_key(mid, &mid_key, 0);
1086 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1087 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1089 btrfs_set_node_key(parent, &mid_key, pslot);
1090 btrfs_mark_buffer_dirty(parent);
1093 /* update the path */
1095 if (btrfs_header_nritems(left) > orig_slot) {
1096 atomic_inc(&left->refs);
1097 /* left was locked after cow */
1098 path->nodes[level] = left;
1099 path->slots[level + 1] -= 1;
1100 path->slots[level] = orig_slot;
1102 btrfs_tree_unlock(mid);
1103 free_extent_buffer(mid);
1106 orig_slot -= btrfs_header_nritems(left);
1107 path->slots[level] = orig_slot;
1110 /* double check we haven't messed things up */
1112 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
/* Cleanup (labels elided): drop sibling locks and references. */
1116 btrfs_tree_unlock(right);
1117 free_extent_buffer(right);
1120 if (path->nodes[level] != left)
1121 btrfs_tree_unlock(left);
1122 free_extent_buffer(left);
/*
 * NOTE(review): elided extraction of push_nodes_for_insert();
 * declarations (ret, wret, pslot, left_nr, right_nr), IS_ERR checks,
 * early returns, else-branches and braces are missing.
 */
1127 /* Node balancing for insertion. Here we only split or push nodes around
1128 * when they are completely full. This is also done top down, so we
1129 * have to be pessimistic.
1131 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1132 struct btrfs_root *root,
1133 struct btrfs_path *path, int level)
1135 struct btrfs_fs_info *fs_info = root->fs_info;
1136 struct extent_buffer *right = NULL;
1137 struct extent_buffer *mid;
1138 struct extent_buffer *left = NULL;
1139 struct extent_buffer *parent = NULL;
1143 int orig_slot = path->slots[level];
1148 mid = path->nodes[level];
1149 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1151 if (level < BTRFS_MAX_LEVEL - 1) {
1152 parent = path->nodes[level + 1];
1153 pslot = path->slots[level + 1];
/* Try pushing overflow into the left sibling first. */
1159 left = btrfs_read_node_slot(parent, pslot - 1);
1163 /* first, try to make some room in the middle buffer */
1167 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1169 left_nr = btrfs_header_nritems(left);
/* Left sibling is itself (nearly) full: nothing can be pushed. */
1170 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1173 ret = btrfs_cow_block(trans, root, left, parent,
1175 BTRFS_NESTING_LEFT_COW);
1179 wret = push_node_left(trans, left, mid, 0);
/* Push succeeded: fix the parent key and rewire the path. */
1185 struct btrfs_disk_key disk_key;
1186 orig_slot += left_nr;
1187 btrfs_node_key(mid, &disk_key, 0);
1188 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1189 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1191 btrfs_set_node_key(parent, &disk_key, pslot);
1192 btrfs_mark_buffer_dirty(parent);
1193 if (btrfs_header_nritems(left) > orig_slot) {
1194 path->nodes[level] = left;
1195 path->slots[level + 1] -= 1;
1196 path->slots[level] = orig_slot;
1197 btrfs_tree_unlock(mid);
1198 free_extent_buffer(mid);
1201 btrfs_header_nritems(left);
1202 path->slots[level] = orig_slot;
1203 btrfs_tree_unlock(left);
1204 free_extent_buffer(left);
1208 btrfs_tree_unlock(left);
1209 free_extent_buffer(left);
/* Left push failed: try the right sibling instead. */
1211 right = btrfs_read_node_slot(parent, pslot + 1);
1216 * then try to empty the right most buffer into the middle
1221 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1223 right_nr = btrfs_header_nritems(right);
1224 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1227 ret = btrfs_cow_block(trans, root, right,
1229 &right, BTRFS_NESTING_RIGHT_COW);
1233 wret = balance_node_right(trans, right, mid);
/* Push succeeded: fix the right sibling's parent key and the path. */
1239 struct btrfs_disk_key disk_key;
1241 btrfs_node_key(right, &disk_key, 0);
1242 ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1243 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1245 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1246 btrfs_mark_buffer_dirty(parent);
1248 if (btrfs_header_nritems(mid) <= orig_slot) {
1249 path->nodes[level] = right;
1250 path->slots[level + 1] += 1;
1251 path->slots[level] = orig_slot -
1252 btrfs_header_nritems(mid);
1253 btrfs_tree_unlock(mid);
1254 free_extent_buffer(mid);
1256 btrfs_tree_unlock(right);
1257 free_extent_buffer(right);
1261 btrfs_tree_unlock(right);
1262 free_extent_buffer(right);
/*
 * NOTE(review): elided extraction of reada_for_search(); declarations
 * (search, target, nread, nread_max, nritems, nr, nscan), the while-loop
 * frame, direction stepping and braces are missing.
 */
1268 * readahead one full node of leaves, finding things that are close
1269 * to the block in 'slot', and triggering ra on them.
1271 static void reada_for_search(struct btrfs_fs_info *fs_info,
1272 struct btrfs_path *path,
1273 int level, int slot, u64 objectid)
1275 struct extent_buffer *node;
1276 struct btrfs_disk_key disk_key;
1282 struct extent_buffer *eb;
/* Only read ahead at the level above leaves unless always-on mode. */
1287 if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1290 if (!path->nodes[level])
1293 node = path->nodes[level];
1296 * Since the time between visiting leaves is much shorter than the time
1297 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1298 * much IO at once (possibly random).
1300 if (path->reada == READA_FORWARD_ALWAYS) {
1302 nread_max = node->fs_info->nodesize;
1304 nread_max = SZ_128K;
1309 search = btrfs_node_blockptr(node, slot);
1310 blocksize = fs_info->nodesize;
/* Target block already cached: no readahead needed. */
1311 eb = find_extent_buffer(fs_info, search);
1313 free_extent_buffer(eb);
1319 nritems = btrfs_header_nritems(node);
/* Scan loop (frame elided): step backward or forward from 'slot'. */
1323 if (path->reada == READA_BACK) {
1327 } else if (path->reada == READA_FORWARD ||
1328 path->reada == READA_FORWARD_ALWAYS) {
1333 if (path->reada == READA_BACK && objectid) {
1334 btrfs_node_key(node, &disk_key, nr);
1335 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1338 search = btrfs_node_blockptr(node, nr);
/* Only read ahead blocks within 64KiB of the target on disk. */
1339 if (path->reada == READA_FORWARD_ALWAYS ||
1340 (search <= target && target - search <= 65536) ||
1341 (search > target && search - target <= 65536)) {
1342 btrfs_readahead_node_child(node, nr);
1346 if (nread > nread_max || nscan > 32)
/*
 * Readahead the left and right siblings of the node at @level, in
 * preparation for a balance (push/merge) that will touch them.
 * (Excerpt is sampled; guard statements around these lines are missing.)
 */
1351 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1353 struct extent_buffer *parent;
1357 parent = path->nodes[level + 1];
1361 nritems = btrfs_header_nritems(parent);
1362 slot = path->slots[level + 1];
/* Prefetch the sibling on each side of the current slot, when it exists. */
1365 btrfs_readahead_node_child(parent, slot - 1);
1366 if (slot + 1 < nritems)
1367 btrfs_readahead_node_child(parent, slot + 1);
1372 * when we walk down the tree, it is usually safe to unlock the higher layers
1373 * in the tree. The exceptions are when our path goes through slot 0, because
1374 * operations on the tree might require changing key pointers higher up in the
1377 * callers might also have set path->keep_locks, which tells this code to keep
1378 * the lock if the path points to the last slot in the block. This is part of
1379 * walking through the tree, and selecting the next slot in the higher block.
1381 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1382 * if lowest_unlock is 1, level 0 won't be unlocked
1384 static noinline void unlock_up(struct btrfs_path *path, int level,
1385 int lowest_unlock, int min_write_lock_level,
1386 int *write_lock_level)
1389 int skip_level = level;
1391 struct extent_buffer *t;
/* Walk from @level up to the root, deciding per level whether to unlock. */
1393 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1394 if (!path->nodes[i])
1396 if (!path->locks[i])
/* Slot 0 may force a key update in the parent, so keep it locked. */
1398 if (!no_skips && path->slots[i] == 0) {
/* Last slot with keep_locks set also stays locked (next-slot walking). */
1402 if (!no_skips && path->keep_locks) {
1405 nritems = btrfs_header_nritems(t);
1406 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1411 if (skip_level < i && i >= lowest_unlock)
1415 if (i >= lowest_unlock && i > skip_level) {
1416 btrfs_tree_unlock_rw(t, path->locks[i]);
/* Lower the cached write-lock watermark as levels are released. */
1418 if (write_lock_level &&
1419 i > min_write_lock_level &&
1420 i <= *write_lock_level) {
1421 *write_lock_level = i - 1;
1428 * helper function for btrfs_search_slot. The goal is to find a block
1429 * in cache without setting the path to blocking. If we find the block
1430 * we return zero and the path is unchanged.
1432 * If we can't find the block, we set the path blocking and do some
1433 * reada. -EAGAIN is returned and the search must be repeated.
1436 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1437 struct extent_buffer **eb_ret, int level, int slot,
1438 const struct btrfs_key *key)
1440 struct btrfs_fs_info *fs_info = root->fs_info;
1443 struct extent_buffer *tmp;
1444 struct btrfs_key first_key;
/* Pull bytenr, generation and first key of the child from the parent slot. */
1448 blocknr = btrfs_node_blockptr(*eb_ret, slot);
1449 gen = btrfs_node_ptr_generation(*eb_ret, slot);
1450 parent_level = btrfs_header_level(*eb_ret);
1451 btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
1453 tmp = find_extent_buffer(fs_info, blocknr);
1455 if (p->reada == READA_FORWARD_ALWAYS)
1456 reada_for_search(fs_info, p, level, slot, key->objectid);
1458 /* first we do an atomic uptodate check */
1459 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1461 * Do extra check for first_key, eb can be stale due to
1462 * being cached, read from scrub, or have multiple
1463 * parents (shared tree blocks).
1465 if (btrfs_verify_level_key(tmp,
1466 parent_level - 1, &first_key, gen)) {
1467 free_extent_buffer(tmp);
1474 /* now we're allowed to do a blocking uptodate check */
1475 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
1480 free_extent_buffer(tmp);
1481 btrfs_release_path(p);
1486 * reduce lock contention at high levels
1487 * of the btree by dropping locks before
1488 * we read. Don't release the lock on the current
1489 * level because we need to walk this node to figure
1490 * out which blocks to read.
1492 btrfs_unlock_up_safe(p, level + 1);
1494 if (p->reada != READA_NONE)
1495 reada_for_search(fs_info, p, level, slot, key->objectid);
/* Block was not cached: issue a (blocking) read of the child. */
1498 tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
1499 gen, parent_level - 1, &first_key);
1502 * If the read above didn't mark this buffer up to date,
1503 * it will never end up being up to date. Set ret to EIO now
1504 * and give up so that our caller doesn't loop forever
1507 if (!extent_buffer_uptodate(tmp))
1509 free_extent_buffer(tmp);
/* Path was dropped for the blocking read; caller must restart (-EAGAIN). */
1514 btrfs_release_path(p);
1519 * helper function for btrfs_search_slot. This does all of the checks
1520 * for node-level blocks and does any balancing required based on
1523 * If no extra work was required, zero is returned. If we had to
1524 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1528 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1529 struct btrfs_root *root, struct btrfs_path *p,
1530 struct extent_buffer *b, int level, int ins_len,
1531 int *write_lock_level)
1533 struct btrfs_fs_info *fs_info = root->fs_info;
/* Inserting (or splitting) into a nearly-full node: split it first. */
1536 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1537 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
/* Splitting touches the parent, so escalate to a write lock and retry. */
1539 if (*write_lock_level < level + 1) {
1540 *write_lock_level = level + 1;
1541 btrfs_release_path(p);
1545 reada_for_balance(p, level);
1546 ret = split_node(trans, root, p, level);
1548 b = p->nodes[level];
/* Deleting from an under-half-full node: rebalance with siblings. */
1549 } else if (ins_len < 0 && btrfs_header_nritems(b) <
1550 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1552 if (*write_lock_level < level + 1) {
1553 *write_lock_level = level + 1;
1554 btrfs_release_path(p);
1558 reada_for_balance(p, level);
1559 ret = balance_level(trans, root, p, level);
1563 b = p->nodes[level];
1565 btrfs_release_path(p);
/* After balancing a node must never end up with a single pointer. */
1568 BUG_ON(btrfs_header_nritems(b) == 1);
/*
 * Search @fs_root for the key (iobjectid, key_type, ioff) and, on a miss,
 * step to the next leaf; the found key is copied to @found_key and checked
 * for matching type and objectid. (Excerpt sampled; returns are missing.)
 */
1573 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1574 u64 iobjectid, u64 ioff, u8 key_type,
1575 struct btrfs_key *found_key)
1578 struct btrfs_key key;
1579 struct extent_buffer *eb;
1584 key.type = key_type;
1585 key.objectid = iobjectid;
/* Read-only search: no transaction, no cow, no insertion reservation. */
1588 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1592 eb = path->nodes[0];
/* Slot past the last item means the key would live in the next leaf. */
1593 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1594 ret = btrfs_next_leaf(fs_root, path);
1597 eb = path->nodes[0];
1600 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1601 if (found_key->type != key.type ||
1602 found_key->objectid != key.objectid)
/*
 * Grab a referenced (and, unless skip_locking, locked) root node for a
 * search, preferring read locks and upgrading to a write lock only when
 * the root level falls within @write_lock_level.
 */
1608 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1609 struct btrfs_path *p,
1610 int write_lock_level)
1612 struct btrfs_fs_info *fs_info = root->fs_info;
1613 struct extent_buffer *b;
1617 /* We try very hard to do read locks on the root */
1618 root_lock = BTRFS_READ_LOCK;
1620 if (p->search_commit_root) {
1622 * The commit roots are read only so we always do read locks,
1623 * and we always must hold the commit_root_sem when doing
1624 * searches on them, the only exception is send where we don't
1625 * want to block transaction commits for a long time, so
1626 * we need to clone the commit root in order to avoid races
1627 * with transaction commits that create a snapshot of one of
1628 * the roots used by a send operation.
1630 if (p->need_commit_sem) {
1631 down_read(&fs_info->commit_root_sem);
1632 b = btrfs_clone_extent_buffer(root->commit_root);
1633 up_read(&fs_info->commit_root_sem);
1635 return ERR_PTR(-ENOMEM);
1638 b = root->commit_root;
1639 atomic_inc(&b->refs);
1641 level = btrfs_header_level(b);
1643 * Ensure that all callers have set skip_locking when
1644 * p->search_commit_root = 1.
1646 ASSERT(p->skip_locking == 1);
1651 if (p->skip_locking) {
1652 b = btrfs_root_node(root);
1653 level = btrfs_header_level(b);
1658 * If the level is set to maximum, we can skip trying to get the read
1661 if (write_lock_level < BTRFS_MAX_LEVEL) {
1663 * We don't know the level of the root node until we actually
1664 * have it read locked
1666 b = btrfs_read_lock_root_node(root);
1667 level = btrfs_header_level(b);
1668 if (level > write_lock_level)
1671 /* Whoops, must trade for write lock */
1672 btrfs_tree_read_unlock(b);
1673 free_extent_buffer(b);
/* Fall back: take the root with a write lock and re-read its level. */
1676 b = btrfs_lock_root_node(root);
1677 root_lock = BTRFS_WRITE_LOCK;
1679 /* The level might have changed, check again */
1680 level = btrfs_header_level(b);
/* Record the root in the path so release/unlock paths see it. */
1683 p->nodes[level] = b;
1684 if (!p->skip_locking)
1685 p->locks[level] = root_lock;
1687 * Callers are responsible for dropping b's references.
1694 * btrfs_search_slot - look for a key in a tree and perform necessary
1695 * modifications to preserve tree invariants.
1697 * @trans: Handle of transaction, used when modifying the tree
1698 * @p: Holds all btree nodes along the search path
1699 * @root: The root node of the tree
1700 * @key: The key we are looking for
1701 * @ins_len: Indicates purpose of search:
1702 * >0 for inserts it's size of item inserted (*)
1704 * 0 for plain searches, not modifying the tree
1706 * (*) If size of item inserted doesn't include
1707 * sizeof(struct btrfs_item), then p->search_for_extension must
1709 * @cow: boolean should CoW operations be performed. Must always be 1
1710 * when modifying the tree.
1712 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1713 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1715 * If @key is found, 0 is returned and you can find the item in the leaf level
1716 * of the path (level 0)
1718 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1719 * points to the slot where it should be inserted
1721 * If an error is encountered while searching the tree a negative error number
1724 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1725 const struct btrfs_key *key, struct btrfs_path *p,
1726 int ins_len, int cow)
1728 struct extent_buffer *b;
1733 int lowest_unlock = 1;
1734 /* everything at write_lock_level or lower must be write locked */
1735 int write_lock_level = 0;
1736 u8 lowest_level = 0;
1737 int min_write_lock_level;
1740 lowest_level = p->lowest_level;
1741 WARN_ON(lowest_level && ins_len > 0);
1742 WARN_ON(p->nodes[0] != NULL);
/* Any modification (ins_len != 0) requires cow to be enabled. */
1743 BUG_ON(!cow && ins_len);
1748 /* when we are removing items, we might have to go up to level
1749 * two as we update tree pointers Make sure we keep write
1750 * for those levels as well
1752 write_lock_level = 2;
1753 } else if (ins_len > 0) {
1755 * for inserting items, make sure we have a write lock on
1756 * level 1 so we can update keys
1758 write_lock_level = 1;
1762 write_lock_level = -1;
/* keep_locks / lowest_level searches hold write locks everywhere. */
1764 if (cow && (p->keep_locks || p->lowest_level))
1765 write_lock_level = BTRFS_MAX_LEVEL;
1767 min_write_lock_level = write_lock_level;
1771 b = btrfs_search_slot_get_root(root, p, write_lock_level);
1780 level = btrfs_header_level(b);
1783 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
1786 * if we don't really need to cow this block
1787 * then we don't want to set the path blocking,
1788 * so we test it here
1790 if (!should_cow_block(trans, root, b))
1794 * must have write locks on this node and the
1797 if (level > write_lock_level ||
1798 (level + 1 > write_lock_level &&
1799 level + 1 < BTRFS_MAX_LEVEL &&
1800 p->nodes[level + 1])) {
/* Escalate lock level, drop the path and restart the whole search. */
1801 write_lock_level = level + 1;
1802 btrfs_release_path(p);
1807 err = btrfs_cow_block(trans, root, b, NULL, 0,
1811 err = btrfs_cow_block(trans, root, b,
1812 p->nodes[level + 1],
1813 p->slots[level + 1], &b,
1821 p->nodes[level] = b;
1823 * Leave path with blocking locks to avoid massive
1824 * lock context switch, this is made on purpose.
1828 * we have a lock on b and as long as we aren't changing
1829 * the tree, there is no way to for the items in b to change.
1830 * It is safe to drop the lock on our parent before we
1831 * go through the expensive btree search on b.
1833 * If we're inserting or deleting (ins_len != 0), then we might
1834 * be changing slot zero, which may require changing the parent.
1835 * So, we can't drop the lock until after we know which slot
1836 * we're operating on.
1838 if (!ins_len && !p->keep_locks) {
1841 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
1842 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
1848 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
1849 * we can safely assume the target key will always be in slot 0
1850 * on lower levels due to the invariants BTRFS' btree provides,
1851 * namely that a btrfs_key_ptr entry always points to the
1852 * lowest key in the child node, thus we can skip searching
1855 if (prev_cmp == 0) {
1859 ret = btrfs_bin_search(b, key, &slot);
/* Leaf level: check free space, possibly split, then finish. */
1866 p->slots[level] = slot;
1868 * Item key already exists. In this case, if we are
1869 * allowed to insert the item (for example, in dir_item
1870 * case, item key collision is allowed), it will be
1871 * merged with the original item. Only the item size
1872 * grows, no new btrfs item will be added. If
1873 * search_for_extension is not set, ins_len already
1874 * accounts the size btrfs_item, deduct it here so leaf
1875 * space check will be correct.
1877 if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
1878 ASSERT(ins_len >= sizeof(struct btrfs_item));
1879 ins_len -= sizeof(struct btrfs_item);
1882 btrfs_leaf_free_space(b) < ins_len) {
1883 if (write_lock_level < 1) {
1884 write_lock_level = 1;
1885 btrfs_release_path(p);
1889 err = split_leaf(trans, root, key,
1890 p, ins_len, ret == 0);
1898 if (!p->search_for_split)
1899 unlock_up(p, level, lowest_unlock,
1900 min_write_lock_level, NULL);
/* Node level: a miss lands one slot past the covering key pointer. */
1903 if (ret && slot > 0) {
1907 p->slots[level] = slot;
1908 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
1916 b = p->nodes[level];
1917 slot = p->slots[level];
1920 * Slot 0 is special, if we change the key we have to update
1921 * the parent pointer which means we must have a write lock on
1924 if (slot == 0 && ins_len && write_lock_level < level + 1) {
1925 write_lock_level = level + 1;
1926 btrfs_release_path(p);
1930 unlock_up(p, level, lowest_unlock, min_write_lock_level,
1933 if (level == lowest_level) {
1939 err = read_block_for_search(root, p, &b, level, slot, key);
/* Lock the child we are descending into with the appropriate mode. */
1947 if (!p->skip_locking) {
1948 level = btrfs_header_level(b);
1949 if (level <= write_lock_level) {
1951 p->locks[level] = BTRFS_WRITE_LOCK;
1953 btrfs_tree_read_lock(b);
1954 p->locks[level] = BTRFS_READ_LOCK;
1956 p->nodes[level] = b;
1961 if (ret < 0 && !p->skip_release_on_error)
1962 btrfs_release_path(p);
1965 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
1968 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
1969 * current state of the tree together with the operations recorded in the tree
1970 * modification log to search for the key in a previous version of this tree, as
1971 * denoted by the time_seq parameter.
1973 * Naturally, there is no support for insert, delete or cow operations.
1975 * The resulting path and return value will be set up as if we called
1976 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1978 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
1979 struct btrfs_path *p, u64 time_seq)
1981 struct btrfs_fs_info *fs_info = root->fs_info;
1982 struct extent_buffer *b;
1987 int lowest_unlock = 1;
1988 u8 lowest_level = 0;
1990 lowest_level = p->lowest_level;
1991 WARN_ON(p->nodes[0] != NULL);
/* Commit-root searches don't need the mod log; plain search suffices. */
1993 if (p->search_commit_root) {
1995 return btrfs_search_slot(NULL, root, key, p, 0, 0);
/* Start from the historical root as of @time_seq. */
1999 b = btrfs_get_old_root(root, time_seq);
2004 level = btrfs_header_level(b);
2005 p->locks[level] = BTRFS_READ_LOCK;
2010 level = btrfs_header_level(b);
2011 p->nodes[level] = b;
2014 * we have a lock on b and as long as we aren't changing
2015 * the tree, there is no way to for the items in b to change.
2016 * It is safe to drop the lock on our parent before we
2017 * go through the expensive btree search on b.
2019 btrfs_unlock_up_safe(p, level + 1);
2021 ret = btrfs_bin_search(b, key, &slot);
2026 p->slots[level] = slot;
2027 unlock_up(p, level, lowest_unlock, 0, NULL);
2031 if (ret && slot > 0) {
2035 p->slots[level] = slot;
2036 unlock_up(p, level, lowest_unlock, 0, NULL);
2038 if (level == lowest_level) {
2044 err = read_block_for_search(root, p, &b, level, slot, key);
/* Rewind the freshly-read block to its @time_seq state before descending. */
2052 level = btrfs_header_level(b);
2053 btrfs_tree_read_lock(b);
2054 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2059 p->locks[level] = BTRFS_READ_LOCK;
2060 p->nodes[level] = b;
2065 btrfs_release_path(p);
2071 * helper to use instead of search slot if no exact match is needed but
2072 * instead the next or previous item should be returned.
2073 * When find_higher is true, the next higher item is returned, the next lower
2075 * When return_any and find_higher are both true, and no higher item is found,
2076 * return the next lower instead.
2077 * When return_any is true and find_higher is false, and no lower item is found,
2078 * return the next higher instead.
2079 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2082 int btrfs_search_slot_for_read(struct btrfs_root *root,
2083 const struct btrfs_key *key,
2084 struct btrfs_path *p, int find_higher,
2088 struct extent_buffer *leaf;
/* Read-only probe; on a miss the slot points at the insertion position. */
2091 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2095 * a return value of 1 means the path is at the position where the
2096 * item should be inserted. Normally this is the next bigger item,
2097 * but in case the previous item is the last in a leaf, path points
2098 * to the first free slot in the previous leaf, i.e. at an invalid
2104 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2105 ret = btrfs_next_leaf(root, p);
2111 * no higher item found, return the next
/* Retry in the opposite direction (return_any fallback). */
2116 btrfs_release_path(p);
2120 if (p->slots[0] == 0) {
2121 ret = btrfs_prev_leaf(root, p);
2126 if (p->slots[0] == btrfs_header_nritems(leaf))
2133 * no lower item found, return the next
2138 btrfs_release_path(p);
2148 * adjust the pointers going up the tree, starting at level
2149 * making sure the right key of each node is points to 'key'.
2150 * This is used after shifting pointers to the left, so it stops
2151 * fixing up pointers when a given leaf/node is not in slot 0 of the
2155 static void fixup_low_keys(struct btrfs_path *path,
2156 struct btrfs_disk_key *key, int level)
2159 struct extent_buffer *t;
/* Propagate the new low key upward while the path stays in slot 0. */
2162 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2163 int tslot = path->slots[i];
2165 if (!path->nodes[i])
/* Record the key replacement in the tree mod log before applying it. */
2168 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2169 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2171 btrfs_set_node_key(t, key, tslot);
2172 btrfs_mark_buffer_dirty(path->nodes[i]);
2181 * This function isn't completely safe. It's the caller's responsibility
2182 * that the new key won't break the order
2184 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2185 struct btrfs_path *path,
2186 const struct btrfs_key *new_key)
2188 struct btrfs_disk_key disk_key;
2189 struct extent_buffer *eb;
2192 eb = path->nodes[0];
2193 slot = path->slots[0];
/* New key must stay strictly greater than its left neighbour. */
2195 btrfs_item_key(eb, &disk_key, slot - 1);
2196 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2198 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2199 slot, btrfs_disk_key_objectid(&disk_key),
2200 btrfs_disk_key_type(&disk_key),
2201 btrfs_disk_key_offset(&disk_key),
2202 new_key->objectid, new_key->type,
2204 btrfs_print_leaf(eb);
/* ... and strictly smaller than its right neighbour, when one exists. */
2208 if (slot < btrfs_header_nritems(eb) - 1) {
2209 btrfs_item_key(eb, &disk_key, slot + 1);
2210 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2212 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2213 slot, btrfs_disk_key_objectid(&disk_key),
2214 btrfs_disk_key_type(&disk_key),
2215 btrfs_disk_key_offset(&disk_key),
2216 new_key->objectid, new_key->type,
2218 btrfs_print_leaf(eb);
2223 btrfs_cpu_key_to_disk(&disk_key, new_key);
2224 btrfs_set_item_key(eb, &disk_key, slot);
2225 btrfs_mark_buffer_dirty(eb);
/* Changing slot 0 changes the leaf's low key; fix parents up the path. */
2227 fixup_low_keys(path, &disk_key, 1);
2231 * Check key order of two sibling extent buffers.
2233 * Return true if something is wrong.
2234 * Return false if everything is fine.
2236 * Tree-checker only works inside one tree block, thus the following
2237 * corruption can not be detected by tree-checker:
2239 * Leaf @left | Leaf @right
2240 * --------------------------------------------------------------
2241 * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
2243 * Key f6 in leaf @left itself is valid, but not valid when the next
2244 * key in leaf @right is 7.
2245 * This can only be checked at tree block merge time.
2246 * And since tree checker has ensured all key order in each tree block
2247 * is correct, we only need to bother the last key of @left and the first
2250 static bool check_sibling_keys(struct extent_buffer *left,
2251 struct extent_buffer *right)
2253 struct btrfs_key left_last;
2254 struct btrfs_key right_first;
2255 int level = btrfs_header_level(left);
2256 int nr_left = btrfs_header_nritems(left);
2257 int nr_right = btrfs_header_nritems(right);
2259 /* No key to check in one of the tree blocks */
2260 if (!nr_left || !nr_right)
/* Node level: compare key pointers. */
2264 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2265 btrfs_node_key_to_cpu(right, &right_first, 0);
/* Leaf level: compare item keys. */
2267 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2268 btrfs_item_key_to_cpu(right, &right_first, 0);
2271 if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2272 btrfs_crit(left->fs_info,
2273 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2274 left_last.objectid, left_last.type,
2275 left_last.offset, right_first.objectid,
2276 right_first.type, right_first.offset);
2283 * try to push data from one node into the next node left in the
2286 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2287 * error, and > 0 if there was no room in the left hand block.
2289 static int push_node_left(struct btrfs_trans_handle *trans,
2290 struct extent_buffer *dst,
2291 struct extent_buffer *src, int empty)
2293 struct btrfs_fs_info *fs_info = trans->fs_info;
2299 src_nritems = btrfs_header_nritems(src);
2300 dst_nritems = btrfs_header_nritems(dst);
/* Room available in dst bounds how many pointers we may move. */
2301 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
/* Both blocks must already be cow'ed in this transaction. */
2302 WARN_ON(btrfs_header_generation(src) != trans->transid);
2303 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2305 if (!empty && src_nritems <= 8)
2308 if (push_items <= 0)
2312 push_items = min(src_nritems, push_items);
2313 if (push_items < src_nritems) {
2314 /* leave at least 8 pointers in the node if
2315 * we aren't going to empty it
2317 if (src_nritems - push_items < 8) {
2318 if (push_items <= 8)
2324 push_items = min(src_nritems - 8, push_items);
2326 /* dst is the left eb, src is the middle eb */
2327 if (check_sibling_keys(dst, src)) {
2329 btrfs_abort_transaction(trans, ret);
/* Log the copy in the tree mod log before mutating the blocks. */
2332 ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2334 btrfs_abort_transaction(trans, ret);
2337 copy_extent_buffer(dst, src,
2338 btrfs_node_key_ptr_offset(dst_nritems),
2339 btrfs_node_key_ptr_offset(0),
2340 push_items * sizeof(struct btrfs_key_ptr));
2342 if (push_items < src_nritems) {
2344 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2345 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
2347 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2348 btrfs_node_key_ptr_offset(push_items),
2349 (src_nritems - push_items) *
2350 sizeof(struct btrfs_key_ptr));
2352 btrfs_set_header_nritems(src, src_nritems - push_items);
2353 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2354 btrfs_mark_buffer_dirty(src);
2355 btrfs_mark_buffer_dirty(dst);
2361 * try to push data from one node into the next node right in the
2364 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2365 * error, and > 0 if there was no room in the right hand block.
2367 * this will only push up to 1/2 the contents of the left node over
2369 static int balance_node_right(struct btrfs_trans_handle *trans,
2370 struct extent_buffer *dst,
2371 struct extent_buffer *src)
2373 struct btrfs_fs_info *fs_info = trans->fs_info;
/* Both blocks must already be cow'ed in this transaction. */
2380 WARN_ON(btrfs_header_generation(src) != trans->transid);
2381 WARN_ON(btrfs_header_generation(dst) != trans->transid);
2383 src_nritems = btrfs_header_nritems(src);
2384 dst_nritems = btrfs_header_nritems(dst);
2385 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2386 if (push_items <= 0)
2389 if (src_nritems < 4)
/* Cap at half the source so the node is never emptied by this path. */
2392 max_push = src_nritems / 2 + 1;
2393 /* don't try to empty the node */
2394 if (max_push >= src_nritems)
2397 if (max_push < push_items)
2398 push_items = max_push;
2400 /* dst is the right eb, src is the middle eb */
2401 if (check_sibling_keys(src, dst)) {
2403 btrfs_abort_transaction(trans, ret);
/* Shift dst's existing pointers right to open a gap at slot 0. */
2406 ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2408 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2409 btrfs_node_key_ptr_offset(0),
2411 sizeof(struct btrfs_key_ptr));
2413 ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2416 btrfs_abort_transaction(trans, ret);
/* Copy the tail of src into the gap at the front of dst. */
2419 copy_extent_buffer(dst, src,
2420 btrfs_node_key_ptr_offset(0),
2421 btrfs_node_key_ptr_offset(src_nritems - push_items),
2422 push_items * sizeof(struct btrfs_key_ptr));
2424 btrfs_set_header_nritems(src, src_nritems - push_items);
2425 btrfs_set_header_nritems(dst, dst_nritems + push_items);
2427 btrfs_mark_buffer_dirty(src);
2428 btrfs_mark_buffer_dirty(dst);
2434 * helper function to insert a new root level in the tree.
2435 * A new node is allocated, and a single item is inserted to
2436 * point to the existing root
2438 * returns zero on success or < 0 on failure.
2440 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2441 struct btrfs_root *root,
2442 struct btrfs_path *path, int level)
2444 struct btrfs_fs_info *fs_info = root->fs_info;
2446 struct extent_buffer *lower;
2447 struct extent_buffer *c;
2448 struct extent_buffer *old;
2449 struct btrfs_disk_key lower_key;
2452 BUG_ON(path->nodes[level]);
2453 BUG_ON(path->nodes[level-1] != root->node);
2455 lower = path->nodes[level-1];
/* The new root's single key is the lowest key of the current root. */
2457 btrfs_item_key(lower, &lower_key, 0);
2459 btrfs_node_key(lower, &lower_key, 0);
2461 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
2462 root->node->start, 0,
2463 BTRFS_NESTING_NEW_ROOT);
2467 root_add_used(root, fs_info->nodesize);
/* Wire the old root in as the sole child of the new root node. */
2469 btrfs_set_header_nritems(c, 1);
2470 btrfs_set_node_key(c, &lower_key, 0);
2471 btrfs_set_node_blockptr(c, 0, lower->start);
2472 lower_gen = btrfs_header_generation(lower);
2473 WARN_ON(lower_gen != trans->transid);
2475 btrfs_set_node_ptr_generation(c, 0, lower_gen);
2477 btrfs_mark_buffer_dirty(c);
2480 ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2482 rcu_assign_pointer(root->node, c);
2484 /* the super has an extra ref to root->node */
2485 free_extent_buffer(old);
2487 add_root_to_dirty_list(root);
2488 atomic_inc(&c->refs);
2489 path->nodes[level] = c;
2490 path->locks[level] = BTRFS_WRITE_LOCK;
2491 path->slots[level] = 0;
2496 * worker function to insert a single pointer in a node.
2497 * the node should have enough room for the pointer already
2499 * slot and level indicate where you want the key to go, and
2500 * blocknr is the block the key points to.
2502 static void insert_ptr(struct btrfs_trans_handle *trans,
2503 struct btrfs_path *path,
2504 struct btrfs_disk_key *key, u64 bytenr,
2505 int slot, int level)
2507 struct extent_buffer *lower;
2511 BUG_ON(!path->nodes[level]);
2512 btrfs_assert_tree_locked(path->nodes[level]);
2513 lower = path->nodes[level];
2514 nritems = btrfs_header_nritems(lower);
2515 BUG_ON(slot > nritems);
/* Caller guarantees room; a full node here is a logic error. */
2516 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2517 if (slot != nritems) {
2519 ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2520 slot, nritems - slot);
/* Shift later pointers right to make room at @slot. */
2523 memmove_extent_buffer(lower,
2524 btrfs_node_key_ptr_offset(slot + 1),
2525 btrfs_node_key_ptr_offset(slot),
2526 (nritems - slot) * sizeof(struct btrfs_key_ptr));
2529 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2530 BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
2533 btrfs_set_node_key(lower, key, slot);
2534 btrfs_set_node_blockptr(lower, slot, bytenr);
2535 WARN_ON(trans->transid == 0);
2536 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2537 btrfs_set_header_nritems(lower, nritems + 1);
2538 btrfs_mark_buffer_dirty(lower);
2542 * split the node at the specified level in path in two.
2543 * The path is corrected to point to the appropriate node after the split
2545 * Before splitting this tries to make some room in the node by pushing
2546 * left and right, if either one works, it returns right away.
2548 * returns 0 on success and < 0 on failure
2550 static noinline int split_node(struct btrfs_trans_handle *trans,
2551 struct btrfs_root *root,
2552 struct btrfs_path *path, int level)
2554 struct btrfs_fs_info *fs_info = root->fs_info;
2555 struct extent_buffer *c;
2556 struct extent_buffer *split;
2557 struct btrfs_disk_key disk_key;
2562 c = path->nodes[level];
2563 WARN_ON(btrfs_header_generation(c) != trans->transid);
2564 if (c == root->node) {
2566 * trying to split the root, lets make a new one
2568 * tree mod log: We don't log_removal old root in
2569 * insert_new_root, because that root buffer will be kept as a
2570 * normal node. We are going to log removal of half of the
2571 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2572 * holding a tree lock on the buffer, which is why we cannot
2573 * race with other tree_mod_log users.
2575 ret = insert_new_root(trans, root, path, level + 1);
/* Not the root: try pushing to siblings first; it may avoid the split. */
2579 ret = push_nodes_for_insert(trans, root, path, level);
2580 c = path->nodes[level];
2581 if (!ret && btrfs_header_nritems(c) <
2582 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2588 c_nritems = btrfs_header_nritems(c);
2589 mid = (c_nritems + 1) / 2;
2590 btrfs_node_key(c, &disk_key, mid);
2592 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
2593 c->start, 0, BTRFS_NESTING_SPLIT);
2595 return PTR_ERR(split);
2597 root_add_used(root, fs_info->nodesize);
2598 ASSERT(btrfs_header_level(c) == level);
2600 ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2602 btrfs_abort_transaction(trans, ret);
/* Move the upper half of c's pointers into the new right sibling. */
2605 copy_extent_buffer(split, c,
2606 btrfs_node_key_ptr_offset(0),
2607 btrfs_node_key_ptr_offset(mid),
2608 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2609 btrfs_set_header_nritems(split, c_nritems - mid);
2610 btrfs_set_header_nritems(c, mid);
2612 btrfs_mark_buffer_dirty(c);
2613 btrfs_mark_buffer_dirty(split);
/* Link the new sibling into the parent, right after c. */
2615 insert_ptr(trans, path, &disk_key, split->start,
2616 path->slots[level + 1] + 1, level + 1);
/* Re-point the path at whichever half now holds the target slot. */
2618 if (path->slots[level] >= mid) {
2619 path->slots[level] -= mid;
2620 btrfs_tree_unlock(c);
2621 free_extent_buffer(c);
2622 path->nodes[level] = split;
2623 path->slots[level + 1] += 1;
2625 btrfs_tree_unlock(split);
2626 free_extent_buffer(split);
2632 * how many bytes are required to store the items in a leaf. start
2633 * and nr indicate which items in the leaf to check. This totals up the
2634 * space used both by the item structs and the item data
2636 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2638 struct btrfs_item *start_item;
2639 struct btrfs_item *end_item;
2641 int nritems = btrfs_header_nritems(l);
/* Clamp the range so it never reads past the last item. */
2642 int end = min(nritems, start + nr) - 1;
2646 start_item = btrfs_item_nr(start);
2647 end_item = btrfs_item_nr(end);
/* Data region spans from end of last item's data to end of first item's. */
2648 data_len = btrfs_item_offset(l, start_item) +
2649 btrfs_item_size(l, start_item);
2650 data_len = data_len - btrfs_item_offset(l, end_item);
/* Plus the fixed-size item headers themselves. */
2651 data_len += sizeof(struct btrfs_item) * nr;
2652 WARN_ON(data_len < 0);
2657 * The space between the end of the leaf items and
2658 * the start of the leaf data. IOW, how much room
2659 * the leaf has left for both items and data
2661 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2663 struct btrfs_fs_info *fs_info = leaf->fs_info;
2664 int nritems = btrfs_header_nritems(leaf);
/* Free space = total leaf data area minus everything currently used. */
2667 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
/* A negative result means on-disk corruption; the message below reports it. */
2670 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2672 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2673 leaf_space_used(leaf, 0, nritems), nritems);
2679 * min slot controls the lowest index we're willing to push to the
2680 * right. We'll push up to and including min_slot, but no lower
/*
 * Move as many items as fit (bounded below by min_slot) from the tail of the
 * left leaf (path->nodes[0]) into 'right', then update item offsets, header
 * counts, the parent key for 'right', and fix up the path to point at the
 * leaf that now holds path->slots[0].
 * NOTE(review): extract elides some lines; loop/brace structure and the
 * counting loop's decrement of 'i' are not fully visible — confirm against
 * the full source before relying on exact control flow.
 */
2682 static noinline int __push_leaf_right(struct btrfs_path *path,
2683 int data_size, int empty,
2684 struct extent_buffer *right,
2685 int free_space, u32 left_nritems,
2688 struct btrfs_fs_info *fs_info = right->fs_info;
2689 struct extent_buffer *left = path->nodes[0];
2690 struct extent_buffer *upper = path->nodes[1];
2691 struct btrfs_map_token token;
2692 struct btrfs_disk_key disk_key;
2697 struct btrfs_item *item;
/* never consider pushing fewer than one slot's worth */
2706 nr = max_t(u32, 1, min_slot);
2708 if (path->slots[0] >= left_nritems)
2709 push_space += data_size;
2711 slot = path->slots[1];
/* walk items from the rightmost item of the left leaf */
2712 i = left_nritems - 1;
2714 item = btrfs_item_nr(i);
2716 if (!empty && push_items > 0) {
2717 if (path->slots[0] > i)
2719 if (path->slots[0] == i) {
2720 int space = btrfs_leaf_free_space(left);
2722 if (space + push_space * 2 > free_space)
2727 if (path->slots[0] == i)
/* reserve room for the pending insertion at this slot */
2728 push_space += data_size;
2730 this_item_size = btrfs_item_size(left, item);
/* stop once the next item (header + data) would overflow 'right' */
2731 if (this_item_size + sizeof(*item) + push_space > free_space)
2735 push_space += this_item_size + sizeof(*item);
2741 if (push_items == 0)
2744 WARN_ON(!empty && push_items == left_nritems);
2746 /* push left to right */
2747 right_nritems = btrfs_header_nritems(right);
/* bytes of item data being transferred from the left leaf's tail */
2749 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2750 push_space -= leaf_data_end(left);
2752 /* make room in the right data area */
2753 data_end = leaf_data_end(right);
2754 memmove_extent_buffer(right,
2755 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
2756 BTRFS_LEAF_DATA_OFFSET + data_end,
2757 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
2759 /* copy from the left data area */
2760 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
2761 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
2762 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
/* shift right's existing item headers to make room at the front */
2765 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2766 btrfs_item_nr_offset(0),
2767 right_nritems * sizeof(struct btrfs_item));
2769 /* copy the items from left to right */
2770 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2771 btrfs_item_nr_offset(left_nritems - push_items),
2772 push_items * sizeof(struct btrfs_item));
2774 /* update the item pointers */
2775 btrfs_init_map_token(&token, right);
2776 right_nritems += push_items;
2777 btrfs_set_header_nritems(right, right_nritems);
/* recompute every offset in 'right' from the end of the data area down */
2778 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
2779 for (i = 0; i < right_nritems; i++) {
2780 item = btrfs_item_nr(i);
2781 push_space -= btrfs_token_item_size(&token, item);
2782 btrfs_set_token_item_offset(&token, item, push_space);
2785 left_nritems -= push_items;
2786 btrfs_set_header_nritems(left, left_nritems);
2789 btrfs_mark_buffer_dirty(left);
2791 btrfs_clean_tree_block(left);
2793 btrfs_mark_buffer_dirty(right);
/* right's first key changed; propagate it to the parent node */
2795 btrfs_item_key(right, &disk_key, 0);
2796 btrfs_set_node_key(upper, &disk_key, slot + 1);
2797 btrfs_mark_buffer_dirty(upper);
2799 /* then fixup the leaf pointer in the path */
2800 if (path->slots[0] >= left_nritems) {
/* our slot moved into 'right'; re-point the path there */
2801 path->slots[0] -= left_nritems;
2802 if (btrfs_header_nritems(path->nodes[0]) == 0)
2803 btrfs_clean_tree_block(path->nodes[0]);
2804 btrfs_tree_unlock(path->nodes[0]);
2805 free_extent_buffer(path->nodes[0]);
2806 path->nodes[0] = right;
2807 path->slots[1] += 1;
2809 btrfs_tree_unlock(right);
2810 free_extent_buffer(right);
/* (error/out path) drop our reference and lock on 'right' */
2815 btrfs_tree_unlock(right);
2816 free_extent_buffer(right);
2821 * push some data in the path leaf to the right, trying to free up at
2822 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2824 * returns 1 if the push failed because the other node didn't have enough
2825 * room, 0 if everything worked out and < 0 if there were major errors.
2827 * this will push starting from min_slot to the end of the leaf. It won't
2828 * push any slot lower than min_slot
/*
 * Try to free at least data_size bytes in path->nodes[0] by pushing items
 * into the right sibling leaf: read and lock the sibling, COW it, verify
 * free space and key ordering, then hand off to __push_leaf_right().
 * Also short-circuits the "append at end of leaf" case by moving the path
 * directly to the (empty-enough) right neighbor.
 * NOTE(review): extract elides early-return lines after several checks.
 */
2830 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2831 *root, struct btrfs_path *path,
2832 int min_data_size, int data_size,
2833 int empty, u32 min_slot)
2835 struct extent_buffer *left = path->nodes[0];
2836 struct extent_buffer *right;
2837 struct extent_buffer *upper;
/* no parent means this leaf is the root: nothing to push into */
2843 if (!path->nodes[1])
2846 slot = path->slots[1];
2847 upper = path->nodes[1];
/* no right sibling if we're already the parent's last child */
2848 if (slot >= btrfs_header_nritems(upper) - 1)
2851 btrfs_assert_tree_locked(path->nodes[1]);
2853 right = btrfs_read_node_slot(upper, slot + 1);
2855 * slot + 1 is not valid or we fail to read the right node,
2856 * no big deal, just return.
2861 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2863 free_space = btrfs_leaf_free_space(right);
2864 if (free_space < data_size)
2867 /* cow and double check */
2868 ret = btrfs_cow_block(trans, root, right, upper,
2869 slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
/* COW may relocate the block; re-check free space afterwards */
2873 free_space = btrfs_leaf_free_space(right);
2874 if (free_space < data_size)
2877 left_nritems = btrfs_header_nritems(left);
2878 if (left_nritems == 0)
2881 if (check_sibling_keys(left, right)) {
/* sibling key order is broken: bail out without pushing */
2883 btrfs_tree_unlock(right);
2884 free_extent_buffer(right);
2887 if (path->slots[0] == left_nritems && !empty) {
2888 /* Key greater than all keys in the leaf, right neighbor has
2889 * enough room for it and we're not emptying our leaf to delete
2890 * it, therefore use right neighbor to insert the new item and
2891 * no need to touch/dirty our left leaf. */
2892 btrfs_tree_unlock(left);
2893 free_extent_buffer(left);
2894 path->nodes[0] = right;
2900 return __push_leaf_right(path, min_data_size, empty,
2901 right, free_space, left_nritems, min_slot);
/* (out path) release the right sibling we locked above */
2903 btrfs_tree_unlock(right);
2904 free_extent_buffer(right);
2909 * push some data in the path leaf to the left, trying to free up at
2910 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2912 * max_slot can put a limit on how far into the leaf we'll push items. The
2913 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
/*
 * Mirror of __push_leaf_right(): move items from the front of the right
 * leaf (path->nodes[0]) into 'left', up to (not including) max_slot,
 * then compact the right leaf, rewrite offsets on both sides, fix the
 * parent's low key, and re-point the path if our slot moved left.
 * NOTE(review): extract elides some lines; exact brace structure of the
 * counting loop is not fully visible here.
 */
2916 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
2917 int empty, struct extent_buffer *left,
2918 int free_space, u32 right_nritems,
2921 struct btrfs_fs_info *fs_info = left->fs_info;
2922 struct btrfs_disk_key disk_key;
2923 struct extent_buffer *right = path->nodes[0];
2927 struct btrfs_item *item;
2928 u32 old_left_nritems;
2932 u32 old_left_item_size;
2933 struct btrfs_map_token token;
2936 nr = min(right_nritems, max_slot);
/* when not emptying, always keep at least one item in the right leaf */
2938 nr = min(right_nritems - 1, max_slot);
2940 for (i = 0; i < nr; i++) {
2941 item = btrfs_item_nr(i);
2943 if (!empty && push_items > 0) {
2944 if (path->slots[0] < i)
2946 if (path->slots[0] == i) {
2947 int space = btrfs_leaf_free_space(right);
2949 if (space + push_space * 2 > free_space)
2954 if (path->slots[0] == i)
/* reserve room for the pending insertion at this slot */
2955 push_space += data_size;
2957 this_item_size = btrfs_item_size(right, item);
/* stop once the next item (header + data) would overflow 'left' */
2958 if (this_item_size + sizeof(*item) + push_space > free_space)
2962 push_space += this_item_size + sizeof(*item);
2965 if (push_items == 0) {
2969 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
2971 /* push data from right to left */
2972 copy_extent_buffer(left, right,
2973 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2974 btrfs_item_nr_offset(0),
2975 push_items * sizeof(struct btrfs_item));
/* bytes of item data being transferred from the right leaf's front */
2977 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
2978 btrfs_item_offset_nr(right, push_items - 1);
2980 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
2981 leaf_data_end(left) - push_space,
2982 BTRFS_LEAF_DATA_OFFSET +
2983 btrfs_item_offset_nr(right, push_items - 1),
2985 old_left_nritems = btrfs_header_nritems(left);
2986 BUG_ON(old_left_nritems <= 0);
2988 btrfs_init_map_token(&token, left);
/* rebase the newcomers' offsets relative to left's data area */
2989 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2990 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2993 item = btrfs_item_nr(i);
2995 ioff = btrfs_token_item_offset(&token, item);
2996 btrfs_set_token_item_offset(&token, item,
2997 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
2999 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3001 /* fixup right node */
3002 if (push_items > right_nritems)
3003 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3006 if (push_items < right_nritems) {
/* compact remaining data to the end of right's data area */
3007 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3008 leaf_data_end(right);
3009 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3010 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3011 BTRFS_LEAF_DATA_OFFSET +
3012 leaf_data_end(right), push_space);
/* slide remaining item headers to the front */
3014 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3015 btrfs_item_nr_offset(push_items),
3016 (btrfs_header_nritems(right) - push_items) *
3017 sizeof(struct btrfs_item));
3020 btrfs_init_map_token(&token, right);
3021 right_nritems -= push_items;
3022 btrfs_set_header_nritems(right, right_nritems);
/* recompute every offset in 'right' from the end of the data area down */
3023 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3024 for (i = 0; i < right_nritems; i++) {
3025 item = btrfs_item_nr(i);
3027 push_space = push_space - btrfs_token_item_size(&token, item);
3028 btrfs_set_token_item_offset(&token, item, push_space);
3031 btrfs_mark_buffer_dirty(left);
3033 btrfs_mark_buffer_dirty(right);
3035 btrfs_clean_tree_block(right);
/* right's first key changed; update the parent's low key */
3037 btrfs_item_key(right, &disk_key, 0);
3038 fixup_low_keys(path, &disk_key, 1);
3040 /* then fixup the leaf pointer in the path */
3041 if (path->slots[0] < push_items) {
/* our slot moved into 'left'; re-point the path there */
3042 path->slots[0] += old_left_nritems;
3043 btrfs_tree_unlock(path->nodes[0]);
3044 free_extent_buffer(path->nodes[0]);
3045 path->nodes[0] = left;
3046 path->slots[1] -= 1;
3048 btrfs_tree_unlock(left);
3049 free_extent_buffer(left);
3050 path->slots[0] -= push_items;
3052 BUG_ON(path->slots[0] < 0);
/* (error/out path) release our hold on 'left' */
3055 btrfs_tree_unlock(left);
3056 free_extent_buffer(left);
3061 * push some data in the path leaf to the left, trying to free up at
3062 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3064 * max_slot can put a limit on how far into the leaf we'll push items. The
3065 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
/*
 * Try to free at least data_size bytes in path->nodes[0] by pushing items
 * into the left sibling leaf: read and lock the sibling, COW it, verify
 * free space and key ordering, then hand off to __push_leaf_left().
 * NOTE(review): extract elides early-return lines after several checks.
 */
3068 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3069 *root, struct btrfs_path *path, int min_data_size,
3070 int data_size, int empty, u32 max_slot)
3072 struct extent_buffer *right = path->nodes[0];
3073 struct extent_buffer *left;
3079 slot = path->slots[1];
/* no parent means this leaf is the root: nothing to push into */
3082 if (!path->nodes[1])
3085 right_nritems = btrfs_header_nritems(right);
3086 if (right_nritems == 0)
3089 btrfs_assert_tree_locked(path->nodes[1]);
3091 left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3093 * slot - 1 is not valid or we fail to read the left node,
3094 * no big deal, just return.
3099 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3101 free_space = btrfs_leaf_free_space(left);
3102 if (free_space < data_size) {
3107 /* cow and double check */
3108 ret = btrfs_cow_block(trans, root, left,
3109 path->nodes[1], slot - 1, &left,
3110 BTRFS_NESTING_LEFT_COW);
3112 /* we hit -ENOSPC, but it isn't fatal here */
/* COW may relocate the block; re-check free space afterwards */
3118 free_space = btrfs_leaf_free_space(left);
3119 if (free_space < data_size) {
3124 if (check_sibling_keys(left, right)) {
3128 return __push_leaf_left(path, min_data_size,
3129 empty, left, free_space, right_nritems,
/* (out path) release the left sibling we locked above */
3132 btrfs_tree_unlock(left);
3133 free_extent_buffer(left);
3138 * split the path's leaf in two, making sure there is at least data_size
3139 * available for the resulting leaf level of the path.
/*
 * Second half of a leaf split: copy items [mid, nritems) and their data from
 * leaf 'l' into the freshly allocated 'right', rebase the copied offsets,
 * shrink 'l' to mid items, link 'right' into the parent via insert_ptr(),
 * and fix the path to point at whichever leaf now holds 'slot'.
 */
3141 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3142 struct btrfs_path *path,
3143 struct extent_buffer *l,
3144 struct extent_buffer *right,
3145 int slot, int mid, int nritems)
3147 struct btrfs_fs_info *fs_info = trans->fs_info;
3151 struct btrfs_disk_key disk_key;
3152 struct btrfs_map_token token;
/* 'nritems' becomes the count moving to the right leaf */
3154 nritems = nritems - mid;
3155 btrfs_set_header_nritems(right, nritems);
3156 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
/* copy the item headers for [mid, end) to the front of 'right' */
3158 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3159 btrfs_item_nr_offset(mid),
3160 nritems * sizeof(struct btrfs_item));
/* copy their data to the tail of right's data area */
3162 copy_extent_buffer(right, l,
3163 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3164 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
3165 leaf_data_end(l), data_copy_size);
/* delta between old offsets (in 'l') and new offsets (in 'right') */
3167 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
3169 btrfs_init_map_token(&token, right);
3170 for (i = 0; i < nritems; i++) {
3171 struct btrfs_item *item = btrfs_item_nr(i);
3174 ioff = btrfs_token_item_offset(&token, item);
3175 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
3178 btrfs_set_header_nritems(l, mid);
/* link 'right' into the parent just after 'l' */
3179 btrfs_item_key(right, &disk_key, 0);
3180 insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3182 btrfs_mark_buffer_dirty(right);
3183 btrfs_mark_buffer_dirty(l);
3184 BUG_ON(path->slots[0] != slot);
/* our slot landed in 'right': swap it into the path */
3187 btrfs_tree_unlock(path->nodes[0]);
3188 free_extent_buffer(path->nodes[0]);
3189 path->nodes[0] = right;
3190 path->slots[0] -= mid;
3191 path->slots[1] += 1;
/* (else) slot stayed in 'l': drop our extra hold on 'right' */
3193 btrfs_tree_unlock(right);
3194 free_extent_buffer(right);
3197 BUG_ON(path->slots[0] < 0);
3201 * double splits happen when we need to insert a big item in the middle
3202 * of a leaf. A double split can leave us with 3 mostly empty leaves:
3203 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3206 * We avoid this by trying to push the items on either side of our target
3207 * into the adjacent leaves. If all goes well we can avoid the double split
/*
 * Avoid a double split when inserting a big item mid-leaf: push the items
 * after our slot to the right sibling, and if that doesn't leave us at a
 * leaf edge (or with enough space), push the items before our slot left.
 */
3210 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3211 struct btrfs_root *root,
3212 struct btrfs_path *path,
3219 int space_needed = data_size;
3221 slot = path->slots[0];
/* credit whatever free space the leaf already has */
3222 if (slot < btrfs_header_nritems(path->nodes[0]))
3223 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3226 * try to push all the items after our slot into the
3229 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3236 nritems = btrfs_header_nritems(path->nodes[0]);
3238 * our goal is to get our slot at the start or end of a leaf. If
3239 * we've done so we're done
3241 if (path->slots[0] == 0 || path->slots[0] == nritems)
3244 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3247 /* try to push all the items before our slot into the next leaf */
3248 slot = path->slots[0];
3249 space_needed = data_size;
3251 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3252 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3265 * split the path's leaf in two, making sure there is at least data_size
3266 * available for the resulting leaf level of the path.
3268 * returns 0 if all went well and < 0 on failure.
/*
 * Split path's leaf so at least data_size bytes are available at the slot
 * where ins_key will go. Strategy: try pushing to siblings first; grow a
 * new root if this leaf is the root; choose a split point 'mid' that leaves
 * room on the proper side (possibly an empty new leaf when inserting at an
 * edge); allocate 'right' and either hand off to copy_for_split() or link
 * an empty 'right' directly. May recurse once (double split).
 * NOTE(review): extract elides many branch bodies and the retry logic
 * around num_doubles — verify control flow against the full source.
 */
3270 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3271 struct btrfs_root *root,
3272 const struct btrfs_key *ins_key,
3273 struct btrfs_path *path, int data_size,
3276 struct btrfs_disk_key disk_key;
3277 struct extent_buffer *l;
3281 struct extent_buffer *right;
3282 struct btrfs_fs_info *fs_info = root->fs_info;
3286 int num_doubles = 0;
3287 int tried_avoid_double = 0;
3290 slot = path->slots[0];
/* an extended item plus its header must still fit in one leaf */
3291 if (extend && data_size + btrfs_item_size_nr(l, slot) +
3292 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3295 /* first try to make some room by pushing left and right */
3296 if (data_size && path->nodes[1]) {
3297 int space_needed = data_size;
3299 if (slot < btrfs_header_nritems(l))
3300 space_needed -= btrfs_leaf_free_space(l);
3302 wret = push_leaf_right(trans, root, path, space_needed,
3303 space_needed, 0, 0);
3307 space_needed = data_size;
3309 space_needed -= btrfs_leaf_free_space(l);
3310 wret = push_leaf_left(trans, root, path, space_needed,
3311 space_needed, 0, (u32)-1);
3317 /* did the pushes work? */
3318 if (btrfs_leaf_free_space(l) >= data_size)
/* leaf is the root: add a new root level above it before splitting */
3322 if (!path->nodes[1]) {
3323 ret = insert_new_root(trans, root, path, 1);
3330 slot = path->slots[0];
3331 nritems = btrfs_header_nritems(l);
/* default split point: halfway (rounded up) */
3332 mid = (nritems + 1) / 2;
3336 leaf_space_used(l, mid, nritems - mid) + data_size >
3337 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3338 if (slot >= nritems) {
3342 if (mid != nritems &&
3343 leaf_space_used(l, mid, nritems - mid) +
3344 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
/* mid-leaf target with no room either side: try the push dance first */
3345 if (data_size && !tried_avoid_double)
3346 goto push_for_double;
3352 if (leaf_space_used(l, 0, mid) + data_size >
3353 BTRFS_LEAF_DATA_SIZE(fs_info)) {
3354 if (!extend && data_size && slot == 0) {
3356 } else if ((extend || !data_size) && slot == 0) {
3360 if (mid != nritems &&
3361 leaf_space_used(l, mid, nritems - mid) +
3362 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3363 if (data_size && !tried_avoid_double)
3364 goto push_for_double;
/* key for the new right leaf: the insert key or the key at 'mid' */
3372 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3374 btrfs_item_key(l, &disk_key, mid);
3377 * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3378 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3379 * subclasses, which is 8 at the time of this patch, and we've maxed it
3380 * out. In the future we could add a
3381 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3382 * use BTRFS_NESTING_NEW_ROOT.
3384 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
3385 l->start, 0, num_doubles ?
3386 BTRFS_NESTING_NEW_ROOT :
3387 BTRFS_NESTING_SPLIT);
3389 return PTR_ERR(right);
3391 root_add_used(root, fs_info->nodesize);
/* edge case: link an empty 'right' after the current leaf */
3395 btrfs_set_header_nritems(right, 0);
3396 insert_ptr(trans, path, &disk_key,
3397 right->start, path->slots[1] + 1, 1);
3398 btrfs_tree_unlock(path->nodes[0]);
3399 free_extent_buffer(path->nodes[0]);
3400 path->nodes[0] = right;
3402 path->slots[1] += 1;
/* edge case: link an empty 'right' before the current leaf */
3404 btrfs_set_header_nritems(right, 0);
3405 insert_ptr(trans, path, &disk_key,
3406 right->start, path->slots[1], 1);
3407 btrfs_tree_unlock(path->nodes[0]);
3408 free_extent_buffer(path->nodes[0]);
3409 path->nodes[0] = right;
3411 if (path->slots[1] == 0)
3412 fixup_low_keys(path, &disk_key, 1);
3415 * We create a new leaf 'right' for the required ins_len and
3416 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3417 * the content of ins_len to 'right'.
/* normal case: actually move [mid, end) into 'right' */
3422 copy_for_split(trans, path, l, right, slot, mid, nritems);
3425 BUG_ON(num_doubles != 0);
/* push_for_double: try to avoid the 3-leaf double split via pushes */
3433 push_for_double_split(trans, root, path, data_size);
3434 tried_avoid_double = 1;
3435 if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
/*
 * Prepare path's leaf so an item split has ins_len bytes of room: if the
 * leaf lacks space, release the path, re-search with keep_locks and
 * search_for_split set, verify the target item is unchanged (size, and for
 * EXTENT_DATA its num_bytes), then split_leaf() with extend=1.
 * Only EXTENT_DATA and EXTENT_CSUM items are expected here (BUG_ON).
 */
3440 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3441 struct btrfs_root *root,
3442 struct btrfs_path *path, int ins_len)
3444 struct btrfs_key key;
3445 struct extent_buffer *leaf;
3446 struct btrfs_file_extent_item *fi;
3451 leaf = path->nodes[0];
3452 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3454 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3455 key.type != BTRFS_EXTENT_CSUM_KEY);
/* fast path: the leaf already has enough room */
3457 if (btrfs_leaf_free_space(leaf) >= ins_len)
3460 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3461 if (key.type == BTRFS_EXTENT_DATA_KEY) {
/* remember extent length so we can detect concurrent modification */
3462 fi = btrfs_item_ptr(leaf, path->slots[0],
3463 struct btrfs_file_extent_item);
3464 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3466 btrfs_release_path(path);
3468 path->keep_locks = 1;
3469 path->search_for_split = 1;
3470 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3471 path->search_for_split = 0;
3478 leaf = path->nodes[0];
3479 /* if our item isn't there, return now */
3480 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3483 /* the leaf has changed, it now has room. return now */
3484 if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3487 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3488 fi = btrfs_item_ptr(leaf, path->slots[0],
3489 struct btrfs_file_extent_item);
/* someone changed the extent underneath us: give up */
3490 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
/* extend=1: we need room next to the existing item, not a new one */
3494 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3498 path->keep_locks = 0;
3499 btrfs_unlock_up_safe(path, 1);
/* (error path) restore keep_locks before returning */
3502 path->keep_locks = 0;
/*
 * Split the item at path->slots[0] in place: buffer its data, open a new
 * item slot right after it, give the new item 'new_key' and the trailing
 * [split_offset, item_size) bytes, shrink the original to split_offset
 * bytes, and write both halves back. Caller guarantees one extra struct
 * btrfs_item of free space (BUG_ON below).
 */
3506 static noinline int split_item(struct btrfs_path *path,
3507 const struct btrfs_key *new_key,
3508 unsigned long split_offset)
3510 struct extent_buffer *leaf;
3511 struct btrfs_item *item;
3512 struct btrfs_item *new_item;
3518 struct btrfs_disk_key disk_key;
3520 leaf = path->nodes[0];
3521 BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
3523 item = btrfs_item_nr(path->slots[0]);
3524 orig_offset = btrfs_item_offset(leaf, item);
3525 item_size = btrfs_item_size(leaf, item);
/* stage the whole item's data in a kernel buffer before rearranging */
3527 buf = kmalloc(item_size, GFP_NOFS);
3531 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3532 path->slots[0]), item_size);
3534 slot = path->slots[0] + 1;
3535 nritems = btrfs_header_nritems(leaf);
3536 if (slot != nritems) {
3537 /* shift the items */
3538 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3539 btrfs_item_nr_offset(slot),
3540 (nritems - slot) * sizeof(struct btrfs_item));
3543 btrfs_cpu_key_to_disk(&disk_key, new_key);
3544 btrfs_set_item_key(leaf, &disk_key, slot);
3546 new_item = btrfs_item_nr(slot);
/* new item takes the tail of the original's data region */
3548 btrfs_set_item_offset(leaf, new_item, orig_offset);
3549 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
/* original keeps the head, now starting higher in the data area */
3551 btrfs_set_item_offset(leaf, item,
3552 orig_offset + item_size - split_offset);
3553 btrfs_set_item_size(leaf, item, split_offset);
3555 btrfs_set_header_nritems(leaf, nritems + 1);
3557 /* write the data for the start of the original item */
3558 write_extent_buffer(leaf, buf,
3559 btrfs_item_ptr_offset(leaf, path->slots[0]),
3562 /* write the data for the new item */
3563 write_extent_buffer(leaf, buf + split_offset,
3564 btrfs_item_ptr_offset(leaf, slot),
3565 item_size - split_offset);
3566 btrfs_mark_buffer_dirty(leaf);
3568 BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3574 * This function splits a single item into two items,
3575 * giving 'new_key' to the new item and splitting the
3576 * old one at split_offset (from the start of the item).
3578 * The path may be released by this operation. After
3579 * the split, the path is pointing to the old item. The
3580 * new item is going to be in the same node as the old one.
3582 * Note, the item being split must be smaller enough to live alone on
3583 * a tree block with room for one extra struct btrfs_item
3585 * This allows us to split the item in place, keeping a lock on the
3586 * leaf the entire time.
/*
 * Public entry: make room for one extra item header via
 * setup_leaf_for_split(), then perform the in-place split with split_item().
 * The path ends up pointing at the old (front) half.
 */
3588 int btrfs_split_item(struct btrfs_trans_handle *trans,
3589 struct btrfs_root *root,
3590 struct btrfs_path *path,
3591 const struct btrfs_key *new_key,
3592 unsigned long split_offset)
3595 ret = setup_leaf_for_split(trans, root, path,
3596 sizeof(struct btrfs_item));
3600 ret = split_item(path, new_key, split_offset);
3605 * This function duplicate a item, giving 'new_key' to the new item.
3606 * It guarantees both items live in the same tree leaf and the new item
3607 * is contiguous with the original item.
3609 * This allows us to split file extent in place, keeping a lock on the
3610 * leaf the entire time.
/*
 * Duplicate the item at path->slots[0] under 'new_key', in the same leaf
 * and immediately after the original: reserve room, insert an empty item
 * via setup_items_for_insert(), then copy the original item's bytes into it.
 */
3612 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3613 struct btrfs_root *root,
3614 struct btrfs_path *path,
3615 const struct btrfs_key *new_key)
3617 struct extent_buffer *leaf;
3621 leaf = path->nodes[0];
3622 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
/* need room for the copy's data plus one item header */
3623 ret = setup_leaf_for_split(trans, root, path,
3624 item_size + sizeof(struct btrfs_item));
3629 setup_items_for_insert(root, path, new_key, &item_size, 1);
3630 leaf = path->nodes[0];
/* copy data from the original (now at slots[0] - 1) into the new slot */
3631 memcpy_extent_buffer(leaf,
3632 btrfs_item_ptr_offset(leaf, path->slots[0]),
3633 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3639 * make the item pointed to by the path smaller. new_size indicates
3640 * how small to make it, and from_end tells us if we just chop bytes
3641 * off the end of the item or if we shift the item to chop bytes off
/*
 * Shrink the item at path->slots[0] to new_size bytes. from_end selects
 * which end is chopped: nonzero drops the tail (data before the item moves
 * up by size_diff); zero drops the front (data moves, the item's key offset
 * is advanced by size_diff, and inline file extents get their header
 * preserved by shifting everything after the fixed header).
 */
3644 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3647 struct extent_buffer *leaf;
3648 struct btrfs_item *item;
3650 unsigned int data_end;
3651 unsigned int old_data_start;
3652 unsigned int old_size;
3653 unsigned int size_diff;
3655 struct btrfs_map_token token;
3657 leaf = path->nodes[0];
3658 slot = path->slots[0];
3660 old_size = btrfs_item_size_nr(leaf, slot);
/* nothing to do if the size is unchanged */
3661 if (old_size == new_size)
3664 nritems = btrfs_header_nritems(leaf);
3665 data_end = leaf_data_end(leaf);
3667 old_data_start = btrfs_item_offset_nr(leaf, slot);
3669 size_diff = old_size - new_size;
3672 BUG_ON(slot >= nritems);
3675 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3677 /* first correct the data pointers */
3678 btrfs_init_map_token(&token, leaf);
/* items at or after 'slot' shift their data up by size_diff */
3679 for (i = slot; i < nritems; i++) {
3681 item = btrfs_item_nr(i);
3683 ioff = btrfs_token_item_offset(&token, item);
3684 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
3687 /* shift the data */
/* from_end case: keep the front of the item, move preceding data up */
3689 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3690 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3691 data_end, old_data_start + new_size - data_end);
3693 struct btrfs_disk_key disk_key;
3696 btrfs_item_key(leaf, &disk_key, slot);
3698 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3700 struct btrfs_file_extent_item *fi;
3702 fi = btrfs_item_ptr(leaf, slot,
3703 struct btrfs_file_extent_item);
/* where the file-extent header will live after the shift */
3704 fi = (struct btrfs_file_extent_item *)(
3705 (unsigned long)fi - size_diff);
3707 if (btrfs_file_extent_type(leaf, fi) ==
3708 BTRFS_FILE_EXTENT_INLINE) {
/* inline extent: keep its fixed header in place across the move */
3709 ptr = btrfs_item_ptr_offset(leaf, slot);
3710 memmove_extent_buffer(leaf, ptr,
3712 BTRFS_FILE_EXTENT_INLINE_DATA_START);
3716 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3717 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3718 data_end, old_data_start - data_end);
/* front truncation logically advances the key offset by size_diff */
3720 offset = btrfs_disk_key_offset(&disk_key);
3721 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3722 btrfs_set_item_key(leaf, &disk_key, slot);
3724 fixup_low_keys(path, &disk_key, 1);
3727 item = btrfs_item_nr(slot);
3728 btrfs_set_item_size(leaf, item, new_size);
3729 btrfs_mark_buffer_dirty(leaf);
/* a negative free-space result indicates leaf corruption */
3731 if (btrfs_leaf_free_space(leaf) < 0) {
3732 btrfs_print_leaf(leaf);
3738 * make the item pointed to by the path bigger, data_size is the added size.
/*
 * Grow the item at path->slots[0] by data_size bytes: slide this and all
 * later items' data down by data_size, adjust their offsets, and bump the
 * target item's size. The caller must have ensured the leaf has room; a
 * shortfall here is treated as corruption (leaf is printed).
 */
3740 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
3743 struct extent_buffer *leaf;
3744 struct btrfs_item *item;
3746 unsigned int data_end;
3747 unsigned int old_data;
3748 unsigned int old_size;
3750 struct btrfs_map_token token;
3752 leaf = path->nodes[0];
3754 nritems = btrfs_header_nritems(leaf);
3755 data_end = leaf_data_end(leaf);
/* caller bug or corruption if there is not enough free space */
3757 if (btrfs_leaf_free_space(leaf) < data_size) {
3758 btrfs_print_leaf(leaf);
3761 slot = path->slots[0];
3762 old_data = btrfs_item_end_nr(leaf, slot);
3765 if (slot >= nritems) {
3766 btrfs_print_leaf(leaf);
3767 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
3773 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3775 /* first correct the data pointers */
3776 btrfs_init_map_token(&token, leaf);
/* items at or after 'slot' move their data down by data_size */
3777 for (i = slot; i < nritems; i++) {
3779 item = btrfs_item_nr(i);
3781 ioff = btrfs_token_item_offset(&token, item);
3782 btrfs_set_token_item_offset(&token, item, ioff - data_size);
3785 /* shift the data */
3786 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3787 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
3788 data_end, old_data - data_end);
3790 data_end = old_data;
3791 old_size = btrfs_item_size_nr(leaf, slot);
3792 item = btrfs_item_nr(slot);
3793 btrfs_set_item_size(leaf, item, old_size + data_size);
3794 btrfs_mark_buffer_dirty(leaf);
/* a negative free-space result indicates leaf corruption */
3796 if (btrfs_leaf_free_space(leaf) < 0) {
3797 btrfs_print_leaf(leaf);
3803 * setup_items_for_insert - Helper called before inserting one or more items
3804 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3805 * in a function that doesn't call btrfs_search_slot
3807 * @root: root we are inserting items to
3808 * @path: points to the leaf/slot where we are going to insert new items
3809 * @cpu_key: array of keys for items to be inserted
3810 * @data_size: size of the body of each item we are going to insert
3811 * @nr: size of @cpu_key/@data_size arrays
/*
 * setup_items_for_insert - Helper called before inserting one or more items
 * to a leaf. Main purpose is to save stack depth by doing the bulk of the
 * work in a function that doesn't call btrfs_search_slot.
 *
 * @root:      root we are inserting items to
 * @path:      points to the leaf/slot where we are going to insert new items
 * @cpu_key:   array of keys for items to be inserted
 * @data_size: size of the body of each item we are going to insert
 * @nr:        size of @cpu_key/@data_size arrays
 *
 * Makes room in the leaf at path->slots[0] (shifting later items and their
 * data), writes the new keys/offsets/sizes, and bumps nritems. The item
 * data itself is NOT written here — the caller fills the bodies.
 */
3813 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3814 const struct btrfs_key *cpu_key, u32 *data_size,
3817 struct btrfs_fs_info *fs_info = root->fs_info;
3818 struct btrfs_item *item;
3821 unsigned int data_end;
3822 struct btrfs_disk_key disk_key;
3823 struct extent_buffer *leaf;
3825 struct btrfs_map_token token;
3829 for (i = 0; i < nr; i++)
3830 total_data += data_size[i];
/* data bytes plus one item header per new item */
3831 total_size = total_data + (nr * sizeof(struct btrfs_item));
/* inserting at slot 0 changes the leaf's low key: fix the parents */
3833 if (path->slots[0] == 0) {
3834 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3835 fixup_low_keys(path, &disk_key, 1);
3837 btrfs_unlock_up_safe(path, 1);
3839 leaf = path->nodes[0];
3840 slot = path->slots[0];
3842 nritems = btrfs_header_nritems(leaf);
3843 data_end = leaf_data_end(leaf);
/* caller must have reserved the space; anything else is a bug */
3845 if (btrfs_leaf_free_space(leaf) < total_size) {
3846 btrfs_print_leaf(leaf);
3847 btrfs_crit(fs_info, "not enough freespace need %u have %d",
3848 total_size, btrfs_leaf_free_space(leaf));
3852 btrfs_init_map_token(&token, leaf);
3853 if (slot != nritems) {
3854 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3856 if (old_data < data_end) {
3857 btrfs_print_leaf(leaf);
3859 "item at slot %d with data offset %u beyond data end of leaf %u",
3860 slot, old_data, data_end);
3864 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3866 /* first correct the data pointers */
/* existing items at/after 'slot' move their data down by total_data */
3867 for (i = slot; i < nritems; i++) {
3870 item = btrfs_item_nr(i);
3871 ioff = btrfs_token_item_offset(&token, item);
3872 btrfs_set_token_item_offset(&token, item,
3875 /* shift the items */
3876 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3877 btrfs_item_nr_offset(slot),
3878 (nritems - slot) * sizeof(struct btrfs_item));
3880 /* shift the data */
3881 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3882 data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
3883 data_end, old_data - data_end);
3884 data_end = old_data;
3887 /* setup the item for the new data */
3888 for (i = 0; i < nr; i++) {
3889 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3890 btrfs_set_item_key(leaf, &disk_key, slot + i);
3891 item = btrfs_item_nr(slot + i);
/* offsets grow downwards from data_end as each item is laid out */
3892 data_end -= data_size[i];
3893 btrfs_set_token_item_offset(&token, item, data_end);
3894 btrfs_set_token_item_size(&token, item, data_size[i]);
3897 btrfs_set_header_nritems(leaf, nritems + nr);
3898 btrfs_mark_buffer_dirty(leaf);
/* a negative free-space result indicates leaf corruption */
3900 if (btrfs_leaf_free_space(leaf) < 0) {
3901 btrfs_print_leaf(leaf);
3907 * Given a key and some data, insert items into the tree.
3908 * This does all the path init required, making room in the tree if needed.
/*
 * Insert 'nr' empty items with the given keys/sizes: search to the correct
 * slot (reserving total_size bytes so the leaf is split if needed), then
 * let setup_items_for_insert() lay out headers and offsets. The caller
 * writes the item bodies afterwards.
 */
3910 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3911 struct btrfs_root *root,
3912 struct btrfs_path *path,
3913 const struct btrfs_key *cpu_key, u32 *data_size,
3922 for (i = 0; i < nr; i++)
3923 total_data += data_size[i];
/* data bytes plus one item header per new item */
3925 total_size = total_data + (nr * sizeof(struct btrfs_item));
3926 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3932 slot = path->slots[0];
3935 setup_items_for_insert(root, path, cpu_key, data_size, nr);
3940 * Given a key and some data, insert an item into the tree.
3941 * This does all the path init required, making room in the tree if needed.
/*
 * Convenience wrapper: allocate a path, insert one empty item for cpu_key,
 * copy 'data' into its body, dirty the leaf, and free the path.
 * NOTE(review): extract elides the allocation-failure and error-return
 * lines between the calls shown here.
 */
3943 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3944 const struct btrfs_key *cpu_key, void *data,
3948 struct btrfs_path *path;
3949 struct extent_buffer *leaf;
3952 path = btrfs_alloc_path();
3955 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3957 leaf = path->nodes[0];
/* fill the freshly reserved item body with the caller's data */
3958 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3959 write_extent_buffer(leaf, data, ptr, data_size);
3960 btrfs_mark_buffer_dirty(leaf);
3962 btrfs_free_path(path);
3967 * delete the pointer from a given node.
3969 * the tree should have been previously balanced so the deletion does not
/*
 * Delete the key pointer at (level, slot) from the node in the path,
 * logging the move/removal in the tree-mod log, shrinking nritems, and
 * handling the two special cases: an emptied root node collapses to a
 * leaf, and removing slot 0 requires fixing the parents' low keys.
 */
3972 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
3973 int level, int slot)
3975 struct extent_buffer *parent = path->nodes[level];
3979 nritems = btrfs_header_nritems(parent);
3980 if (slot != nritems - 1) {
/* record the shift so tree-mod-log readers can replay old versions */
3982 ret = btrfs_tree_mod_log_insert_move(parent, slot,
3983 slot + 1, nritems - slot - 1);
3986 memmove_extent_buffer(parent,
3987 btrfs_node_key_ptr_offset(slot),
3988 btrfs_node_key_ptr_offset(slot + 1),
3989 sizeof(struct btrfs_key_ptr) *
3990 (nritems - slot - 1));
/* (else) last slot: just log the key removal */
3992 ret = btrfs_tree_mod_log_insert_key(parent, slot,
3993 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
3998 btrfs_set_header_nritems(parent, nritems);
3999 if (nritems == 0 && parent == root->node) {
4000 BUG_ON(btrfs_header_level(root->node) != 1);
4001 /* just turn the root into a leaf and break */
4002 btrfs_set_header_level(root->node, 0);
4003 } else if (slot == 0) {
4004 struct btrfs_disk_key disk_key;
/* first key changed: propagate the new low key upwards */
4006 btrfs_node_key(parent, &disk_key, 0);
4007 fixup_low_keys(path, &disk_key, level + 1);
4009 btrfs_mark_buffer_dirty(parent);
4013 * a helper function to delete the leaf pointed to by path->slots[1] and
4016 * This deletes the pointer in path->nodes[1] and frees the leaf
4017 * block extent. zero is returned if it all worked out, < 0 otherwise.
4019 * The path must have already been setup for deleting the leaf, including
4020 * all the proper balancing. path->nodes[1] must be locked.
4022 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4023 struct btrfs_root *root,
4024 struct btrfs_path *path,
4025 struct extent_buffer *leaf)
/* The leaf must belong to the current transaction to be freed this way. */
4027 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4028 del_ptr(root, path, 1, path->slots[1]);
4031 * btrfs_free_extent is expensive, we want to make sure we
4032 * aren't holding any locks when we call it
4034 btrfs_unlock_up_safe(path, 0);
4036 root_sub_used(root, leaf->len);
/*
 * Pin an extra reference so the buffer stays valid across the free of
 * its tree block, then mark it stale so it cannot be reused from cache.
 */
4038 atomic_inc(&leaf->refs);
4039 btrfs_free_tree_block(trans, root, leaf, 0, 1);
4040 free_extent_buffer_stale(leaf);
4043 * delete the item at the leaf level in path. If that empties
4044 * the leaf, remove it from the tree
/*
 * Delete @nr contiguous items starting at @slot from the leaf at
 * path->nodes[0].  Compacts the item data area and the item headers,
 * then decides whether the shrunken leaf should be merged away: an empty
 * leaf is removed from the tree, and a leaf less than 1/3 full is pushed
 * into its neighbours (left first, then right) before possibly being
 * deleted.
 *
 * NOTE(review): sampled excerpt — several declarations (last_off, dsize,
 * nritems, i, ret, wret), parts of the memmove destination expression,
 * some closing braces and the final return are on lines not shown here.
 */
4046 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4047 struct btrfs_path *path, int slot, int nr)
4049 struct btrfs_fs_info *fs_info = root->fs_info;
4050 struct extent_buffer *leaf;
4051 struct btrfs_item *item;
4059 leaf = path->nodes[0];
/* Data area grows downward: the last deleted item has the lowest offset. */
4060 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4062 for (i = 0; i < nr; i++)
4063 dsize += btrfs_item_size_nr(leaf, slot + i);
4065 nritems = btrfs_header_nritems(leaf);
/* Only compact if the deleted items are not already at the tail. */
4067 if (slot + nr != nritems) {
4068 int data_end = leaf_data_end(leaf);
4069 struct btrfs_map_token token;
/* Slide the surviving item data up over the freed @dsize bytes. */
4071 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4073 BTRFS_LEAF_DATA_OFFSET + data_end,
4074 last_off - data_end);
/* Rewrite the data offsets of every item that was moved. */
4076 btrfs_init_map_token(&token, leaf);
4077 for (i = slot + nr; i < nritems; i++) {
4080 item = btrfs_item_nr(i);
4081 ioff = btrfs_token_item_offset(&token, item);
4082 btrfs_set_token_item_offset(&token, item, ioff + dsize);
/* Close the gap in the item header array as well. */
4085 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4086 btrfs_item_nr_offset(slot + nr),
4087 sizeof(struct btrfs_item) *
4088 (nritems - slot - nr))
4090 btrfs_set_header_nritems(leaf, nritems - nr);
4093 /* delete the leaf if we've emptied it */
4095 if (leaf == root->node) {
/* An emptied root leaf just stays as an empty level-0 root. */
4096 btrfs_set_header_level(leaf, 0);
4098 btrfs_clean_tree_block(leaf);
4099 btrfs_del_leaf(trans, root, path, leaf);
4102 int used = leaf_space_used(leaf, 0, nritems);
4104 struct btrfs_disk_key disk_key;
/* Deleting slot 0 changed the leaf's first key; fix ancestors. */
4106 btrfs_item_key(leaf, &disk_key, 0);
4107 fixup_low_keys(path, &disk_key, 1);
4110 /* delete the leaf if it is mostly empty */
4111 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4112 /* push_leaf_left fixes the path.
4113 * make sure the path still points to our leaf
4114 * for possible call to del_ptr below
4116 slot = path->slots[1];
/* Hold a ref: the push helpers may drop the leaf from the path. */
4117 atomic_inc(&leaf->refs);
4119 wret = push_leaf_left(trans, root, path, 1, 1,
4121 if (wret < 0 && wret != -ENOSPC)
/* -ENOSPC from a push just means the neighbour was full; not fatal. */
4124 if (path->nodes[0] == leaf &&
4125 btrfs_header_nritems(leaf)) {
4126 wret = push_leaf_right(trans, root, path, 1,
4128 if (wret < 0 && wret != -ENOSPC)
4132 if (btrfs_header_nritems(leaf) == 0) {
/* Everything was pushed out; remove the now-empty leaf. */
4133 path->slots[1] = slot;
4134 btrfs_del_leaf(trans, root, path, leaf);
4135 free_extent_buffer(leaf);
4138 /* if we're still in the path, make sure
4139 * we're dirty. Otherwise, one of the
4140 * push_leaf functions must have already
4141 * dirtied this buffer
4143 if (path->nodes[0] == leaf)
4144 btrfs_mark_buffer_dirty(leaf);
4145 free_extent_buffer(leaf);
4148 btrfs_mark_buffer_dirty(leaf);
4155 * search the tree again to find a leaf with lesser keys
4156 * returns 0 if it found something or 1 if there are no lesser leaves.
4157 * returns < 0 on io errors.
4159 * This may release the path, and so you may lose any locks held at the
/*
 * Step to the previous leaf by computing the largest key strictly smaller
 * than the first key of the current leaf (decrementing offset, then type,
 * then objectid, saturating the lower fields to (u64)-1), releasing the
 * path and re-searching from the root.
 *
 * NOTE(review): sampled excerpt — the decrement statements inside each
 * branch, the "return 1" when no smaller key exists, the error check
 * after btrfs_search_slot(), and the tail that interprets comp_keys()
 * per the comment below are on lines not visible here.
 */
4162 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4164 struct btrfs_key key;
4165 struct btrfs_disk_key found_key;
4168 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4170 if (key.offset > 0) {
4172 } else if (key.type > 0) {
4174 key.offset = (u64)-1;
4175 } else if (key.objectid > 0) {
4178 key.offset = (u64)-1;
4183 btrfs_release_path(path);
4184 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4187 btrfs_item_key(path->nodes[0], &found_key, 0);
4188 ret = comp_keys(&found_key, &key);
4190 * We might have had an item with the previous key in the tree right
4191 * before we released our path. And after we released our path, that
4192 * item might have been pushed to the first slot (0) of the leaf we
4193 * were holding due to a tree balance. Alternatively, an item with the
4194 * previous key can exist as the only element of a leaf (big fat item).
4195 * Therefore account for these 2 cases, so that our callers (like
4196 * btrfs_previous_item) don't miss an existing item with a key matching
4197 * the previous key we computed above.
4205 * A helper function to walk down the tree starting at min_key, and looking
4206 * for nodes or leaves that are have a minimum transaction id.
4207 * This is used by the btree defrag code, and tree logging
4209 * This does not cow, but it does stuff the starting key it finds back
4210 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4211 * key and get a writable path.
4213 * This honors path->lowest_level to prevent descent past a given level
4216 * min_trans indicates the oldest transaction that you are interested
4217 * in walking through. Any nodes or leaves older than min_trans are
4218 * skipped over (without reading them).
4220 * returns zero if something useful was found, < 0 on error and 1 if there
4221 * was nothing in the tree that matched the search criteria.
/*
 * NOTE(review): sampled excerpt — the min_trans parameter in the
 * signature, several locals (level, slot, sret, nritems, ret), the loop
 * framing around the node scan, and the out/found labels are on lines
 * not visible here.
 */
4223 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4224 struct btrfs_path *path,
4227 struct extent_buffer *cur;
4228 struct btrfs_key found_key;
/* keep_locks is forced on for the walk and restored before returning. */
4234 int keep_locks = path->keep_locks;
4236 path->keep_locks = 1;
4238 cur = btrfs_read_lock_root_node(root);
4239 level = btrfs_header_level(cur);
4240 WARN_ON(path->nodes[level]);
4241 path->nodes[level] = cur;
4242 path->locks[level] = BTRFS_READ_LOCK;
/* Whole tree older than min_trans: nothing can match, bail out early. */
4244 if (btrfs_header_generation(cur) < min_trans) {
4249 nritems = btrfs_header_nritems(cur);
4250 level = btrfs_header_level(cur);
4251 sret = btrfs_bin_search(cur, min_key, &slot);
4257 /* at the lowest level, we're done, setup the path and exit */
4258 if (level == path->lowest_level) {
4259 if (slot >= nritems)
4262 path->slots[level] = slot;
4263 btrfs_item_key_to_cpu(cur, &found_key, slot);
/* Inexact match from bin_search: step back to the covering slot. */
4266 if (sret && slot > 0)
4269 * check this node pointer against the min_trans parameters.
4270 * If it is too old, skip to the next one.
4272 while (slot < nritems) {
4275 gen = btrfs_node_ptr_generation(cur, slot);
4276 if (gen < min_trans) {
4284 * we didn't find a candidate key in this node, walk forward
4285 * and find another one
4287 if (slot >= nritems) {
4288 path->slots[level] = slot;
4289 sret = btrfs_find_next_key(root, path, min_key, level,
4292 btrfs_release_path(path);
4298 /* save our key for returning back */
4299 btrfs_node_key_to_cpu(cur, &found_key, slot);
4300 path->slots[level] = slot;
4301 if (level == path->lowest_level) {
/* Descend one level and continue scanning from the child. */
4305 cur = btrfs_read_node_slot(cur, slot);
4311 btrfs_tree_read_lock(cur);
4313 path->locks[level - 1] = BTRFS_READ_LOCK;
4314 path->nodes[level - 1] = cur;
/* Drop locks on ancestors that are no longer needed for this walk. */
4315 unlock_up(path, level, 1, 0, NULL);
4318 path->keep_locks = keep_locks;
4320 btrfs_unlock_up_safe(path, path->lowest_level + 1);
/* Return the key we stopped at so the caller can re-search with cow=1. */
4321 memcpy(min_key, &found_key, sizeof(found_key));
4327 * this is similar to btrfs_next_leaf, but does not try to preserve
4328 * and fixup the path. It looks for and returns the next key in the
4329 * tree based on the current path and the min_trans parameters.
4331 * 0 is returned if another key is found, < 0 if there are any errors
4332 * and 1 is returned if there are no higher keys in the tree
4334 * path->keep_locks should be set to 1 on the search made before
4335 * calling this function.
/*
 * NOTE(review): sampled excerpt — some locals (slot, ret, orig_lowest),
 * the "return 1"/"continue" statements inside the branches, and the
 * closing of the while loop are on lines not visible here.
 */
4337 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4338 struct btrfs_key *key, int level, u64 min_trans)
4341 struct extent_buffer *c;
/* Caller must hold locks (or be skip_locking) for the path to be stable. */
4343 WARN_ON(!path->keep_locks && !path->skip_locking);
4344 while (level < BTRFS_MAX_LEVEL) {
4345 if (!path->nodes[level])
4348 slot = path->slots[level] + 1;
4349 c = path->nodes[level];
/* Exhausted this node: either climb a level or re-search from scratch. */
4351 if (slot >= btrfs_header_nritems(c)) {
4354 struct btrfs_key cur_key;
4355 if (level + 1 >= BTRFS_MAX_LEVEL ||
4356 !path->nodes[level + 1])
4359 if (path->locks[level + 1] || path->skip_locking) {
/*
 * No lock on the parent: remember the last key of this node and redo
 * the search with lowest_level pinned to the current level.
 */
4364 slot = btrfs_header_nritems(c) - 1;
4366 btrfs_item_key_to_cpu(c, &cur_key, slot);
4368 btrfs_node_key_to_cpu(c, &cur_key, slot);
4370 orig_lowest = path->lowest_level;
4371 btrfs_release_path(path);
4372 path->lowest_level = level;
4373 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4375 path->lowest_level = orig_lowest;
4379 c = path->nodes[level];
4380 slot = path->slots[level];
/* Leaves yield item keys; interior nodes yield node pointer keys. */
4387 btrfs_item_key_to_cpu(c, key, slot);
4389 u64 gen = btrfs_node_ptr_generation(c, slot);
/* Skip pointers older than min_trans, mirroring btrfs_search_forward. */
4391 if (gen < min_trans) {
4395 btrfs_node_key_to_cpu(c, key, slot);
4403 * search the tree again to find a leaf with greater keys
4404 * returns 0 if it found something or 1 if there are no greater leaves.
4405 * returns < 0 on io errors.
/* Thin wrapper: time_seq == 0 means "current tree", no tree-mod-log replay. */
4407 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4409 return btrfs_next_old_leaf(root, path, 0);
/*
 * Advance @path to the next leaf, optionally replaying an old tree state
 * when @time_seq is non-zero (btrfs_search_old_slot).  The current path is
 * released and re-searched from the last key of the current leaf, then the
 * walk climbs until it finds a node with a next slot and descends back
 * down to level 0.
 *
 * NOTE(review): sampled excerpt — the time_seq parameter in the signature,
 * several locals (slot, level, i, ret, next_rw_lock), the "goto done/again"
 * statements, the while(level < BTRFS_MAX_LEVEL) loop header for the
 * descent, and the done label/return are on lines not visible here.
 */
4412 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4417 struct extent_buffer *c;
4418 struct extent_buffer *next;
4419 struct btrfs_key key;
4424 nritems = btrfs_header_nritems(path->nodes[0]);
/* Remember the last key of the current leaf; we re-search from it. */
4428 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4432 btrfs_release_path(path);
4434 path->keep_locks = 1;
4437 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4439 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4440 path->keep_locks = 0;
4445 nritems = btrfs_header_nritems(path->nodes[0]);
4447 * by releasing the path above we dropped all our locks. A balance
4448 * could have added more items next to the key that used to be
4449 * at the very end of the block. So, check again here and
4450 * advance the path if there are now more items available.
4452 if (nritems > 0 && path->slots[0] < nritems - 1) {
4459 * So the above check misses one case:
4460 * - after releasing the path above, someone has removed the item that
4461 * used to be at the very end of the block, and balance between leafs
4462 * gets another one with bigger key.offset to replace it.
4464 * This one should be returned as well, or we can get leaf corruption
4465 * later(esp. in __btrfs_drop_extents()).
4467 * And a bit more explanation about this check,
4468 * with ret > 0, the key isn't found, the path points to the slot
4469 * where it should be inserted, so the path->slots[0] item must be the
4472 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
/* Climb until some ancestor still has a slot to the right of us. */
4477 while (level < BTRFS_MAX_LEVEL) {
4478 if (!path->nodes[level]) {
4483 slot = path->slots[level] + 1;
4484 c = path->nodes[level];
4485 if (slot >= btrfs_header_nritems(c)) {
4487 if (level == BTRFS_MAX_LEVEL) {
4496 * Our current level is where we're going to start from, and to
4497 * make sure lockdep doesn't complain we need to drop our locks
4498 * and nodes from 0 to our current level.
4500 for (i = 0; i < level; i++) {
/*
 * NOTE(review): this tests locks[level] (the loop-invariant current
 * level) rather than locks[i] for the node actually being unlocked —
 * looks suspicious; confirm against upstream before changing.
 */
4501 if (path->locks[level]) {
4502 btrfs_tree_read_unlock(path->nodes[i]);
4505 free_extent_buffer(path->nodes[i]);
4506 path->nodes[i] = NULL;
4510 ret = read_block_for_search(root, path, &next, level,
4516 btrfs_release_path(path);
4520 if (!path->skip_locking) {
/* Try-lock first; blocking here can deadlock against push_leaf_left. */
4521 ret = btrfs_try_tree_read_lock(next);
4522 if (!ret && time_seq) {
4524 * If we don't get the lock, we may be racing
4525 * with push_leaf_left, holding that lock while
4526 * itself waiting for the leaf we've currently
4527 * locked. To solve this situation, we give up
4528 * on our lock and cycle.
4530 free_extent_buffer(next);
4531 btrfs_release_path(path);
4536 btrfs_tree_read_lock(next);
4540 path->slots[level] = slot;
/* Descend: install the child and keep walking down to level 0. */
4543 path->nodes[level] = next;
4544 path->slots[level] = 0;
4545 if (!path->skip_locking)
4546 path->locks[level] = BTRFS_READ_LOCK;
4550 ret = read_block_for_search(root, path, &next, level,
4556 btrfs_release_path(path);
4560 if (!path->skip_locking)
4561 btrfs_tree_read_lock(next);
4565 unlock_up(path, 0, 1, 0, NULL);
4571 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4572 * searching until it gets past min_objectid or finds an item of 'type'
4574 * returns 0 if something is found, 1 if nothing was found and < 0 on error
/*
 * NOTE(review): sampled excerpt — the 'type' parameter in the signature,
 * locals (ret, nritems), the loop framing (while (1) / slot decrement /
 * continue), and the return statements for each terminal condition are
 * on lines not visible here.
 */
4576 int btrfs_previous_item(struct btrfs_root *root,
4577 struct btrfs_path *path, u64 min_objectid,
4580 struct btrfs_key found_key;
4581 struct extent_buffer *leaf;
/* At slot 0 we must step to the previous leaf before moving backwards. */
4586 if (path->slots[0] == 0) {
4587 ret = btrfs_prev_leaf(root, path);
4593 leaf = path->nodes[0];
4594 nritems = btrfs_header_nritems(leaf);
4597 if (path->slots[0] == nritems)
4600 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
/* Walked past the search range: nothing of interest remains. */
4601 if (found_key.objectid < min_objectid)
/* Exact type match at or above min_objectid: found it. */
4603 if (found_key.type == type)
4605 if (found_key.objectid == min_objectid &&
4606 found_key.type < type)
4613 * search in extent tree to find a previous Metadata/Data extent item with
4616 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4618 int btrfs_previous_extent_item(struct btrfs_root *root,
4619 struct btrfs_path *path, u64 min_objectid)
4621 struct btrfs_key found_key;
4622 struct extent_buffer *leaf;
4627 if (path->slots[0] == 0) {
4628 ret = btrfs_prev_leaf(root, path);
4634 leaf = path->nodes[0];
4635 nritems = btrfs_header_nritems(leaf);
4638 if (path->slots[0] == nritems)
4641 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4642 if (found_key.objectid < min_objectid)
4644 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4645 found_key.type == BTRFS_METADATA_ITEM_KEY)
4647 if (found_key.objectid == min_objectid &&
4648 found_key.type < BTRFS_EXTENT_ITEM_KEY)