btrfs: fix race between quota disable and quota assign ioctls
[platform/kernel/linux-rpi.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17 #include "tree-mod-log.h"
18
19 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
20                       *root, struct btrfs_path *path, int level);
21 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
22                       const struct btrfs_key *ins_key, struct btrfs_path *path,
23                       int data_size, int extend);
24 static int push_node_left(struct btrfs_trans_handle *trans,
25                           struct extent_buffer *dst,
26                           struct extent_buffer *src, int empty);
27 static int balance_node_right(struct btrfs_trans_handle *trans,
28                               struct extent_buffer *dst_buf,
29                               struct extent_buffer *src_buf);
30 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
31                     int level, int slot);
32
/*
 * Checksum algorithm table, indexed by the on-disk BTRFS_CSUM_TYPE_* value.
 *
 * @size:   digest size in bytes as stored on disk
 * @name:   canonical algorithm name; also used as the crypto driver name
 *          when @driver is empty
 * @driver: explicit crypto API driver name, set only when it differs from
 *          @name (blake2b needs the "-256" digest-size suffix)
 *
 * The csum type is validated at mount time, so the accessors below index
 * this table without bounds checking.
 */
static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};
44
45 int btrfs_super_csum_size(const struct btrfs_super_block *s)
46 {
47         u16 t = btrfs_super_csum_type(s);
48         /*
49          * csum type is validated at mount time
50          */
51         return btrfs_csums[t].size;
52 }
53
54 const char *btrfs_super_csum_name(u16 csum_type)
55 {
56         /* csum type is validated at mount time */
57         return btrfs_csums[csum_type].name;
58 }
59
60 /*
61  * Return driver name if defined, otherwise the name that's also a valid driver
62  * name
63  */
64 const char *btrfs_super_csum_driver(u16 csum_type)
65 {
66         /* csum type is validated at mount time */
67         return btrfs_csums[csum_type].driver[0] ?
68                 btrfs_csums[csum_type].driver :
69                 btrfs_csums[csum_type].name;
70 }
71
/* Return the number of entries in the checksum algorithm table. */
size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
76
/*
 * Allocate a zeroed btrfs_path from the dedicated slab cache.  GFP_NOFS is
 * used because paths are allocated on filesystem-internal paths where
 * recursing into reclaim could deadlock.  Returns NULL on allocation
 * failure; free with btrfs_free_path().
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
81
82 /* this also releases the path */
83 void btrfs_free_path(struct btrfs_path *p)
84 {
85         if (!p)
86                 return;
87         btrfs_release_path(p);
88         kmem_cache_free(btrfs_path_cachep, p);
89 }
90
91 /*
92  * path release drops references on the extent buffers in the path
93  * and it drops any locks held by this path
94  *
95  * It is safe to call this on paths that no locks or extent buffers held.
96  */
97 noinline void btrfs_release_path(struct btrfs_path *p)
98 {
99         int i;
100
101         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
102                 p->slots[i] = 0;
103                 if (!p->nodes[i])
104                         continue;
105                 if (p->locks[i]) {
106                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
107                         p->locks[i] = 0;
108                 }
109                 free_extent_buffer(p->nodes[i]);
110                 p->nodes[i] = NULL;
111         }
112 }
113
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* Got a reference before the buffer could be freed. */
			rcu_read_unlock();
			break;
		}
		/*
		 * refs hit zero under us: the buffer is being freed.  Wait
		 * out the grace period so root->node is re-published, then
		 * retry the dereference.
		 */
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
147
/*
 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 * just get put onto a simple dirty list.  Transaction walks this list to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/*
	 * Unlocked fast path: bail if the root is already dirty or is not
	 * one whose dirtiness we track.  The DIRTY bit is re-checked below
	 * under trans_lock, so a racing reader here is harmless.
	 */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
173
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 *
 * On success the new buffer is locked and dirty; on failure the
 * transaction is aborted.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* Shareable roots may only be modified inside the running transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	/* First key of the root block, used as a hint for the allocator. */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Copy the whole block, then rewrite the header for the new copy. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Reloc trees use full backrefs (last arg 1), others owner refs. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
236
237 /*
238  * check if the tree block can be shared by multiple trees
239  */
240 int btrfs_block_can_be_shared(struct btrfs_root *root,
241                               struct extent_buffer *buf)
242 {
243         /*
244          * Tree blocks not in shareable trees and tree roots are never shared.
245          * If a block was allocated after the last snapshot and the block was
246          * not allocated by tree relocation, we know the block is not shared.
247          */
248         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
249             buf != root->node && buf != root->commit_root &&
250             (btrfs_header_generation(buf) <=
251              btrfs_root_last_snapshot(&root->root_item) ||
252              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
253                 return 1;
254
255         return 0;
256 }
257
/*
 * Fix up extent backrefs when a tree block @buf is being replaced by its
 * COW copy @cow.
 *
 * Sets *last_ref to 1 when @buf held its final reference (so the caller
 * knows the old block can be freed).  Returns 0 on success or a negative
 * error code.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Possibly shared: read the real refcount from the extent tree. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* A COW'ed block must have at least one ref: corruption. */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shareable: exactly one ref, infer the backref style. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Relocation-owned blocks must already carry full backrefs. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* Block stays shared after the COW; old copy keeps existing. */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Convert the old block's children to full backrefs. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			/* Add refs for the new copy's children. */
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* Persist the FULL_BACKREF flag in the extent item. */
			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* Last reference: the old block goes away after this COW. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}
366
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success or a negative error code; any failure after the new
 * block was allocated aborts the transaction.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/*
	 * If the caller passed the buffer itself as *cow_ret, it expects the
	 * original to be unlocked once it's replaced by the new copy.
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block, passed as an allocation hint. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, &disk_key, level,
				     search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* Move/convert extent backrefs from the old block to the copy. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root: publish the new root node. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		/* Extra ref because root->node will also point at cow. */
		atomic_inc(&cow->refs);
		ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Interior block: repoint the parent slot at the copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		btrfs_tree_mod_log_insert_key(parent, parent_slot,
					      BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = btrfs_tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
				      parent_start, last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* The copy is returned locked and dirty. */
	*cow_ret = cow;
	return 0;
}
497
/*
 * Decide whether @buf must be COWed before modification in @trans.
 * Returns 1 when a COW is required, 0 when the block may be modified in
 * place.
 */
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure the metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}
527
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 *
 * On success *cow_ret is either the (unchanged) original buffer or a new
 * locked, dirty copy.  Returns 0 or a negative error code.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* Sanity checks: catch COWs outside the running transaction. */
	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		/* Already COWed in this transaction, modify in place. */
		*cow_ret = buf;
		return 0;
	}

	/* Hint the allocator towards the 1GiB-aligned region of the block. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also We don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
578
579 /*
580  * helper function for defrag to decide if two blocks pointed to by a
581  * node are actually close by
582  */
583 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
584 {
585         if (blocknr < other && other - (blocknr + blocksize) < 32768)
586                 return 1;
587         if (blocknr > other && blocknr - (other + blocksize) < 32768)
588                 return 1;
589         return 0;
590 }
591
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	/*
	 * NOTE(review): this cast relies on btrfs_disk_key and btrfs_key
	 * having identical layout on little-endian machines.
	 */
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * Compare a disk-order key against a CPU-order key in a memcmp fashion,
 * converting the disk key to CPU byte order first.
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif
621
622 /*
623  * same as comp_keys only with two btrfs_key's
624  */
625 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
626 {
627         if (k1->objectid > k2->objectid)
628                 return 1;
629         if (k1->objectid < k2->objectid)
630                 return -1;
631         if (k1->type > k2->type)
632                 return 1;
633         if (k1->type < k2->type)
634                 return -1;
635         if (k1->offset > k2->offset)
636                 return 1;
637         if (k1->offset < k2->offset)
638                 return -1;
639         return 0;
640 }
641
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @start_slot: first slot of @parent to consider
 * @last_ret:   in/out allocation search hint, updated to the last block
 *              that was COWed
 * @progress:   skip children whose key is before this one
 *
 * Returns 0 on success or the first error from COWing a child.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* Nothing to reorder with fewer than two children. */
	if (parent_nritems <= 1)
		return 0;

	for (i = start_slot; i <= end_slot; i++) {
		int close = 1;

		/* Skip children until we pass the progress key. */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/*
		 * A child already close to either neighbor is left alone;
		 * only stragglers get reallocated via COW.
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		/* COW the child near search_start to improve disk locality. */
		btrfs_tree_lock(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
726
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * Slot may point to total number of items if the key is bigger than
 * all of the keys
 *
 * Returns 0 if the key was found at *slot, 1 if not found (*slot is the
 * insertion point), or -EINVAL on a corrupted header.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key, int *slot)
{
	int low = 0;
	int high = btrfs_header_nritems(eb);
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	/*
	 * NOTE(review): with low fixed at 0 this only triggers if nritems
	 * read as negative through the int conversion — i.e. a corrupted
	 * header.
	 */
	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			/* Key lies within one page: read it in place. */
			const unsigned long idx = get_eb_page_index(offset);
			char *kaddr = page_address(eb->pages[idx]);

			oip = get_eb_offset_in_page(eb, offset);
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			/* Key straddles a page boundary: copy it out first. */
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
791
792 /*
793  * simple bin_search frontend that does the right thing for
794  * leaves vs nodes
795  */
796 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
797                      int *slot)
798 {
799         if (btrfs_header_level(eb) == 0)
800                 return generic_bin_search(eb,
801                                           offsetof(struct btrfs_leaf, items),
802                                           sizeof(struct btrfs_item), key, slot);
803         else
804                 return generic_bin_search(eb,
805                                           offsetof(struct btrfs_node, ptrs),
806                                           sizeof(struct btrfs_key_ptr), key, slot);
807 }
808
809 static void root_add_used(struct btrfs_root *root, u32 size)
810 {
811         spin_lock(&root->accounting_lock);
812         btrfs_set_root_used(&root->root_item,
813                             btrfs_root_used(&root->root_item) + size);
814         spin_unlock(&root->accounting_lock);
815 }
816
817 static void root_sub_used(struct btrfs_root *root, u32 size)
818 {
819         spin_lock(&root->accounting_lock);
820         btrfs_set_root_used(&root->root_item,
821                             btrfs_root_used(&root->root_item) - size);
822         spin_unlock(&root->accounting_lock);
823 }
824
/*
 * Given a node and slot number, read the child block it points to.
 *
 * The extent buffer is returned with a reference taken (but unlocked).
 * On failure an ERR_PTR is returned: -ENOENT for an out-of-range slot,
 * -EIO if the block was read but is not uptodate, or the error from
 * read_tree_block() itself.
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	/* Leaves have no children; a level 0 parent here is a logic bug. */
	BUG_ON(level == 0);

	/* Pass the expected first key so the read path can verify the child. */
	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_header_owner(parent),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		/* Drop the ref taken by the read before reporting the error. */
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}
852
/*
 * Node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 *
 * @trans: the running transaction
 * @root:  the tree being balanced
 * @path:  path whose node at @level is to be balanced; on return it may
 *         point at the left sibling if items were shifted there
 * @level: level to balance, must be > 0 (nodes only, never a leaf)
 *
 * Caller must hold a write lock on path->nodes[level], which must have
 * been COWed in the current transaction.  Returns 0 on success or a
 * negative errno (the label is named "enospc" for historical reasons;
 * any error takes that path).
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* Remember the block our slot points at, to sanity check at the end. */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	/*
	 * pslot is only initialized when a parent exists; every later use is
	 * guarded by a non-NULL parent/left/right check.
	 */
	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		/* Record the root swap for tree mod log users before publishing. */
		ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* More than a quarter full: nothing to do. */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* Read errors on siblings are not fatal; just skip that side. */
	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* Track our slot relative to left in case items move there. */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* Right is now empty: unlink and free it. */
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, btrfs_root_id(root), right,
					      0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* Right's first key changed; fix the parent pointer key. */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* Mid was drained completely: unlink and free it. */
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = btrfs_tree_mod_log_insert_key(parent, pslot,
				BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* Our original slot migrated into left; repoint the path. */
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			/* Slot stayed in mid; undo the earlier left offset. */
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		/* The path may now own left's lock; only drop our reference then. */
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
1081
/*
 * Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if items were pushed to a sibling (the path is updated to
 * track the original slot), 1 if nothing could be pushed (the caller
 * should split instead), or a negative errno on failure.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	/* Leaves are handled by the leaf push/split paths, not here. */
	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* pslot is only valid when parent is non-NULL, checked just below. */
	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* The root has no siblings to push to. */
	if (!parent)
		return 1;

	/* A read error on the sibling just means we can't push that way. */
	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* Left is (nearly) full, cannot take anything. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* Items moved left: mid's first key changed in parent. */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* Our slot now lives in left; repoint the path. */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* Slot stayed in mid; undo the left offset. */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* Right is (nearly) full, cannot take anything. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* Items moved right: right's first key changed in parent. */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
					BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* Our slot now lives in right; repoint the path. */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
1221
/*
 * Readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 *
 * @objectid is only used in READA_BACK mode, to stop the scan once keys
 * leave that objectid.  Best-effort: errors are not reported.
 */
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	u64 nread_max;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	/* Only level 1 gets readahead, unless forced by READA_FORWARD_ALWAYS. */
	if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	/*
	 * Since the time between visiting leaves is much shorter than the time
	 * between visiting nodes, limit read ahead of nodes to 1, to avoid too
	 * much IO at once (possibly random).
	 */
	if (path->reada == READA_FORWARD_ALWAYS) {
		if (level > 1)
			nread_max = node->fs_info->nodesize;
		else
			nread_max = SZ_128K;
	} else {
		nread_max = SZ_64K;
	}

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	if (path->reada != READA_FORWARD_ALWAYS) {
		struct extent_buffer *eb;

		/* Target block already cached: nothing to read ahead for. */
		eb = find_extent_buffer(fs_info, search);
		if (eb) {
			free_extent_buffer(eb);
			return;
		}
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	/* Walk neighboring slots and readahead blocks near the target. */
	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD ||
			   path->reada == READA_FORWARD_ALWAYS) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			/* Stop once keys no longer belong to the objectid. */
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		/* Only readahead blocks within 64K of the target on disk. */
		if (path->reada == READA_FORWARD_ALWAYS ||
		    (search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			btrfs_readahead_node_child(node, nr);
			nread += blocksize;
		}
		nscan++;
		/* Bound both the amount of IO issued and slots scanned. */
		if (nread > nread_max || nscan > 32)
			break;
	}
}
1308
1309 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1310 {
1311         struct extent_buffer *parent;
1312         int slot;
1313         int nritems;
1314
1315         parent = path->nodes[level + 1];
1316         if (!parent)
1317                 return;
1318
1319         nritems = btrfs_header_nritems(parent);
1320         slot = path->slots[level + 1];
1321
1322         if (slot > 0)
1323                 btrfs_readahead_node_child(parent, slot - 1);
1324         if (slot + 1 < nritems)
1325                 btrfs_readahead_node_child(parent, slot + 1);
1326 }
1327
1328
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 *
 * If @write_lock_level is non-NULL, it is lowered to track the highest level
 * at which a write lock is still required after unlocking.
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* Levels <= skip_level stay locked; raised as skip conditions hit. */
	int skip_level = level;
	/* Once set, stop raising skip_level for higher levels. */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* Slot 0: a key change here can propagate to the parent. */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: keep the lock when pointing at the last slot. */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* We dropped a lock; lower the write-lock watermark. */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
1384
/*
 * Helper function for btrfs_search_slot().  Try to find the child block
 * that @slot of @*eb_ret points to in the extent buffer cache and verify
 * it.  If we find a usable block, *eb_ret is set to it (with a reference
 * taken) and zero is returned; the path is unchanged.
 *
 * If the block is not cached, trigger readahead, read it from disk,
 * release the path and return -EAGAIN so the caller repeats the search.
 * Returns -EIO if the block cannot be read, or -EUCLEAN if a cached
 * block fails first-key verification.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		if (p->reada == READA_FORWARD_ALWAYS)
			reada_for_search(fs_info, p, level, slot, key->objectid);

		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
			      gen, parent_level - 1, &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}
1475
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	/* Inserting and the node is nearly full: split it. */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {

		/* Splitting needs the parent write-locked; retry if not. */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = split_node(trans, root, p, level);

		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		/* Deleting and the node is under half full: rebalance. */

		/* Balancing also needs the parent write-locked. */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			return -EAGAIN;
		}

		reada_for_balance(p, level);
		ret = balance_level(trans, root, p, level);
		if (ret)
			return ret;

		/* Balancing may have emptied this level entirely. */
		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			return -EAGAIN;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return ret;
}
1530
1531 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1532                 u64 iobjectid, u64 ioff, u8 key_type,
1533                 struct btrfs_key *found_key)
1534 {
1535         int ret;
1536         struct btrfs_key key;
1537         struct extent_buffer *eb;
1538
1539         ASSERT(path);
1540         ASSERT(found_key);
1541
1542         key.type = key_type;
1543         key.objectid = iobjectid;
1544         key.offset = ioff;
1545
1546         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1547         if (ret < 0)
1548                 return ret;
1549
1550         eb = path->nodes[0];
1551         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1552                 ret = btrfs_next_leaf(fs_root, path);
1553                 if (ret)
1554                         return ret;
1555                 eb = path->nodes[0];
1556         }
1557
1558         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1559         if (found_key->type != key.type ||
1560                         found_key->objectid != key.objectid)
1561                 return 1;
1562
1563         return 0;
1564 }
1565
/*
 * Get and lock the root node of @root to begin a search.
 *
 * For commit root searches the commit root is used with an extra reference
 * taken and no locking (all such callers must have set skip_locking).
 * Otherwise the current root node is returned: read locked when the search
 * does not need write locks all the way up to the root, write locked
 * otherwise (or when a read-locked root turns out to be at or below
 * @write_lock_level and the lock must be traded for a write lock).
 *
 * On success the node (and its lock, unless skip_locking is set) is
 * recorded in @p and returned; callers are responsible for dropping the
 * reference.  Returns ERR_PTR(-EIO) when the root node is not up to date.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		b = root->commit_root;
		atomic_inc(&b->refs);
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
1641
1642 /*
1643  * Replace the extent buffer at the lowest level of the path with a cloned
1644  * version. The purpose is to be able to use it safely, after releasing the
1645  * commit root semaphore, even if relocation is happening in parallel, the
1646  * transaction used for relocation is committed and the extent buffer is
1647  * reallocated in the next transaction.
1648  *
1649  * This is used in a context where the caller does not prevent transaction
1650  * commits from happening, either by holding a transaction handle or holding
1651  * some lock, while it's doing searches through a commit root.
1652  * At the moment it's only used for send operations.
1653  */
1654 static int finish_need_commit_sem_search(struct btrfs_path *path)
1655 {
1656         const int i = path->lowest_level;
1657         const int slot = path->slots[i];
1658         struct extent_buffer *lowest = path->nodes[i];
1659         struct extent_buffer *clone;
1660
1661         ASSERT(path->need_commit_sem);
1662
1663         if (!lowest)
1664                 return 0;
1665
1666         lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1667
1668         clone = btrfs_clone_extent_buffer(lowest);
1669         if (!clone)
1670                 return -ENOMEM;
1671
1672         btrfs_release_path(path);
1673         path->nodes[i] = clone;
1674         path->slots[i] = slot;
1675
1676         return 0;
1677 }
1678
1679 /*
1680  * btrfs_search_slot - look for a key in a tree and perform necessary
1681  * modifications to preserve tree invariants.
1682  *
1683  * @trans:      Handle of transaction, used when modifying the tree
1684  * @p:          Holds all btree nodes along the search path
1685  * @root:       The root node of the tree
1686  * @key:        The key we are looking for
1687  * @ins_len:    Indicates purpose of search:
1688  *              >0  for inserts it's size of item inserted (*)
1689  *              <0  for deletions
1690  *               0  for plain searches, not modifying the tree
1691  *
1692  *              (*) If size of item inserted doesn't include
1693  *              sizeof(struct btrfs_item), then p->search_for_extension must
1694  *              be set.
1695  * @cow:        boolean should CoW operations be performed. Must always be 1
1696  *              when modifying the tree.
1697  *
1698  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1699  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1700  *
1701  * If @key is found, 0 is returned and you can find the item in the leaf level
1702  * of the path (level 0)
1703  *
1704  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1705  * points to the slot where it should be inserted
1706  *
1707  * If an error is encountered while searching the tree a negative error number
1708  * is returned
1709  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	/* Modifying the tree (ins_len != 0) requires COW. */
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	/* keep_locks / lowest_level searches write lock the whole path. */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

	if (p->need_commit_sem) {
		ASSERT(p->search_commit_root);
		down_read(&fs_info->commit_root_sem);
	}

	/*
	 * Restart point: taken whenever the path must be released to retake
	 * locks at a higher write_lock_level or after -EAGAIN from helpers.
	 */
again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		/* Set when bin search pointed one past the covering key ptr. */
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		/*
		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
		 * we can safely assume the target key will always be in slot 0
		 * on lower levels due to the invariants BTRFS' btree provides,
		 * namely that a btrfs_key_ptr entry always points to the
		 * lowest key in the child node, thus we can skip searching
		 * lower levels
		 */
		if (prev_cmp == 0) {
			slot = 0;
			ret = 0;
		} else {
			ret = btrfs_bin_search(b, key, &slot);
			prev_cmp = ret;
			if (ret < 0)
				goto done;
		}

		if (level == 0) {
			p->slots[level] = slot;
			/*
			 * Item key already exists. In this case, if we are
			 * allowed to insert the item (for example, in dir_item
			 * case, item key collision is allowed), it will be
			 * merged with the original item. Only the item size
			 * grows, no new btrfs item will be added. If
			 * search_for_extension is not set, ins_len already
			 * accounts the size btrfs_item, deduct it here so leaf
			 * space check will be correct.
			 */
			if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
				ASSERT(ins_len >= sizeof(struct btrfs_item));
				ins_len -= sizeof(struct btrfs_item);
			}
			/* Not enough room in the leaf for the insert, split it. */
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		/* No exact match: the key ptr covering @key is one slot back. */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);

			btrfs_maybe_reset_lockdep_class(root, b);

			/* Take the lock type the current write_lock_level demands. */
			if (level <= write_lock_level) {
				btrfs_tree_lock(b);
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				btrfs_tree_read_lock(b);
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);

	if (p->need_commit_sem) {
		int ret2;

		/*
		 * Clone the lowest level buffer before dropping the commit
		 * root semaphore so the path stays usable afterwards.
		 */
		ret2 = finish_need_commit_sem_search(p);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
1971
1972 /*
1973  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
1974  * current state of the tree together with the operations recorded in the tree
1975  * modification log to search for the key in a previous version of this tree, as
1976  * denoted by the time_seq parameter.
1977  *
1978  * Naturally, there is no support for insert, delete or cow operations.
1979  *
1980  * The resulting path and return value will be set up as if we called
1981  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1982  */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* Commit root searches need no rewinding, use the regular search. */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = btrfs_get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		/* Set when bin search pointed one past the covering key ptr. */
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		/* No exact match: the key ptr covering @key is one slot back. */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		btrfs_tree_read_lock(b);
		/*
		 * Rewind the block to its state at time_seq; may allocate and
		 * return a different buffer, hence the -ENOMEM on failure.
		 */
		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
2074
2075 /*
2076  * helper to use instead of search slot if no exact match is needed but
2077  * instead the next or previous item should be returned.
2078  * When find_higher is true, the next higher item is returned, the next lower
2079  * otherwise.
2080  * When return_any and find_higher are both true, and no higher item is found,
2081  * return the next lower instead.
2082  * When return_any is true and find_higher is false, and no lower item is found,
2083  * return the next higher instead.
2084  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2085  * < 0 on error
2086  */
int btrfs_search_slot_for_read(struct btrfs_root *root,
                               const struct btrfs_key *key,
                               struct btrfs_path *p, int find_higher,
                               int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	/* Exact match (0) or error (< 0), nothing more to do. */
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			/* Slot is past the last item, the next item is in the next leaf. */
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/* Empty previous leaf position, step back to the last item. */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			/* The next lower item is right before the insert position. */
			--p->slots[0];
		}
	}
	return 0;
}
2151
2152 /*
2153  * Execute search and call btrfs_previous_item to traverse backwards if the item
2154  * was not found.
2155  *
2156  * Return 0 if found, 1 if not found and < 0 if error.
2157  */
2158 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2159                            struct btrfs_path *path)
2160 {
2161         int ret;
2162
2163         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2164         if (ret > 0)
2165                 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2166
2167         if (ret == 0)
2168                 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2169
2170         return ret;
2171 }
2172
2173 /*
2174  * adjust the pointers going up the tree, starting at level
2175  * making sure the right key of each node is points to 'key'.
2176  * This is used after shifting pointers to the left, so it stops
2177  * fixing up pointers when a given leaf/node is not in slot 0 of the
2178  * higher levels
2179  *
2180  */
2181 static void fixup_low_keys(struct btrfs_path *path,
2182                            struct btrfs_disk_key *key, int level)
2183 {
2184         int i;
2185         struct extent_buffer *t;
2186         int ret;
2187
2188         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2189                 int tslot = path->slots[i];
2190
2191                 if (!path->nodes[i])
2192                         break;
2193                 t = path->nodes[i];
2194                 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2195                                 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2196                 BUG_ON(ret < 0);
2197                 btrfs_set_node_key(t, key, tslot);
2198                 btrfs_mark_buffer_dirty(path->nodes[i]);
2199                 if (tslot != 0)
2200                         break;
2201         }
2202 }
2203
2204 /*
2205  * update item key.
2206  *
2207  * This function isn't completely safe. It's the caller's responsibility
2208  * that the new key won't break the order
2209  */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	/* The new key must still sort strictly after the previous item. */
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	/* ... and strictly before the next item, if there is one. */
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* A slot 0 key is duplicated in the parents, fix them up too. */
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
2255
2256 /*
2257  * Check key order of two sibling extent buffers.
2258  *
2259  * Return true if something is wrong.
2260  * Return false if everything is fine.
2261  *
2262  * Tree-checker only works inside one tree block, thus the following
2263  * corruption can not be detected by tree-checker:
2264  *
2265  * Leaf @left                   | Leaf @right
2266  * --------------------------------------------------------------
2267  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2268  *
2269  * Key f6 in leaf @left itself is valid, but not valid when the next
2270  * key in leaf @right is 7.
2271  * This can only be checked at tree block merge time.
2272  * And since tree checker has ensured all key order in each tree block
2273  * is correct, we only need to bother the last key of @left and the first
2274  * key of @right.
2275  */
2276 static bool check_sibling_keys(struct extent_buffer *left,
2277                                struct extent_buffer *right)
2278 {
2279         struct btrfs_key left_last;
2280         struct btrfs_key right_first;
2281         int level = btrfs_header_level(left);
2282         int nr_left = btrfs_header_nritems(left);
2283         int nr_right = btrfs_header_nritems(right);
2284
2285         /* No key to check in one of the tree blocks */
2286         if (!nr_left || !nr_right)
2287                 return false;
2288
2289         if (level) {
2290                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2291                 btrfs_node_key_to_cpu(right, &right_first, 0);
2292         } else {
2293                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2294                 btrfs_item_key_to_cpu(right, &right_first, 0);
2295         }
2296
2297         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2298                 btrfs_crit(left->fs_info,
2299 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2300                            left_last.objectid, left_last.type,
2301                            left_last.offset, right_first.objectid,
2302                            right_first.type, right_first.offset);
2303                 return true;
2304         }
2305         return false;
2306 }
2307
2308 /*
2309  * try to push data from one node into the next node left in the
2310  * tree.
2311  *
2312  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2313  * error, and > 0 if there was no room in the left hand block.
2314  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Number of free pointer slots in the destination (left) node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both nodes must have been COWed in the running transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* Source is already nearly empty and we won't drain it: skip. */
	if (!empty && src_nritems <= 8)
		return 1;

	/* No room at all in the destination. */
	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		/* Key order corruption between siblings: poison the transaction. */
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Log the pointer copy in the tree mod log before touching the ebs. */
	ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Append the first push_items key pointers of src to the end of dst. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
		 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	/* Update item counts and mark both buffers for writeback. */
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2385
2386 /*
2387  * try to push data from one node into the next node right in the
2388  * tree.
2389  *
2390  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2391  * error, and > 0 if there was no room in the right hand block.
2392  *
 * this will only push up to 1/2 the contents of the left node over
2394  */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* Both nodes must have been COWed in the running transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Number of free pointer slots in the destination (right) node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* Source too small to be worth rebalancing. */
	if (src_nritems < 4)
		return 1;

	/* Cap at just over half of the source's items. */
	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* dst is the right eb, src is the middle eb */
	if (check_sibling_keys(src, dst)) {
		/* Key order corruption between siblings: poison the transaction. */
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/*
	 * Log the shift of dst's existing pointers before moving them.
	 * NOTE(review): failure here is treated as fatal via BUG_ON rather
	 * than a transaction abort like the other error paths in this
	 * function.
	 */
	ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	/* Shift dst's pointers right to open a gap of push_items at slot 0. */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* Log and copy the tail of src into the gap at the start of dst. */
	ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
					 push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	/* Update item counts and mark both buffers for writeback. */
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
2458
2459 /*
2460  * helper function to insert a new root level in the tree.
2461  * A new node is allocated, and a single item is inserted to
2462  * point to the existing root
2463  *
2464  * returns zero on success or < 0 on failure.
2465  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* The new level must be empty and the old root right below it. */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* The new root's single key is the first key of the old root. */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0,
				   BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* Point the new root's only slot at the old root block. */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* Log the root switch, then publish the new root via RCU. */
	old = root->node;
	ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* Extra ref for the path's reference to the new root. */
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
2520
2521 /*
2522  * worker function to insert a single pointer in a node.
2523  * the node should have enough room for the pointer already
2524  *
2525  * slot and level indicate where you want the key to go, and
2526  * blocknr is the block the key points to.
2527  */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	/* Caller must hold the write lock on the target node. */
	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* Caller guarantees there is room for one more pointer. */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		/* Log and perform the shift that opens a gap at 'slot'. */
		if (level) {
			ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
					slot, nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		/* Record the key addition in the tree mod log. */
		ret = btrfs_tree_mod_log_insert_key(lower, slot,
					    BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
		BUG_ON(ret < 0);
	}
	/* Fill in the new pointer: key, block number and generation. */
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
2566
2567 /*
2568  * split the node at the specified level in path in two.
2569  * The path is corrected to point to the appropriate node after the split
2570  *
2571  * Before splitting this tries to make some room in the node by pushing
2572  * left and right, if either one works, it returns right away.
2573  *
2574  * returns 0 on success and < 0 on failure
2575  */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with btrfs_tree_mod_log_eb_copy(). We're
		 * holding a tree lock on the buffer, which is why we cannot
		 * race with other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* Try pushing into siblings first; a split may become unneeded. */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* Split at the midpoint; the new block holds the upper half. */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				       &disk_key, level, c->start, 0,
				       BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* Log and copy pointers [mid, c_nritems) into the new block. */
	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new block into the parent, right after 'c'. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* Re-point the path at whichever half now contains its slot. */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return 0;
}
2657
2658 /*
2659  * how many bytes are required to store the items in a leaf.  start
2660  * and nr indicate which items in the leaf to check.  This totals up the
2661  * space used both by the item structs and the item data
2662  */
2663 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2664 {
2665         struct btrfs_item *start_item;
2666         struct btrfs_item *end_item;
2667         int data_len;
2668         int nritems = btrfs_header_nritems(l);
2669         int end = min(nritems, start + nr) - 1;
2670
2671         if (!nr)
2672                 return 0;
2673         start_item = btrfs_item_nr(start);
2674         end_item = btrfs_item_nr(end);
2675         data_len = btrfs_item_offset(l, start_item) +
2676                    btrfs_item_size(l, start_item);
2677         data_len = data_len - btrfs_item_offset(l, end_item);
2678         data_len += sizeof(struct btrfs_item) * nr;
2679         WARN_ON(data_len < 0);
2680         return data_len;
2681 }
2682
2683 /*
2684  * The space between the end of the leaf items and
2685  * the start of the leaf data.  IOW, how much room
2686  * the leaf has left for both items and data
2687  */
2688 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2689 {
2690         struct btrfs_fs_info *fs_info = leaf->fs_info;
2691         int nritems = btrfs_header_nritems(leaf);
2692         int ret;
2693
2694         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2695         if (ret < 0) {
2696                 btrfs_crit(fs_info,
2697                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2698                            ret,
2699                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2700                            leaf_space_used(leaf, 0, nritems), nritems);
2701         }
2702         return ret;
2703 }
2704
2705 /*
2706  * min slot controls the lowest index we're willing to push to the
2707  * right.  We'll push up to and including min_slot, but no lower
2708  */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	/* Lowest slot we are allowed to push; 0 only when emptying the leaf. */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* Insertion past the end: reserve space for the new item too. */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/*
	 * Walk items from the end of the left leaf towards 'nr', counting how
	 * many fit in the right leaf's free space.
	 */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* Never push the slot the path points at away. */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* Bytes of item data being moved from the left leaf. */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	/* Shift the right leaf's item table to open push_items slots at 0. */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* Recompute every item's data offset from the end of the leaf down. */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(&token, item);
		btrfs_set_token_item_offset(&token, item, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	/* The right leaf's first key changed: fix it up in the parent. */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
2846
2847 /*
2848  * push some data in the path leaf to the right, trying to free up at
2849  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2850  *
2851  * returns 1 if the push failed because the other node didn't have enough
2852  * room, 0 if everything worked out and < 0 if there were major errors.
2853  *
2854  * this will push starting from min_slot to the end of the leaf.  It won't
2855  * push any slot lower than min_slot
2856  */
2857 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2858                            *root, struct btrfs_path *path,
2859                            int min_data_size, int data_size,
2860                            int empty, u32 min_slot)
2861 {
2862         struct extent_buffer *left = path->nodes[0];
2863         struct extent_buffer *right;
2864         struct extent_buffer *upper;
2865         int slot;
2866         int free_space;
2867         u32 left_nritems;
2868         int ret;
2869
2870         if (!path->nodes[1])
2871                 return 1;
2872
2873         slot = path->slots[1];
2874         upper = path->nodes[1];
2875         if (slot >= btrfs_header_nritems(upper) - 1)
2876                 return 1;
2877
2878         btrfs_assert_tree_locked(path->nodes[1]);
2879
2880         right = btrfs_read_node_slot(upper, slot + 1);
2881         /*
2882          * slot + 1 is not valid or we fail to read the right node,
2883          * no big deal, just return.
2884          */
2885         if (IS_ERR(right))
2886                 return 1;
2887
2888         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2889
2890         free_space = btrfs_leaf_free_space(right);
2891         if (free_space < data_size)
2892                 goto out_unlock;
2893
2894         /* cow and double check */
2895         ret = btrfs_cow_block(trans, root, right, upper,
2896                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
2897         if (ret)
2898                 goto out_unlock;
2899
2900         free_space = btrfs_leaf_free_space(right);
2901         if (free_space < data_size)
2902                 goto out_unlock;
2903
2904         left_nritems = btrfs_header_nritems(left);
2905         if (left_nritems == 0)
2906                 goto out_unlock;
2907
2908         if (check_sibling_keys(left, right)) {
2909                 ret = -EUCLEAN;
2910                 btrfs_tree_unlock(right);
2911                 free_extent_buffer(right);
2912                 return ret;
2913         }
2914         if (path->slots[0] == left_nritems && !empty) {
2915                 /* Key greater than all keys in the leaf, right neighbor has
2916                  * enough room for it and we're not emptying our leaf to delete
2917                  * it, therefore use right neighbor to insert the new item and
2918                  * no need to touch/dirty our left leaf. */
2919                 btrfs_tree_unlock(left);
2920                 free_extent_buffer(left);
2921                 path->nodes[0] = right;
2922                 path->slots[0] = 0;
2923                 path->slots[1]++;
2924                 return 0;
2925         }
2926
2927         return __push_leaf_right(path, min_data_size, empty,
2928                                 right, free_space, left_nritems, min_slot);
2929 out_unlock:
2930         btrfs_tree_unlock(right);
2931         free_extent_buffer(right);
2932         return 1;
2933 }
2934
2935 /*
2936  * push some data in the path leaf to the left, trying to free up at
2937  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2938  *
2939  * max_slot can put a limit on how far into the leaf we'll push items.  The
2940  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2941  * items
2942  */
2943 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
2944                                      int empty, struct extent_buffer *left,
2945                                      int free_space, u32 right_nritems,
2946                                      u32 max_slot)
2947 {
2948         struct btrfs_fs_info *fs_info = left->fs_info;
2949         struct btrfs_disk_key disk_key;
2950         struct extent_buffer *right = path->nodes[0];
2951         int i;
2952         int push_space = 0;
2953         int push_items = 0;
2954         struct btrfs_item *item;
2955         u32 old_left_nritems;
2956         u32 nr;
2957         int ret = 0;
2958         u32 this_item_size;
2959         u32 old_left_item_size;
2960         struct btrfs_map_token token;
2961
2962         if (empty)
2963                 nr = min(right_nritems, max_slot);
2964         else
2965                 nr = min(right_nritems - 1, max_slot);
2966
2967         for (i = 0; i < nr; i++) {
2968                 item = btrfs_item_nr(i);
2969
2970                 if (!empty && push_items > 0) {
2971                         if (path->slots[0] < i)
2972                                 break;
2973                         if (path->slots[0] == i) {
2974                                 int space = btrfs_leaf_free_space(right);
2975
2976                                 if (space + push_space * 2 > free_space)
2977                                         break;
2978                         }
2979                 }
2980
2981                 if (path->slots[0] == i)
2982                         push_space += data_size;
2983
2984                 this_item_size = btrfs_item_size(right, item);
2985                 if (this_item_size + sizeof(*item) + push_space > free_space)
2986                         break;
2987
2988                 push_items++;
2989                 push_space += this_item_size + sizeof(*item);
2990         }
2991
2992         if (push_items == 0) {
2993                 ret = 1;
2994                 goto out;
2995         }
2996         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
2997
2998         /* push data from right to left */
2999         copy_extent_buffer(left, right,
3000                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3001                            btrfs_item_nr_offset(0),
3002                            push_items * sizeof(struct btrfs_item));
3003
3004         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3005                      btrfs_item_offset_nr(right, push_items - 1);
3006
3007         copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3008                      leaf_data_end(left) - push_space,
3009                      BTRFS_LEAF_DATA_OFFSET +
3010                      btrfs_item_offset_nr(right, push_items - 1),
3011                      push_space);
3012         old_left_nritems = btrfs_header_nritems(left);
3013         BUG_ON(old_left_nritems <= 0);
3014
3015         btrfs_init_map_token(&token, left);
3016         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3017         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3018                 u32 ioff;
3019
3020                 item = btrfs_item_nr(i);
3021
3022                 ioff = btrfs_token_item_offset(&token, item);
3023                 btrfs_set_token_item_offset(&token, item,
3024                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3025         }
3026         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3027
3028         /* fixup right node */
3029         if (push_items > right_nritems)
3030                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3031                        right_nritems);
3032
3033         if (push_items < right_nritems) {
3034                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3035                                                   leaf_data_end(right);
3036                 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3037                                       BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3038                                       BTRFS_LEAF_DATA_OFFSET +
3039                                       leaf_data_end(right), push_space);
3040
3041                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3042                               btrfs_item_nr_offset(push_items),
3043                              (btrfs_header_nritems(right) - push_items) *
3044                              sizeof(struct btrfs_item));
3045         }
3046
3047         btrfs_init_map_token(&token, right);
3048         right_nritems -= push_items;
3049         btrfs_set_header_nritems(right, right_nritems);
3050         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3051         for (i = 0; i < right_nritems; i++) {
3052                 item = btrfs_item_nr(i);
3053
3054                 push_space = push_space - btrfs_token_item_size(&token, item);
3055                 btrfs_set_token_item_offset(&token, item, push_space);
3056         }
3057
3058         btrfs_mark_buffer_dirty(left);
3059         if (right_nritems)
3060                 btrfs_mark_buffer_dirty(right);
3061         else
3062                 btrfs_clean_tree_block(right);
3063
3064         btrfs_item_key(right, &disk_key, 0);
3065         fixup_low_keys(path, &disk_key, 1);
3066
3067         /* then fixup the leaf pointer in the path */
3068         if (path->slots[0] < push_items) {
3069                 path->slots[0] += old_left_nritems;
3070                 btrfs_tree_unlock(path->nodes[0]);
3071                 free_extent_buffer(path->nodes[0]);
3072                 path->nodes[0] = left;
3073                 path->slots[1] -= 1;
3074         } else {
3075                 btrfs_tree_unlock(left);
3076                 free_extent_buffer(left);
3077                 path->slots[0] -= push_items;
3078         }
3079         BUG_ON(path->slots[0] < 0);
3080         return ret;
3081 out:
3082         btrfs_tree_unlock(left);
3083         free_extent_buffer(left);
3084         return ret;
3085 }
3086
3087 /*
3088  * push some data in the path leaf to the left, trying to free up at
3089  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3090  *
3091  * max_slot can put a limit on how far into the leaf we'll push items.  The
3092  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3093  * items
3094  */
3095 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3096                           *root, struct btrfs_path *path, int min_data_size,
3097                           int data_size, int empty, u32 max_slot)
3098 {
3099         struct extent_buffer *right = path->nodes[0];
3100         struct extent_buffer *left;
3101         int slot;
3102         int free_space;
3103         u32 right_nritems;
3104         int ret = 0;
3105
3106         slot = path->slots[1];
3107         if (slot == 0)
3108                 return 1;
3109         if (!path->nodes[1])
3110                 return 1;
3111
3112         right_nritems = btrfs_header_nritems(right);
3113         if (right_nritems == 0)
3114                 return 1;
3115
3116         btrfs_assert_tree_locked(path->nodes[1]);
3117
3118         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3119         /*
3120          * slot - 1 is not valid or we fail to read the left node,
3121          * no big deal, just return.
3122          */
3123         if (IS_ERR(left))
3124                 return 1;
3125
3126         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3127
3128         free_space = btrfs_leaf_free_space(left);
3129         if (free_space < data_size) {
3130                 ret = 1;
3131                 goto out;
3132         }
3133
3134         /* cow and double check */
3135         ret = btrfs_cow_block(trans, root, left,
3136                               path->nodes[1], slot - 1, &left,
3137                               BTRFS_NESTING_LEFT_COW);
3138         if (ret) {
3139                 /* we hit -ENOSPC, but it isn't fatal here */
3140                 if (ret == -ENOSPC)
3141                         ret = 1;
3142                 goto out;
3143         }
3144
3145         free_space = btrfs_leaf_free_space(left);
3146         if (free_space < data_size) {
3147                 ret = 1;
3148                 goto out;
3149         }
3150
3151         if (check_sibling_keys(left, right)) {
3152                 ret = -EUCLEAN;
3153                 goto out;
3154         }
3155         return __push_leaf_left(path, min_data_size,
3156                                empty, left, free_space, right_nritems,
3157                                max_slot);
3158 out:
3159         btrfs_tree_unlock(left);
3160         free_extent_buffer(left);
3161         return ret;
3162 }
3163
3164 /*
3165  * split the path's leaf in two, making sure there is at least data_size
3166  * available for the resulting leaf level of the path.
3167  */
3168 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3169                                     struct btrfs_path *path,
3170                                     struct extent_buffer *l,
3171                                     struct extent_buffer *right,
3172                                     int slot, int mid, int nritems)
3173 {
3174         struct btrfs_fs_info *fs_info = trans->fs_info;
3175         int data_copy_size;
3176         int rt_data_off;
3177         int i;
3178         struct btrfs_disk_key disk_key;
3179         struct btrfs_map_token token;
3180
3181         nritems = nritems - mid;
3182         btrfs_set_header_nritems(right, nritems);
3183         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
3184
3185         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3186                            btrfs_item_nr_offset(mid),
3187                            nritems * sizeof(struct btrfs_item));
3188
3189         copy_extent_buffer(right, l,
3190                      BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3191                      data_copy_size, BTRFS_LEAF_DATA_OFFSET +
3192                      leaf_data_end(l), data_copy_size);
3193
3194         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
3195
3196         btrfs_init_map_token(&token, right);
3197         for (i = 0; i < nritems; i++) {
3198                 struct btrfs_item *item = btrfs_item_nr(i);
3199                 u32 ioff;
3200
3201                 ioff = btrfs_token_item_offset(&token, item);
3202                 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
3203         }
3204
3205         btrfs_set_header_nritems(l, mid);
3206         btrfs_item_key(right, &disk_key, 0);
3207         insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3208
3209         btrfs_mark_buffer_dirty(right);
3210         btrfs_mark_buffer_dirty(l);
3211         BUG_ON(path->slots[0] != slot);
3212
3213         if (mid <= slot) {
3214                 btrfs_tree_unlock(path->nodes[0]);
3215                 free_extent_buffer(path->nodes[0]);
3216                 path->nodes[0] = right;
3217                 path->slots[0] -= mid;
3218                 path->slots[1] += 1;
3219         } else {
3220                 btrfs_tree_unlock(right);
3221                 free_extent_buffer(right);
3222         }
3223
3224         BUG_ON(path->slots[0] < 0);
3225 }
3226
3227 /*
3228  * double splits happen when we need to insert a big item in the middle
3229  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3230  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3231  *          A                 B                 C
3232  *
3233  * We avoid this by trying to push the items on either side of our target
3234  * into the adjacent leaves.  If all goes well we can avoid the double split
3235  * completely.
3236  */
3237 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3238                                           struct btrfs_root *root,
3239                                           struct btrfs_path *path,
3240                                           int data_size)
3241 {
3242         int ret;
3243         int progress = 0;
3244         int slot;
3245         u32 nritems;
3246         int space_needed = data_size;
3247
3248         slot = path->slots[0];
3249         if (slot < btrfs_header_nritems(path->nodes[0]))
3250                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3251
3252         /*
3253          * try to push all the items after our slot into the
3254          * right leaf
3255          */
3256         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3257         if (ret < 0)
3258                 return ret;
3259
3260         if (ret == 0)
3261                 progress++;
3262
3263         nritems = btrfs_header_nritems(path->nodes[0]);
3264         /*
3265          * our goal is to get our slot at the start or end of a leaf.  If
3266          * we've done so we're done
3267          */
3268         if (path->slots[0] == 0 || path->slots[0] == nritems)
3269                 return 0;
3270
3271         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3272                 return 0;
3273
3274         /* try to push all the items before our slot into the next leaf */
3275         slot = path->slots[0];
3276         space_needed = data_size;
3277         if (slot > 0)
3278                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3279         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3280         if (ret < 0)
3281                 return ret;
3282
3283         if (ret == 0)
3284                 progress++;
3285
3286         if (progress)
3287                 return 0;
3288         return 1;
3289 }
3290
3291 /*
3292  * split the path's leaf in two, making sure there is at least data_size
3293  * available for the resulting leaf level of the path.
3294  *
3295  * returns 0 if all went well and < 0 on failure.
3296  */
3297 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3298                                struct btrfs_root *root,
3299                                const struct btrfs_key *ins_key,
3300                                struct btrfs_path *path, int data_size,
3301                                int extend)
3302 {
3303         struct btrfs_disk_key disk_key;
3304         struct extent_buffer *l;
3305         u32 nritems;
3306         int mid;
3307         int slot;
3308         struct extent_buffer *right;
3309         struct btrfs_fs_info *fs_info = root->fs_info;
3310         int ret = 0;
3311         int wret;
3312         int split;
3313         int num_doubles = 0;
3314         int tried_avoid_double = 0;
3315
3316         l = path->nodes[0];
3317         slot = path->slots[0];
3318         if (extend && data_size + btrfs_item_size_nr(l, slot) +
3319             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3320                 return -EOVERFLOW;
3321
3322         /* first try to make some room by pushing left and right */
3323         if (data_size && path->nodes[1]) {
3324                 int space_needed = data_size;
3325
3326                 if (slot < btrfs_header_nritems(l))
3327                         space_needed -= btrfs_leaf_free_space(l);
3328
3329                 wret = push_leaf_right(trans, root, path, space_needed,
3330                                        space_needed, 0, 0);
3331                 if (wret < 0)
3332                         return wret;
3333                 if (wret) {
3334                         space_needed = data_size;
3335                         if (slot > 0)
3336                                 space_needed -= btrfs_leaf_free_space(l);
3337                         wret = push_leaf_left(trans, root, path, space_needed,
3338                                               space_needed, 0, (u32)-1);
3339                         if (wret < 0)
3340                                 return wret;
3341                 }
3342                 l = path->nodes[0];
3343
3344                 /* did the pushes work? */
3345                 if (btrfs_leaf_free_space(l) >= data_size)
3346                         return 0;
3347         }
3348
3349         if (!path->nodes[1]) {
3350                 ret = insert_new_root(trans, root, path, 1);
3351                 if (ret)
3352                         return ret;
3353         }
3354 again:
3355         split = 1;
3356         l = path->nodes[0];
3357         slot = path->slots[0];
3358         nritems = btrfs_header_nritems(l);
3359         mid = (nritems + 1) / 2;
3360
3361         if (mid <= slot) {
3362                 if (nritems == 1 ||
3363                     leaf_space_used(l, mid, nritems - mid) + data_size >
3364                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3365                         if (slot >= nritems) {
3366                                 split = 0;
3367                         } else {
3368                                 mid = slot;
3369                                 if (mid != nritems &&
3370                                     leaf_space_used(l, mid, nritems - mid) +
3371                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3372                                         if (data_size && !tried_avoid_double)
3373                                                 goto push_for_double;
3374                                         split = 2;
3375                                 }
3376                         }
3377                 }
3378         } else {
3379                 if (leaf_space_used(l, 0, mid) + data_size >
3380                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3381                         if (!extend && data_size && slot == 0) {
3382                                 split = 0;
3383                         } else if ((extend || !data_size) && slot == 0) {
3384                                 mid = 1;
3385                         } else {
3386                                 mid = slot;
3387                                 if (mid != nritems &&
3388                                     leaf_space_used(l, mid, nritems - mid) +
3389                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3390                                         if (data_size && !tried_avoid_double)
3391                                                 goto push_for_double;
3392                                         split = 2;
3393                                 }
3394                         }
3395                 }
3396         }
3397
3398         if (split == 0)
3399                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3400         else
3401                 btrfs_item_key(l, &disk_key, mid);
3402
3403         /*
3404          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3405          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3406          * subclasses, which is 8 at the time of this patch, and we've maxed it
3407          * out.  In the future we could add a
3408          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3409          * use BTRFS_NESTING_NEW_ROOT.
3410          */
3411         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3412                                        &disk_key, 0, l->start, 0,
3413                                        num_doubles ? BTRFS_NESTING_NEW_ROOT :
3414                                        BTRFS_NESTING_SPLIT);
3415         if (IS_ERR(right))
3416                 return PTR_ERR(right);
3417
3418         root_add_used(root, fs_info->nodesize);
3419
3420         if (split == 0) {
3421                 if (mid <= slot) {
3422                         btrfs_set_header_nritems(right, 0);
3423                         insert_ptr(trans, path, &disk_key,
3424                                    right->start, path->slots[1] + 1, 1);
3425                         btrfs_tree_unlock(path->nodes[0]);
3426                         free_extent_buffer(path->nodes[0]);
3427                         path->nodes[0] = right;
3428                         path->slots[0] = 0;
3429                         path->slots[1] += 1;
3430                 } else {
3431                         btrfs_set_header_nritems(right, 0);
3432                         insert_ptr(trans, path, &disk_key,
3433                                    right->start, path->slots[1], 1);
3434                         btrfs_tree_unlock(path->nodes[0]);
3435                         free_extent_buffer(path->nodes[0]);
3436                         path->nodes[0] = right;
3437                         path->slots[0] = 0;
3438                         if (path->slots[1] == 0)
3439                                 fixup_low_keys(path, &disk_key, 1);
3440                 }
3441                 /*
3442                  * We create a new leaf 'right' for the required ins_len and
3443                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3444                  * the content of ins_len to 'right'.
3445                  */
3446                 return ret;
3447         }
3448
3449         copy_for_split(trans, path, l, right, slot, mid, nritems);
3450
3451         if (split == 2) {
3452                 BUG_ON(num_doubles != 0);
3453                 num_doubles++;
3454                 goto again;
3455         }
3456
3457         return 0;
3458
3459 push_for_double:
3460         push_for_double_split(trans, root, path, data_size);
3461         tried_avoid_double = 1;
3462         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3463                 return 0;
3464         goto again;
3465 }
3466
/*
 * Make sure the leaf at path->nodes[0] has at least ins_len bytes of free
 * space, splitting it if necessary, while keeping the path pointing at the
 * same EXTENT_DATA or EXTENT_CSUM item.
 *
 * The path has to be released and the item searched again, so the item may
 * have changed in the meantime (different size, or for file extents a
 * different length); in that case -EAGAIN is returned and the caller must
 * retry.  Returns 0 on success and a negative errno on failure.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* Only these two item types are ever split in place. */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/*
	 * Remember the item's size (and extent length for file extents) so
	 * we can detect concurrent modification after re-searching.
	 */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone, tell the caller to retry. */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	/* extend = 1: make room for growing the item at this slot. */
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
3532
/*
 * Split the item at path->slots[0] in two at split_offset bytes from the
 * start of its data.  The original item keeps its key and the first
 * split_offset bytes; a new item inserted at the next slot gets 'new_key'
 * and the remaining bytes.
 *
 * The leaf must already have room for one extra struct btrfs_item (see
 * setup_leaf_for_split()); the item data itself is not duplicated, only
 * re-pointed.  Returns 0 on success or -ENOMEM if the temporary copy of
 * the item data cannot be allocated.
 */
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* Stash a copy of the item data before we shuffle the headers. */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* Make room for the new item header right after the original. */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* New item covers the tail of the original data region. */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* Original item shrinks to the first split_offset bytes. */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
3599
3600 /*
3601  * This function splits a single item into two items,
3602  * giving 'new_key' to the new item and splitting the
3603  * old one at split_offset (from the start of the item).
3604  *
3605  * The path may be released by this operation.  After
3606  * the split, the path is pointing to the old item.  The
3607  * new item is going to be in the same node as the old one.
3608  *
3609  * Note, the item being split must be smaller enough to live alone on
3610  * a tree block with room for one extra struct btrfs_item
3611  *
3612  * This allows us to split the item in place, keeping a lock on the
3613  * leaf the entire time.
3614  */
3615 int btrfs_split_item(struct btrfs_trans_handle *trans,
3616                      struct btrfs_root *root,
3617                      struct btrfs_path *path,
3618                      const struct btrfs_key *new_key,
3619                      unsigned long split_offset)
3620 {
3621         int ret;
3622         ret = setup_leaf_for_split(trans, root, path,
3623                                    sizeof(struct btrfs_item));
3624         if (ret)
3625                 return ret;
3626
3627         ret = split_item(path, new_key, split_offset);
3628         return ret;
3629 }
3630
3631 /*
3632  * This function duplicate a item, giving 'new_key' to the new item.
3633  * It guarantees both items live in the same tree leaf and the new item
3634  * is contiguous with the original item.
3635  *
3636  * This allows us to split file extent in place, keeping a lock on the
3637  * leaf the entire time.
3638  */
3639 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3640                          struct btrfs_root *root,
3641                          struct btrfs_path *path,
3642                          const struct btrfs_key *new_key)
3643 {
3644         struct extent_buffer *leaf;
3645         int ret;
3646         u32 item_size;
3647
3648         leaf = path->nodes[0];
3649         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3650         ret = setup_leaf_for_split(trans, root, path,
3651                                    item_size + sizeof(struct btrfs_item));
3652         if (ret)
3653                 return ret;
3654
3655         path->slots[0]++;
3656         setup_items_for_insert(root, path, new_key, &item_size, 1);
3657         leaf = path->nodes[0];
3658         memcpy_extent_buffer(leaf,
3659                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3660                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3661                              item_size);
3662         return 0;
3663 }
3664
3665 /*
3666  * make the item pointed to by the path smaller.  new_size indicates
3667  * how small to make it, and from_end tells us if we just chop bytes
3668  * off the end of the item or if we shift the item to chop bytes off
3669  * the front.
3670  */
3671 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3672 {
3673         int slot;
3674         struct extent_buffer *leaf;
3675         struct btrfs_item *item;
3676         u32 nritems;
3677         unsigned int data_end;
3678         unsigned int old_data_start;
3679         unsigned int old_size;
3680         unsigned int size_diff;
3681         int i;
3682         struct btrfs_map_token token;
3683
3684         leaf = path->nodes[0];
3685         slot = path->slots[0];
3686
3687         old_size = btrfs_item_size_nr(leaf, slot);
3688         if (old_size == new_size)
3689                 return;
3690
3691         nritems = btrfs_header_nritems(leaf);
3692         data_end = leaf_data_end(leaf);
3693
3694         old_data_start = btrfs_item_offset_nr(leaf, slot);
3695
3696         size_diff = old_size - new_size;
3697
3698         BUG_ON(slot < 0);
3699         BUG_ON(slot >= nritems);
3700
3701         /*
3702          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3703          */
3704         /* first correct the data pointers */
3705         btrfs_init_map_token(&token, leaf);
3706         for (i = slot; i < nritems; i++) {
3707                 u32 ioff;
3708                 item = btrfs_item_nr(i);
3709
3710                 ioff = btrfs_token_item_offset(&token, item);
3711                 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
3712         }
3713
3714         /* shift the data */
3715         if (from_end) {
3716                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3717                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3718                               data_end, old_data_start + new_size - data_end);
3719         } else {
3720                 struct btrfs_disk_key disk_key;
3721                 u64 offset;
3722
3723                 btrfs_item_key(leaf, &disk_key, slot);
3724
3725                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3726                         unsigned long ptr;
3727                         struct btrfs_file_extent_item *fi;
3728
3729                         fi = btrfs_item_ptr(leaf, slot,
3730                                             struct btrfs_file_extent_item);
3731                         fi = (struct btrfs_file_extent_item *)(
3732                              (unsigned long)fi - size_diff);
3733
3734                         if (btrfs_file_extent_type(leaf, fi) ==
3735                             BTRFS_FILE_EXTENT_INLINE) {
3736                                 ptr = btrfs_item_ptr_offset(leaf, slot);
3737                                 memmove_extent_buffer(leaf, ptr,
3738                                       (unsigned long)fi,
3739                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
3740                         }
3741                 }
3742
3743                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3744                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3745                               data_end, old_data_start - data_end);
3746
3747                 offset = btrfs_disk_key_offset(&disk_key);
3748                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3749                 btrfs_set_item_key(leaf, &disk_key, slot);
3750                 if (slot == 0)
3751                         fixup_low_keys(path, &disk_key, 1);
3752         }
3753
3754         item = btrfs_item_nr(slot);
3755         btrfs_set_item_size(leaf, item, new_size);
3756         btrfs_mark_buffer_dirty(leaf);
3757
3758         if (btrfs_leaf_free_space(leaf) < 0) {
3759                 btrfs_print_leaf(leaf);
3760                 BUG();
3761         }
3762 }
3763
3764 /*
3765  * make the item pointed to by the path bigger, data_size is the added size.
3766  */
3767 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
3768 {
3769         int slot;
3770         struct extent_buffer *leaf;
3771         struct btrfs_item *item;
3772         u32 nritems;
3773         unsigned int data_end;
3774         unsigned int old_data;
3775         unsigned int old_size;
3776         int i;
3777         struct btrfs_map_token token;
3778
3779         leaf = path->nodes[0];
3780
3781         nritems = btrfs_header_nritems(leaf);
3782         data_end = leaf_data_end(leaf);
3783
3784         if (btrfs_leaf_free_space(leaf) < data_size) {
3785                 btrfs_print_leaf(leaf);
3786                 BUG();
3787         }
3788         slot = path->slots[0];
3789         old_data = btrfs_item_end_nr(leaf, slot);
3790
3791         BUG_ON(slot < 0);
3792         if (slot >= nritems) {
3793                 btrfs_print_leaf(leaf);
3794                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
3795                            slot, nritems);
3796                 BUG();
3797         }
3798
3799         /*
3800          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3801          */
3802         /* first correct the data pointers */
3803         btrfs_init_map_token(&token, leaf);
3804         for (i = slot; i < nritems; i++) {
3805                 u32 ioff;
3806                 item = btrfs_item_nr(i);
3807
3808                 ioff = btrfs_token_item_offset(&token, item);
3809                 btrfs_set_token_item_offset(&token, item, ioff - data_size);
3810         }
3811
3812         /* shift the data */
3813         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3814                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
3815                       data_end, old_data - data_end);
3816
3817         data_end = old_data;
3818         old_size = btrfs_item_size_nr(leaf, slot);
3819         item = btrfs_item_nr(slot);
3820         btrfs_set_item_size(leaf, item, old_size + data_size);
3821         btrfs_mark_buffer_dirty(leaf);
3822
3823         if (btrfs_leaf_free_space(leaf) < 0) {
3824                 btrfs_print_leaf(leaf);
3825                 BUG();
3826         }
3827 }
3828
3829 /**
3830  * setup_items_for_insert - Helper called before inserting one or more items
3831  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3832  * in a function that doesn't call btrfs_search_slot
3833  *
3834  * @root:       root we are inserting items to
3835  * @path:       points to the leaf/slot where we are going to insert new items
3836  * @cpu_key:    array of keys for items to be inserted
3837  * @data_size:  size of the body of each item we are going to insert
3838  * @nr:         size of @cpu_key/@data_size arrays
3839  */
3840 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3841                             const struct btrfs_key *cpu_key, u32 *data_size,
3842                             int nr)
3843 {
3844         struct btrfs_fs_info *fs_info = root->fs_info;
3845         struct btrfs_item *item;
3846         int i;
3847         u32 nritems;
3848         unsigned int data_end;
3849         struct btrfs_disk_key disk_key;
3850         struct extent_buffer *leaf;
3851         int slot;
3852         struct btrfs_map_token token;
3853         u32 total_size;
3854         u32 total_data = 0;
3855
3856         for (i = 0; i < nr; i++)
3857                 total_data += data_size[i];
3858         total_size = total_data + (nr * sizeof(struct btrfs_item));
3859
3860         if (path->slots[0] == 0) {
3861                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3862                 fixup_low_keys(path, &disk_key, 1);
3863         }
3864         btrfs_unlock_up_safe(path, 1);
3865
3866         leaf = path->nodes[0];
3867         slot = path->slots[0];
3868
3869         nritems = btrfs_header_nritems(leaf);
3870         data_end = leaf_data_end(leaf);
3871
3872         if (btrfs_leaf_free_space(leaf) < total_size) {
3873                 btrfs_print_leaf(leaf);
3874                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
3875                            total_size, btrfs_leaf_free_space(leaf));
3876                 BUG();
3877         }
3878
3879         btrfs_init_map_token(&token, leaf);
3880         if (slot != nritems) {
3881                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3882
3883                 if (old_data < data_end) {
3884                         btrfs_print_leaf(leaf);
3885                         btrfs_crit(fs_info,
3886                 "item at slot %d with data offset %u beyond data end of leaf %u",
3887                                    slot, old_data, data_end);
3888                         BUG();
3889                 }
3890                 /*
3891                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3892                  */
3893                 /* first correct the data pointers */
3894                 for (i = slot; i < nritems; i++) {
3895                         u32 ioff;
3896
3897                         item = btrfs_item_nr(i);
3898                         ioff = btrfs_token_item_offset(&token, item);
3899                         btrfs_set_token_item_offset(&token, item,
3900                                                     ioff - total_data);
3901                 }
3902                 /* shift the items */
3903                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3904                               btrfs_item_nr_offset(slot),
3905                               (nritems - slot) * sizeof(struct btrfs_item));
3906
3907                 /* shift the data */
3908                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3909                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
3910                               data_end, old_data - data_end);
3911                 data_end = old_data;
3912         }
3913
3914         /* setup the item for the new data */
3915         for (i = 0; i < nr; i++) {
3916                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3917                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3918                 item = btrfs_item_nr(slot + i);
3919                 data_end -= data_size[i];
3920                 btrfs_set_token_item_offset(&token, item, data_end);
3921                 btrfs_set_token_item_size(&token, item, data_size[i]);
3922         }
3923
3924         btrfs_set_header_nritems(leaf, nritems + nr);
3925         btrfs_mark_buffer_dirty(leaf);
3926
3927         if (btrfs_leaf_free_space(leaf) < 0) {
3928                 btrfs_print_leaf(leaf);
3929                 BUG();
3930         }
3931 }
3932
3933 /*
3934  * Given a key and some data, insert items into the tree.
3935  * This does all the path init required, making room in the tree if needed.
3936  */
3937 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3938                             struct btrfs_root *root,
3939                             struct btrfs_path *path,
3940                             const struct btrfs_key *cpu_key, u32 *data_size,
3941                             int nr)
3942 {
3943         int ret = 0;
3944         int slot;
3945         int i;
3946         u32 total_size = 0;
3947         u32 total_data = 0;
3948
3949         for (i = 0; i < nr; i++)
3950                 total_data += data_size[i];
3951
3952         total_size = total_data + (nr * sizeof(struct btrfs_item));
3953         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3954         if (ret == 0)
3955                 return -EEXIST;
3956         if (ret < 0)
3957                 return ret;
3958
3959         slot = path->slots[0];
3960         BUG_ON(slot < 0);
3961
3962         setup_items_for_insert(root, path, cpu_key, data_size, nr);
3963         return 0;
3964 }
3965
3966 /*
3967  * Given a key and some data, insert an item into the tree.
3968  * This does all the path init required, making room in the tree if needed.
3969  */
3970 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3971                       const struct btrfs_key *cpu_key, void *data,
3972                       u32 data_size)
3973 {
3974         int ret = 0;
3975         struct btrfs_path *path;
3976         struct extent_buffer *leaf;
3977         unsigned long ptr;
3978
3979         path = btrfs_alloc_path();
3980         if (!path)
3981                 return -ENOMEM;
3982         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3983         if (!ret) {
3984                 leaf = path->nodes[0];
3985                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3986                 write_extent_buffer(leaf, data, ptr, data_size);
3987                 btrfs_mark_buffer_dirty(leaf);
3988         }
3989         btrfs_free_path(path);
3990         return ret;
3991 }
3992
3993 /*
3994  * delete the pointer from a given node.
3995  *
3996  * the tree should have been previously balanced so the deletion does not
3997  * empty a node.
3998  */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			/* Record the pointer shift in the tree mod log. */
			ret = btrfs_tree_mod_log_insert_move(parent, slot,
					slot + 1, nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		/* Close the gap: shift key pointers after 'slot' down one. */
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* Deleting the last slot: nothing moves, just log removal. */
		ret = btrfs_tree_mod_log_insert_key(parent, slot,
				BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* First key changed; propagate the new key to ancestors. */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4038
4039 /*
4040  * a helper function to delete the leaf pointed to by path->slots[1] and
4041  * path->nodes[1].
4042  *
4043  * This deletes the pointer in path->nodes[1] and frees the leaf
4044  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4045  *
4046  * The path must have already been setup for deleting the leaf, including
4047  * all the proper balancing.  path->nodes[1] must be locked.
4048  */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* The leaf should already be COWed into the running transaction. */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	/* Remove the parent's pointer to this leaf. */
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* Hold an extra ref across the free so the buffer stays usable. */
	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
4069 /*
4070  * delete the item at the leaf level in path.  If that empties
4071  * the leaf, remove it from the tree
4072  */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* Total data bytes occupied by the items being deleted. */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		/*
		 * Item data grows from the end of the leaf towards the item
		 * headers, so close the hole by moving the surviving data
		 * up by dsize bytes.
		 */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* Fix up the data offsets of every item that just moved. */
		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item, ioff + dsize);
		}

		/* Shift the remaining item headers over the deleted ones. */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* The root can't be deleted; make it an empty leaf. */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			/* First item changed; update keys in the ancestors. */
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			/* -ENOSPC just means the neighbor couldn't take it. */
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* Everything got pushed away; free the leaf. */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
4180
4181 /*
4182  * search the tree again to find a leaf with lesser keys
4183  * returns 0 if it found something or 1 if there are no lesser leaves.
4184  * returns < 0 on io errors.
4185  *
4186  * This may release the path, and so you may lose any locks held at the
4187  * time you call it.
4188  */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	/*
	 * Compute the largest key strictly smaller than the first key of the
	 * current leaf, borrowing across (objectid, type, offset) as needed.
	 */
	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		/* Already at the smallest possible key: no lesser leaf. */
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}
4230
4231 /*
4232  * A helper function to walk down the tree starting at min_key, and looking
4233  * for nodes or leaves that are have a minimum transaction id.
4234  * This is used by the btree defrag code, and tree logging
4235  *
4236  * This does not cow, but it does stuff the starting key it finds back
4237  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4238  * key and get a writable path.
4239  *
4240  * This honors path->lowest_level to prevent descent past a given level
4241  * of the tree.
4242  *
4243  * min_trans indicates the oldest transaction that you are interested
4244  * in walking through.  Any nodes or leaves older than min_trans are
4245  * skipped over (without reading them).
4246  *
4247  * returns zero if something useful was found, < 0 on error and 1 if there
4248  * was nothing in the tree that matched the search criteria.
4249  */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* Keep locks so btrfs_find_next_key() can walk forward on the path. */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* The whole tree is older than min_trans: nothing to find. */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* No exact match: step back to the slot covering min_key. */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* min_key was advanced; restart the descent. */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		/* Hand the key we stopped at back to the caller. */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
4352
4353 /*
4354  * this is similar to btrfs_next_leaf, but does not try to preserve
4355  * and fixup the path.  It looks for and returns the next key in the
4356  * tree based on the current path and the min_trans parameters.
4357  *
4358  * 0 is returned if another key is found, < 0 if there are any errors
4359  * and 1 is returned if there are no higher keys in the tree
4360  *
4361  * path->keep_locks should be set to 1 on the search made before
4362  * calling this function.
4363  */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				/* Parent is held; safe to just climb up. */
				level++;
				continue;
			}

			/*
			 * Parent is not locked, so re-search from the last key
			 * we saw at this level to reacquire a consistent path.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match means we already visited it; advance. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip node pointers older than min_trans. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
4428
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	bool need_commit_sem = false;
	u32 nritems;
	int ret;
	int i;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key of the current leaf so we can re-search. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	btrfs_release_path(path);

	path->keep_locks = 1;

	if (time_seq) {
		/* Historical search against the tree mod log. */
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	} else {
		if (path->need_commit_sem) {
			/*
			 * Commit-root searches need commit_root_sem held;
			 * take it here, the flag is restored at 'done'.
			 */
			path->need_commit_sem = 0;
			need_commit_sem = true;
			down_read(&fs_info->commit_root_sem);
		}
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	}
	path->keep_locks = 0;

	if (ret < 0)
		goto done;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* Climb until we find a level with a slot to the right of us. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				/* Ran off the top of the tree: no next leaf. */
				ret = 1;
				goto done;
			}
			continue;
		}


		/*
		 * Our current level is where we're going to start from, and to
		 * make sure lockdep doesn't complain we need to drop our locks
		 * and nodes from 0 to our current level.
		 */
		for (i = 0; i < level; i++) {
			/*
			 * NOTE(review): this tests locks[level], not locks[i].
			 * With keep_locks set during the search above, the
			 * lower levels should all be locked, but confirm the
			 * index is intentional.
			 */
			if (path->locks[level]) {
				btrfs_tree_read_unlock(path->nodes[i]);
				path->locks[i] = 0;
			}
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}

		next = c;
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret)
				btrfs_tree_read_lock(next);
		}
		break;
	}
	path->slots[level] = slot;
	/* Descend along the leftmost edge down to the next leaf. */
	while (1) {
		level--;
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = BTRFS_READ_LOCK;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking)
			btrfs_tree_read_lock(next);
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	if (need_commit_sem) {
		int ret2;

		/* Restore the flag and release commit_root_sem taken above. */
		path->need_commit_sem = 1;
		ret2 = finish_need_commit_sem_search(path);
		up_read(&fs_info->commit_root_sem);
		if (ret2)
			ret = ret2;
	}

	return ret;
}
4603
4604 /*
4605  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4606  * searching until it gets past min_objectid or finds an item of 'type'
4607  *
4608  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4609  */
4610 int btrfs_previous_item(struct btrfs_root *root,
4611                         struct btrfs_path *path, u64 min_objectid,
4612                         int type)
4613 {
4614         struct btrfs_key found_key;
4615         struct extent_buffer *leaf;
4616         u32 nritems;
4617         int ret;
4618
4619         while (1) {
4620                 if (path->slots[0] == 0) {
4621                         ret = btrfs_prev_leaf(root, path);
4622                         if (ret != 0)
4623                                 return ret;
4624                 } else {
4625                         path->slots[0]--;
4626                 }
4627                 leaf = path->nodes[0];
4628                 nritems = btrfs_header_nritems(leaf);
4629                 if (nritems == 0)
4630                         return 1;
4631                 if (path->slots[0] == nritems)
4632                         path->slots[0]--;
4633
4634                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4635                 if (found_key.objectid < min_objectid)
4636                         break;
4637                 if (found_key.type == type)
4638                         return 0;
4639                 if (found_key.objectid == min_objectid &&
4640                     found_key.type < type)
4641                         break;
4642         }
4643         return 1;
4644 }
4645
4646 /*
4647  * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
4649  *
4650  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4651  */
4652 int btrfs_previous_extent_item(struct btrfs_root *root,
4653                         struct btrfs_path *path, u64 min_objectid)
4654 {
4655         struct btrfs_key found_key;
4656         struct extent_buffer *leaf;
4657         u32 nritems;
4658         int ret;
4659
4660         while (1) {
4661                 if (path->slots[0] == 0) {
4662                         ret = btrfs_prev_leaf(root, path);
4663                         if (ret != 0)
4664                                 return ret;
4665                 } else {
4666                         path->slots[0]--;
4667                 }
4668                 leaf = path->nodes[0];
4669                 nritems = btrfs_header_nritems(leaf);
4670                 if (nritems == 0)
4671                         return 1;
4672                 if (path->slots[0] == nritems)
4673                         path->slots[0]--;
4674
4675                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4676                 if (found_key.objectid < min_objectid)
4677                         break;
4678                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4679                     found_key.type == BTRFS_METADATA_ITEM_KEY)
4680                         return 0;
4681                 if (found_key.objectid == min_objectid &&
4682                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
4683                         break;
4684         }
4685         return 1;
4686 }