Merge tag 'perf-tools-fixes-for-v5.12-2020-03-28' of git://git.kernel.org/pub/scm...
[platform/kernel/linux-rpi.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17
18 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19                       *root, struct btrfs_path *path, int level);
20 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21                       const struct btrfs_key *ins_key, struct btrfs_path *path,
22                       int data_size, int extend);
23 static int push_node_left(struct btrfs_trans_handle *trans,
24                           struct extent_buffer *dst,
25                           struct extent_buffer *src, int empty);
26 static int balance_node_right(struct btrfs_trans_handle *trans,
27                               struct extent_buffer *dst_buf,
28                               struct extent_buffer *src_buf);
29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30                     int level, int slot);
31
32 static const struct btrfs_csums {
33         u16             size;
34         const char      name[10];
35         const char      driver[12];
36 } btrfs_csums[] = {
37         [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
38         [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
39         [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
40         [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
41                                      .driver = "blake2b-256" },
42 };
43
44 int btrfs_super_csum_size(const struct btrfs_super_block *s)
45 {
46         u16 t = btrfs_super_csum_type(s);
47         /*
48          * csum type is validated at mount time
49          */
50         return btrfs_csums[t].size;
51 }
52
53 const char *btrfs_super_csum_name(u16 csum_type)
54 {
55         /* csum type is validated at mount time */
56         return btrfs_csums[csum_type].name;
57 }
58
59 /*
60  * Return driver name if defined, otherwise the name that's also a valid driver
61  * name
62  */
63 const char *btrfs_super_csum_driver(u16 csum_type)
64 {
65         /* csum type is validated at mount time */
66         return btrfs_csums[csum_type].driver[0] ?
67                 btrfs_csums[csum_type].driver :
68                 btrfs_csums[csum_type].name;
69 }
70
71 size_t __attribute_const__ btrfs_get_num_csums(void)
72 {
73         return ARRAY_SIZE(btrfs_csums);
74 }
75
76 struct btrfs_path *btrfs_alloc_path(void)
77 {
78         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
79 }
80
81 /* this also releases the path */
82 void btrfs_free_path(struct btrfs_path *p)
83 {
84         if (!p)
85                 return;
86         btrfs_release_path(p);
87         kmem_cache_free(btrfs_path_cachep, p);
88 }
89
90 /*
91  * path release drops references on the extent buffers in the path
92  * and it drops any locks held by this path
93  *
94  * It is safe to call this on paths that no locks or extent buffers held.
95  */
96 noinline void btrfs_release_path(struct btrfs_path *p)
97 {
98         int i;
99
100         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
101                 p->slots[i] = 0;
102                 if (!p->nodes[i])
103                         continue;
104                 if (p->locks[i]) {
105                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
106                         p->locks[i] = 0;
107                 }
108                 free_extent_buffer(p->nodes[i]);
109                 p->nodes[i] = NULL;
110         }
111 }
112
113 /*
114  * safely gets a reference on the root node of a tree.  A lock
115  * is not taken, so a concurrent writer may put a different node
116  * at the root of the tree.  See btrfs_lock_root_node for the
117  * looping required.
118  *
119  * The extent buffer returned by this has a reference taken, so
120  * it won't disappear.  It may stop being the root of the tree
121  * at any time because there are no locks held.
122  */
123 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
124 {
125         struct extent_buffer *eb;
126
127         while (1) {
128                 rcu_read_lock();
129                 eb = rcu_dereference(root->node);
130
131                 /*
132                  * RCU really hurts here, we could free up the root node because
133                  * it was COWed but we may not get the new root node yet so do
134                  * the inc_not_zero dance and if it doesn't work then
135                  * synchronize_rcu and try again.
136                  */
137                 if (atomic_inc_not_zero(&eb->refs)) {
138                         rcu_read_unlock();
139                         break;
140                 }
141                 rcu_read_unlock();
142                 synchronize_rcu();
143         }
144         return eb;
145 }
146
147 /*
148  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
149  * just get put onto a simple dirty list.  Transaction walks this list to make
150  * sure they get properly updated on disk.
151  */
152 static void add_root_to_dirty_list(struct btrfs_root *root)
153 {
154         struct btrfs_fs_info *fs_info = root->fs_info;
155
156         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
157             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
158                 return;
159
160         spin_lock(&fs_info->trans_lock);
161         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
162                 /* Want the extent tree to be the last on the list */
163                 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
164                         list_move_tail(&root->dirty_list,
165                                        &fs_info->dirty_cowonly_roots);
166                 else
167                         list_move(&root->dirty_list,
168                                   &fs_info->dirty_cowonly_roots);
169         }
170         spin_unlock(&fs_info->trans_lock);
171 }
172
173 /*
174  * used by snapshot creation to make a copy of a root for a tree with
175  * a given objectid.  The buffer with the new root node is returned in
176  * cow_ret, and this func returns zero on success or a negative error code.
177  */
178 int btrfs_copy_root(struct btrfs_trans_handle *trans,
179                       struct btrfs_root *root,
180                       struct extent_buffer *buf,
181                       struct extent_buffer **cow_ret, u64 new_root_objectid)
182 {
183         struct btrfs_fs_info *fs_info = root->fs_info;
184         struct extent_buffer *cow;
185         int ret = 0;
186         int level;
187         struct btrfs_disk_key disk_key;
188
189         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
190                 trans->transid != fs_info->running_transaction->transid);
191         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
192                 trans->transid != root->last_trans);
193
194         level = btrfs_header_level(buf);
195         if (level == 0)
196                 btrfs_item_key(buf, &disk_key, 0);
197         else
198                 btrfs_node_key(buf, &disk_key, 0);
199
200         cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
201                                      &disk_key, level, buf->start, 0,
202                                      BTRFS_NESTING_NEW_ROOT);
203         if (IS_ERR(cow))
204                 return PTR_ERR(cow);
205
206         copy_extent_buffer_full(cow, buf);
207         btrfs_set_header_bytenr(cow, cow->start);
208         btrfs_set_header_generation(cow, trans->transid);
209         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
210         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
211                                      BTRFS_HEADER_FLAG_RELOC);
212         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
213                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
214         else
215                 btrfs_set_header_owner(cow, new_root_objectid);
216
217         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
218
219         WARN_ON(btrfs_header_generation(buf) > trans->transid);
220         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
221                 ret = btrfs_inc_ref(trans, root, cow, 1);
222         else
223                 ret = btrfs_inc_ref(trans, root, cow, 0);
224         if (ret) {
225                 btrfs_tree_unlock(cow);
226                 free_extent_buffer(cow);
227                 btrfs_abort_transaction(trans, ret);
228                 return ret;
229         }
230
231         btrfs_mark_buffer_dirty(cow);
232         *cow_ret = cow;
233         return 0;
234 }
235
236 enum mod_log_op {
237         MOD_LOG_KEY_REPLACE,
238         MOD_LOG_KEY_ADD,
239         MOD_LOG_KEY_REMOVE,
240         MOD_LOG_KEY_REMOVE_WHILE_FREEING,
241         MOD_LOG_KEY_REMOVE_WHILE_MOVING,
242         MOD_LOG_MOVE_KEYS,
243         MOD_LOG_ROOT_REPLACE,
244 };
245
246 struct tree_mod_root {
247         u64 logical;
248         u8 level;
249 };
250
251 struct tree_mod_elem {
252         struct rb_node node;
253         u64 logical;
254         u64 seq;
255         enum mod_log_op op;
256
257         /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
258         int slot;
259
260         /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
261         u64 generation;
262
263         /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
264         struct btrfs_disk_key key;
265         u64 blockptr;
266
267         /* this is used for op == MOD_LOG_MOVE_KEYS */
268         struct {
269                 int dst_slot;
270                 int nr_items;
271         } move;
272
273         /* this is used for op == MOD_LOG_ROOT_REPLACE */
274         struct tree_mod_root old_root;
275 };
276
277 /*
278  * Pull a new tree mod seq number for our operation.
279  */
280 static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
281 {
282         return atomic64_inc_return(&fs_info->tree_mod_seq);
283 }
284
285 /*
286  * This adds a new blocker to the tree mod log's blocker list if the @elem
287  * passed does not already have a sequence number set. So when a caller expects
288  * to record tree modifications, it should ensure to set elem->seq to zero
289  * before calling btrfs_get_tree_mod_seq.
290  * Returns a fresh, unused tree log modification sequence number, even if no new
291  * blocker was added.
292  */
293 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
294                            struct seq_list *elem)
295 {
296         write_lock(&fs_info->tree_mod_log_lock);
297         if (!elem->seq) {
298                 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
299                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
300         }
301         write_unlock(&fs_info->tree_mod_log_lock);
302
303         return elem->seq;
304 }
305
306 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
307                             struct seq_list *elem)
308 {
309         struct rb_root *tm_root;
310         struct rb_node *node;
311         struct rb_node *next;
312         struct tree_mod_elem *tm;
313         u64 min_seq = (u64)-1;
314         u64 seq_putting = elem->seq;
315
316         if (!seq_putting)
317                 return;
318
319         write_lock(&fs_info->tree_mod_log_lock);
320         list_del(&elem->list);
321         elem->seq = 0;
322
323         if (!list_empty(&fs_info->tree_mod_seq_list)) {
324                 struct seq_list *first;
325
326                 first = list_first_entry(&fs_info->tree_mod_seq_list,
327                                          struct seq_list, list);
328                 if (seq_putting > first->seq) {
329                         /*
330                          * Blocker with lower sequence number exists, we
331                          * cannot remove anything from the log.
332                          */
333                         write_unlock(&fs_info->tree_mod_log_lock);
334                         return;
335                 }
336                 min_seq = first->seq;
337         }
338
339         /*
340          * anything that's lower than the lowest existing (read: blocked)
341          * sequence number can be removed from the tree.
342          */
343         tm_root = &fs_info->tree_mod_log;
344         for (node = rb_first(tm_root); node; node = next) {
345                 next = rb_next(node);
346                 tm = rb_entry(node, struct tree_mod_elem, node);
347                 if (tm->seq >= min_seq)
348                         continue;
349                 rb_erase(node, tm_root);
350                 kfree(tm);
351         }
352         write_unlock(&fs_info->tree_mod_log_lock);
353 }
354
355 /*
356  * key order of the log:
357  *       node/leaf start address -> sequence
358  *
359  * The 'start address' is the logical address of the *new* root node
360  * for root replace operations, or the logical address of the affected
361  * block for all other operations.
362  */
363 static noinline int
364 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
365 {
366         struct rb_root *tm_root;
367         struct rb_node **new;
368         struct rb_node *parent = NULL;
369         struct tree_mod_elem *cur;
370
371         lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
372
373         tm->seq = btrfs_inc_tree_mod_seq(fs_info);
374
375         tm_root = &fs_info->tree_mod_log;
376         new = &tm_root->rb_node;
377         while (*new) {
378                 cur = rb_entry(*new, struct tree_mod_elem, node);
379                 parent = *new;
380                 if (cur->logical < tm->logical)
381                         new = &((*new)->rb_left);
382                 else if (cur->logical > tm->logical)
383                         new = &((*new)->rb_right);
384                 else if (cur->seq < tm->seq)
385                         new = &((*new)->rb_left);
386                 else if (cur->seq > tm->seq)
387                         new = &((*new)->rb_right);
388                 else
389                         return -EEXIST;
390         }
391
392         rb_link_node(&tm->node, parent, new);
393         rb_insert_color(&tm->node, tm_root);
394         return 0;
395 }
396
397 /*
398  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
399  * returns zero with the tree_mod_log_lock acquired. The caller must hold
400  * this until all tree mod log insertions are recorded in the rb tree and then
401  * write unlock fs_info::tree_mod_log_lock.
402  */
403 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
404                                     struct extent_buffer *eb) {
405         smp_mb();
406         if (list_empty(&(fs_info)->tree_mod_seq_list))
407                 return 1;
408         if (eb && btrfs_header_level(eb) == 0)
409                 return 1;
410
411         write_lock(&fs_info->tree_mod_log_lock);
412         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
413                 write_unlock(&fs_info->tree_mod_log_lock);
414                 return 1;
415         }
416
417         return 0;
418 }
419
420 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
421 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
422                                     struct extent_buffer *eb)
423 {
424         smp_mb();
425         if (list_empty(&(fs_info)->tree_mod_seq_list))
426                 return 0;
427         if (eb && btrfs_header_level(eb) == 0)
428                 return 0;
429
430         return 1;
431 }
432
433 static struct tree_mod_elem *
434 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
435                     enum mod_log_op op, gfp_t flags)
436 {
437         struct tree_mod_elem *tm;
438
439         tm = kzalloc(sizeof(*tm), flags);
440         if (!tm)
441                 return NULL;
442
443         tm->logical = eb->start;
444         if (op != MOD_LOG_KEY_ADD) {
445                 btrfs_node_key(eb, &tm->key, slot);
446                 tm->blockptr = btrfs_node_blockptr(eb, slot);
447         }
448         tm->op = op;
449         tm->slot = slot;
450         tm->generation = btrfs_node_ptr_generation(eb, slot);
451         RB_CLEAR_NODE(&tm->node);
452
453         return tm;
454 }
455
456 static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
457                 enum mod_log_op op, gfp_t flags)
458 {
459         struct tree_mod_elem *tm;
460         int ret;
461
462         if (!tree_mod_need_log(eb->fs_info, eb))
463                 return 0;
464
465         tm = alloc_tree_mod_elem(eb, slot, op, flags);
466         if (!tm)
467                 return -ENOMEM;
468
469         if (tree_mod_dont_log(eb->fs_info, eb)) {
470                 kfree(tm);
471                 return 0;
472         }
473
474         ret = __tree_mod_log_insert(eb->fs_info, tm);
475         write_unlock(&eb->fs_info->tree_mod_log_lock);
476         if (ret)
477                 kfree(tm);
478
479         return ret;
480 }
481
482 static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
483                 int dst_slot, int src_slot, int nr_items)
484 {
485         struct tree_mod_elem *tm = NULL;
486         struct tree_mod_elem **tm_list = NULL;
487         int ret = 0;
488         int i;
489         int locked = 0;
490
491         if (!tree_mod_need_log(eb->fs_info, eb))
492                 return 0;
493
494         tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
495         if (!tm_list)
496                 return -ENOMEM;
497
498         tm = kzalloc(sizeof(*tm), GFP_NOFS);
499         if (!tm) {
500                 ret = -ENOMEM;
501                 goto free_tms;
502         }
503
504         tm->logical = eb->start;
505         tm->slot = src_slot;
506         tm->move.dst_slot = dst_slot;
507         tm->move.nr_items = nr_items;
508         tm->op = MOD_LOG_MOVE_KEYS;
509
510         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
511                 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
512                     MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
513                 if (!tm_list[i]) {
514                         ret = -ENOMEM;
515                         goto free_tms;
516                 }
517         }
518
519         if (tree_mod_dont_log(eb->fs_info, eb))
520                 goto free_tms;
521         locked = 1;
522
523         /*
524          * When we override something during the move, we log these removals.
525          * This can only happen when we move towards the beginning of the
526          * buffer, i.e. dst_slot < src_slot.
527          */
528         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
529                 ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
530                 if (ret)
531                         goto free_tms;
532         }
533
534         ret = __tree_mod_log_insert(eb->fs_info, tm);
535         if (ret)
536                 goto free_tms;
537         write_unlock(&eb->fs_info->tree_mod_log_lock);
538         kfree(tm_list);
539
540         return 0;
541 free_tms:
542         for (i = 0; i < nr_items; i++) {
543                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
544                         rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
545                 kfree(tm_list[i]);
546         }
547         if (locked)
548                 write_unlock(&eb->fs_info->tree_mod_log_lock);
549         kfree(tm_list);
550         kfree(tm);
551
552         return ret;
553 }
554
555 static inline int
556 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
557                        struct tree_mod_elem **tm_list,
558                        int nritems)
559 {
560         int i, j;
561         int ret;
562
563         for (i = nritems - 1; i >= 0; i--) {
564                 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
565                 if (ret) {
566                         for (j = nritems - 1; j > i; j--)
567                                 rb_erase(&tm_list[j]->node,
568                                          &fs_info->tree_mod_log);
569                         return ret;
570                 }
571         }
572
573         return 0;
574 }
575
576 static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
577                          struct extent_buffer *new_root, int log_removal)
578 {
579         struct btrfs_fs_info *fs_info = old_root->fs_info;
580         struct tree_mod_elem *tm = NULL;
581         struct tree_mod_elem **tm_list = NULL;
582         int nritems = 0;
583         int ret = 0;
584         int i;
585
586         if (!tree_mod_need_log(fs_info, NULL))
587                 return 0;
588
589         if (log_removal && btrfs_header_level(old_root) > 0) {
590                 nritems = btrfs_header_nritems(old_root);
591                 tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
592                                   GFP_NOFS);
593                 if (!tm_list) {
594                         ret = -ENOMEM;
595                         goto free_tms;
596                 }
597                 for (i = 0; i < nritems; i++) {
598                         tm_list[i] = alloc_tree_mod_elem(old_root, i,
599                             MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
600                         if (!tm_list[i]) {
601                                 ret = -ENOMEM;
602                                 goto free_tms;
603                         }
604                 }
605         }
606
607         tm = kzalloc(sizeof(*tm), GFP_NOFS);
608         if (!tm) {
609                 ret = -ENOMEM;
610                 goto free_tms;
611         }
612
613         tm->logical = new_root->start;
614         tm->old_root.logical = old_root->start;
615         tm->old_root.level = btrfs_header_level(old_root);
616         tm->generation = btrfs_header_generation(old_root);
617         tm->op = MOD_LOG_ROOT_REPLACE;
618
619         if (tree_mod_dont_log(fs_info, NULL))
620                 goto free_tms;
621
622         if (tm_list)
623                 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
624         if (!ret)
625                 ret = __tree_mod_log_insert(fs_info, tm);
626
627         write_unlock(&fs_info->tree_mod_log_lock);
628         if (ret)
629                 goto free_tms;
630         kfree(tm_list);
631
632         return ret;
633
634 free_tms:
635         if (tm_list) {
636                 for (i = 0; i < nritems; i++)
637                         kfree(tm_list[i]);
638                 kfree(tm_list);
639         }
640         kfree(tm);
641
642         return ret;
643 }
644
645 static struct tree_mod_elem *
646 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
647                       int smallest)
648 {
649         struct rb_root *tm_root;
650         struct rb_node *node;
651         struct tree_mod_elem *cur = NULL;
652         struct tree_mod_elem *found = NULL;
653
654         read_lock(&fs_info->tree_mod_log_lock);
655         tm_root = &fs_info->tree_mod_log;
656         node = tm_root->rb_node;
657         while (node) {
658                 cur = rb_entry(node, struct tree_mod_elem, node);
659                 if (cur->logical < start) {
660                         node = node->rb_left;
661                 } else if (cur->logical > start) {
662                         node = node->rb_right;
663                 } else if (cur->seq < min_seq) {
664                         node = node->rb_left;
665                 } else if (!smallest) {
666                         /* we want the node with the highest seq */
667                         if (found)
668                                 BUG_ON(found->seq > cur->seq);
669                         found = cur;
670                         node = node->rb_left;
671                 } else if (cur->seq > min_seq) {
672                         /* we want the node with the smallest seq */
673                         if (found)
674                                 BUG_ON(found->seq < cur->seq);
675                         found = cur;
676                         node = node->rb_right;
677                 } else {
678                         found = cur;
679                         break;
680                 }
681         }
682         read_unlock(&fs_info->tree_mod_log_lock);
683
684         return found;
685 }
686
687 /*
688  * this returns the element from the log with the smallest time sequence
689  * value that's in the log (the oldest log item). any element with a time
690  * sequence lower than min_seq will be ignored.
691  */
692 static struct tree_mod_elem *
693 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
694                            u64 min_seq)
695 {
696         return __tree_mod_log_search(fs_info, start, min_seq, 1);
697 }
698
699 /*
700  * this returns the element from the log with the largest time sequence
701  * value that's in the log (the most recent log item). any element with
702  * a time sequence lower than min_seq will be ignored.
703  */
704 static struct tree_mod_elem *
705 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
706 {
707         return __tree_mod_log_search(fs_info, start, min_seq, 0);
708 }
709
710 static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
711                      struct extent_buffer *src, unsigned long dst_offset,
712                      unsigned long src_offset, int nr_items)
713 {
714         struct btrfs_fs_info *fs_info = dst->fs_info;
715         int ret = 0;
716         struct tree_mod_elem **tm_list = NULL;
717         struct tree_mod_elem **tm_list_add, **tm_list_rem;
718         int i;
719         int locked = 0;
720
721         if (!tree_mod_need_log(fs_info, NULL))
722                 return 0;
723
724         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
725                 return 0;
726
727         tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
728                           GFP_NOFS);
729         if (!tm_list)
730                 return -ENOMEM;
731
732         tm_list_add = tm_list;
733         tm_list_rem = tm_list + nr_items;
734         for (i = 0; i < nr_items; i++) {
735                 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
736                     MOD_LOG_KEY_REMOVE, GFP_NOFS);
737                 if (!tm_list_rem[i]) {
738                         ret = -ENOMEM;
739                         goto free_tms;
740                 }
741
742                 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
743                     MOD_LOG_KEY_ADD, GFP_NOFS);
744                 if (!tm_list_add[i]) {
745                         ret = -ENOMEM;
746                         goto free_tms;
747                 }
748         }
749
750         if (tree_mod_dont_log(fs_info, NULL))
751                 goto free_tms;
752         locked = 1;
753
754         for (i = 0; i < nr_items; i++) {
755                 ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
756                 if (ret)
757                         goto free_tms;
758                 ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
759                 if (ret)
760                         goto free_tms;
761         }
762
763         write_unlock(&fs_info->tree_mod_log_lock);
764         kfree(tm_list);
765
766         return 0;
767
768 free_tms:
769         for (i = 0; i < nr_items * 2; i++) {
770                 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
771                         rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
772                 kfree(tm_list[i]);
773         }
774         if (locked)
775                 write_unlock(&fs_info->tree_mod_log_lock);
776         kfree(tm_list);
777
778         return ret;
779 }
780
781 static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
782 {
783         struct tree_mod_elem **tm_list = NULL;
784         int nritems = 0;
785         int i;
786         int ret = 0;
787
788         if (btrfs_header_level(eb) == 0)
789                 return 0;
790
791         if (!tree_mod_need_log(eb->fs_info, NULL))
792                 return 0;
793
794         nritems = btrfs_header_nritems(eb);
795         tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
796         if (!tm_list)
797                 return -ENOMEM;
798
799         for (i = 0; i < nritems; i++) {
800                 tm_list[i] = alloc_tree_mod_elem(eb, i,
801                     MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
802                 if (!tm_list[i]) {
803                         ret = -ENOMEM;
804                         goto free_tms;
805                 }
806         }
807
808         if (tree_mod_dont_log(eb->fs_info, eb))
809                 goto free_tms;
810
811         ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
812         write_unlock(&eb->fs_info->tree_mod_log_lock);
813         if (ret)
814                 goto free_tms;
815         kfree(tm_list);
816
817         return 0;
818
819 free_tms:
820         for (i = 0; i < nritems; i++)
821                 kfree(tm_list[i]);
822         kfree(tm_list);
823
824         return ret;
825 }
826
827 /*
828  * check if the tree block can be shared by multiple trees
829  */
830 int btrfs_block_can_be_shared(struct btrfs_root *root,
831                               struct extent_buffer *buf)
832 {
833         /*
834          * Tree blocks not in shareable trees and tree roots are never shared.
835          * If a block was allocated after the last snapshot and the block was
836          * not allocated by tree relocation, we know the block is not shared.
837          */
838         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
839             buf != root->node && buf != root->commit_root &&
840             (btrfs_header_generation(buf) <=
841              btrfs_root_last_snapshot(&root->root_item) ||
842              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
843                 return 1;
844
845         return 0;
846 }
847
848 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
849                                        struct btrfs_root *root,
850                                        struct extent_buffer *buf,
851                                        struct extent_buffer *cow,
852                                        int *last_ref)
853 {
854         struct btrfs_fs_info *fs_info = root->fs_info;
855         u64 refs;
856         u64 owner;
857         u64 flags;
858         u64 new_flags = 0;
859         int ret;
860
861         /*
862          * Backrefs update rules:
863          *
864          * Always use full backrefs for extent pointers in tree block
865          * allocated by tree relocation.
866          *
867          * If a shared tree block is no longer referenced by its owner
868          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
869          * use full backrefs for extent pointers in tree block.
870          *
871          * If a tree block is been relocating
872          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
873          * use full backrefs for extent pointers in tree block.
874          * The reason for this is some operations (such as drop tree)
875          * are only allowed for blocks use full backrefs.
876          */
877
878         if (btrfs_block_can_be_shared(root, buf)) {
879                 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
880                                                btrfs_header_level(buf), 1,
881                                                &refs, &flags);
882                 if (ret)
883                         return ret;
884                 if (refs == 0) {
885                         ret = -EROFS;
886                         btrfs_handle_fs_error(fs_info, ret, NULL);
887                         return ret;
888                 }
889         } else {
890                 refs = 1;
891                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
892                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
893                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
894                 else
895                         flags = 0;
896         }
897
898         owner = btrfs_header_owner(buf);
899         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
900                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
901
902         if (refs > 1) {
903                 if ((owner == root->root_key.objectid ||
904                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
905                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
906                         ret = btrfs_inc_ref(trans, root, buf, 1);
907                         if (ret)
908                                 return ret;
909
910                         if (root->root_key.objectid ==
911                             BTRFS_TREE_RELOC_OBJECTID) {
912                                 ret = btrfs_dec_ref(trans, root, buf, 0);
913                                 if (ret)
914                                         return ret;
915                                 ret = btrfs_inc_ref(trans, root, cow, 1);
916                                 if (ret)
917                                         return ret;
918                         }
919                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
920                 } else {
921
922                         if (root->root_key.objectid ==
923                             BTRFS_TREE_RELOC_OBJECTID)
924                                 ret = btrfs_inc_ref(trans, root, cow, 1);
925                         else
926                                 ret = btrfs_inc_ref(trans, root, cow, 0);
927                         if (ret)
928                                 return ret;
929                 }
930                 if (new_flags != 0) {
931                         int level = btrfs_header_level(buf);
932
933                         ret = btrfs_set_disk_extent_flags(trans, buf,
934                                                           new_flags, level, 0);
935                         if (ret)
936                                 return ret;
937                 }
938         } else {
939                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
940                         if (root->root_key.objectid ==
941                             BTRFS_TREE_RELOC_OBJECTID)
942                                 ret = btrfs_inc_ref(trans, root, cow, 1);
943                         else
944                                 ret = btrfs_inc_ref(trans, root, cow, 0);
945                         if (ret)
946                                 return ret;
947                         ret = btrfs_dec_ref(trans, root, buf, 1);
948                         if (ret)
949                                 return ret;
950                 }
951                 btrfs_clean_tree_block(buf);
952                 *last_ref = 1;
953         }
954         return 0;
955 }
956
957 static struct extent_buffer *alloc_tree_block_no_bg_flush(
958                                           struct btrfs_trans_handle *trans,
959                                           struct btrfs_root *root,
960                                           u64 parent_start,
961                                           const struct btrfs_disk_key *disk_key,
962                                           int level,
963                                           u64 hint,
964                                           u64 empty_size,
965                                           enum btrfs_lock_nesting nest)
966 {
967         struct btrfs_fs_info *fs_info = root->fs_info;
968         struct extent_buffer *ret;
969
970         /*
971          * If we are COWing a node/leaf from the extent, chunk, device or free
972          * space trees, make sure that we do not finish block group creation of
973          * pending block groups. We do this to avoid a deadlock.
974          * COWing can result in allocation of a new chunk, and flushing pending
975          * block groups (btrfs_create_pending_block_groups()) can be triggered
976          * when finishing allocation of a new chunk. Creation of a pending block
977          * group modifies the extent, chunk, device and free space trees,
978          * therefore we could deadlock with ourselves since we are holding a
979          * lock on an extent buffer that btrfs_create_pending_block_groups() may
980          * try to COW later.
981          * For similar reasons, we also need to delay flushing pending block
982          * groups when splitting a leaf or node, from one of those trees, since
983          * we are holding a write lock on it and its parent or when inserting a
984          * new root node for one of those trees.
985          */
986         if (root == fs_info->extent_root ||
987             root == fs_info->chunk_root ||
988             root == fs_info->dev_root ||
989             root == fs_info->free_space_root)
990                 trans->can_flush_pending_bgs = false;
991
992         ret = btrfs_alloc_tree_block(trans, root, parent_start,
993                                      root->root_key.objectid, disk_key, level,
994                                      hint, empty_size, nest);
995         trans->can_flush_pending_bgs = true;
996
997         return ret;
998 }
999
1000 /*
1001  * does the dirty work in cow of a single block.  The parent block (if
1002  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1003  * dirty and returned locked.  If you modify the block it needs to be marked
1004  * dirty again.
1005  *
1006  * search_start -- an allocation hint for the new block
1007  *
1008  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1009  * bytes the allocator should try to find free next to the block it returns.
1010  * This is just a hint and may be ignored by the allocator.
1011  */
1012 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1013                              struct btrfs_root *root,
1014                              struct extent_buffer *buf,
1015                              struct extent_buffer *parent, int parent_slot,
1016                              struct extent_buffer **cow_ret,
1017                              u64 search_start, u64 empty_size,
1018                              enum btrfs_lock_nesting nest)
1019 {
1020         struct btrfs_fs_info *fs_info = root->fs_info;
1021         struct btrfs_disk_key disk_key;
1022         struct extent_buffer *cow;
1023         int level, ret;
1024         int last_ref = 0;
1025         int unlock_orig = 0;
1026         u64 parent_start = 0;
1027
1028         if (*cow_ret == buf)
1029                 unlock_orig = 1;
1030
1031         btrfs_assert_tree_locked(buf);
1032
1033         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1034                 trans->transid != fs_info->running_transaction->transid);
1035         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1036                 trans->transid != root->last_trans);
1037
1038         level = btrfs_header_level(buf);
1039
1040         if (level == 0)
1041                 btrfs_item_key(buf, &disk_key, 0);
1042         else
1043                 btrfs_node_key(buf, &disk_key, 0);
1044
1045         if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1046                 parent_start = parent->start;
1047
1048         cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
1049                                            level, search_start, empty_size, nest);
1050         if (IS_ERR(cow))
1051                 return PTR_ERR(cow);
1052
1053         /* cow is set to blocking by btrfs_init_new_buffer */
1054
1055         copy_extent_buffer_full(cow, buf);
1056         btrfs_set_header_bytenr(cow, cow->start);
1057         btrfs_set_header_generation(cow, trans->transid);
1058         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1059         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1060                                      BTRFS_HEADER_FLAG_RELOC);
1061         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1062                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1063         else
1064                 btrfs_set_header_owner(cow, root->root_key.objectid);
1065
1066         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
1067
1068         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1069         if (ret) {
1070                 btrfs_tree_unlock(cow);
1071                 free_extent_buffer(cow);
1072                 btrfs_abort_transaction(trans, ret);
1073                 return ret;
1074         }
1075
1076         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
1077                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1078                 if (ret) {
1079                         btrfs_tree_unlock(cow);
1080                         free_extent_buffer(cow);
1081                         btrfs_abort_transaction(trans, ret);
1082                         return ret;
1083                 }
1084         }
1085
1086         if (buf == root->node) {
1087                 WARN_ON(parent && parent != buf);
1088                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1089                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1090                         parent_start = buf->start;
1091
1092                 atomic_inc(&cow->refs);
1093                 ret = tree_mod_log_insert_root(root->node, cow, 1);
1094                 BUG_ON(ret < 0);
1095                 rcu_assign_pointer(root->node, cow);
1096
1097                 btrfs_free_tree_block(trans, root, buf, parent_start,
1098                                       last_ref);
1099                 free_extent_buffer(buf);
1100                 add_root_to_dirty_list(root);
1101         } else {
1102                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1103                 tree_mod_log_insert_key(parent, parent_slot,
1104                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1105                 btrfs_set_node_blockptr(parent, parent_slot,
1106                                         cow->start);
1107                 btrfs_set_node_ptr_generation(parent, parent_slot,
1108                                               trans->transid);
1109                 btrfs_mark_buffer_dirty(parent);
1110                 if (last_ref) {
1111                         ret = tree_mod_log_free_eb(buf);
1112                         if (ret) {
1113                                 btrfs_tree_unlock(cow);
1114                                 free_extent_buffer(cow);
1115                                 btrfs_abort_transaction(trans, ret);
1116                                 return ret;
1117                         }
1118                 }
1119                 btrfs_free_tree_block(trans, root, buf, parent_start,
1120                                       last_ref);
1121         }
1122         if (unlock_orig)
1123                 btrfs_tree_unlock(buf);
1124         free_extent_buffer_stale(buf);
1125         btrfs_mark_buffer_dirty(cow);
1126         *cow_ret = cow;
1127         return 0;
1128 }
1129
1130 /*
1131  * returns the logical address of the oldest predecessor of the given root.
1132  * entries older than time_seq are ignored.
1133  */
1134 static struct tree_mod_elem *__tree_mod_log_oldest_root(
1135                 struct extent_buffer *eb_root, u64 time_seq)
1136 {
1137         struct tree_mod_elem *tm;
1138         struct tree_mod_elem *found = NULL;
1139         u64 root_logical = eb_root->start;
1140         int looped = 0;
1141
1142         if (!time_seq)
1143                 return NULL;
1144
1145         /*
1146          * the very last operation that's logged for a root is the
1147          * replacement operation (if it is replaced at all). this has
1148          * the logical address of the *new* root, making it the very
1149          * first operation that's logged for this root.
1150          */
1151         while (1) {
1152                 tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1153                                                 time_seq);
1154                 if (!looped && !tm)
1155                         return NULL;
1156                 /*
1157                  * if there are no tree operation for the oldest root, we simply
1158                  * return it. this should only happen if that (old) root is at
1159                  * level 0.
1160                  */
1161                 if (!tm)
1162                         break;
1163
1164                 /*
1165                  * if there's an operation that's not a root replacement, we
1166                  * found the oldest version of our root. normally, we'll find a
1167                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1168                  */
1169                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1170                         break;
1171
1172                 found = tm;
1173                 root_logical = tm->old_root.logical;
1174                 looped = 1;
1175         }
1176
1177         /* if there's no old root to return, return what we found instead */
1178         if (!found)
1179                 found = tm;
1180
1181         return found;
1182 }
1183
1184 /*
1185  * tm is a pointer to the first operation to rewind within eb. then, all
1186  * previous operations will be rewound (until we reach something older than
1187  * time_seq).
1188  */
1189 static void
1190 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1191                       u64 time_seq, struct tree_mod_elem *first_tm)
1192 {
1193         u32 n;
1194         struct rb_node *next;
1195         struct tree_mod_elem *tm = first_tm;
1196         unsigned long o_dst;
1197         unsigned long o_src;
1198         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1199
1200         n = btrfs_header_nritems(eb);
1201         read_lock(&fs_info->tree_mod_log_lock);
1202         while (tm && tm->seq >= time_seq) {
1203                 /*
1204                  * all the operations are recorded with the operator used for
1205                  * the modification. as we're going backwards, we do the
1206                  * opposite of each operation here.
1207                  */
1208                 switch (tm->op) {
1209                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1210                         BUG_ON(tm->slot < n);
1211                         fallthrough;
1212                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1213                 case MOD_LOG_KEY_REMOVE:
1214                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1215                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1216                         btrfs_set_node_ptr_generation(eb, tm->slot,
1217                                                       tm->generation);
1218                         n++;
1219                         break;
1220                 case MOD_LOG_KEY_REPLACE:
1221                         BUG_ON(tm->slot >= n);
1222                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1223                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1224                         btrfs_set_node_ptr_generation(eb, tm->slot,
1225                                                       tm->generation);
1226                         break;
1227                 case MOD_LOG_KEY_ADD:
1228                         /* if a move operation is needed it's in the log */
1229                         n--;
1230                         break;
1231                 case MOD_LOG_MOVE_KEYS:
1232                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1233                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1234                         memmove_extent_buffer(eb, o_dst, o_src,
1235                                               tm->move.nr_items * p_size);
1236                         break;
1237                 case MOD_LOG_ROOT_REPLACE:
1238                         /*
1239                          * this operation is special. for roots, this must be
1240                          * handled explicitly before rewinding.
1241                          * for non-roots, this operation may exist if the node
1242                          * was a root: root A -> child B; then A gets empty and
1243                          * B is promoted to the new root. in the mod log, we'll
1244                          * have a root-replace operation for B, a tree block
1245                          * that is no root. we simply ignore that operation.
1246                          */
1247                         break;
1248                 }
1249                 next = rb_next(&tm->node);
1250                 if (!next)
1251                         break;
1252                 tm = rb_entry(next, struct tree_mod_elem, node);
1253                 if (tm->logical != first_tm->logical)
1254                         break;
1255         }
1256         read_unlock(&fs_info->tree_mod_log_lock);
1257         btrfs_set_header_nritems(eb, n);
1258 }
1259
1260 /*
1261  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1262  * is returned. If rewind operations happen, a fresh buffer is returned. The
1263  * returned buffer is always read-locked. If the returned buffer is not the
1264  * input buffer, the lock on the input buffer is released and the input buffer
1265  * is freed (its refcount is decremented).
1266  */
1267 static struct extent_buffer *
1268 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1269                     struct extent_buffer *eb, u64 time_seq)
1270 {
1271         struct extent_buffer *eb_rewin;
1272         struct tree_mod_elem *tm;
1273
1274         if (!time_seq)
1275                 return eb;
1276
1277         if (btrfs_header_level(eb) == 0)
1278                 return eb;
1279
1280         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1281         if (!tm)
1282                 return eb;
1283
1284         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1285                 BUG_ON(tm->slot != 0);
1286                 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1287                 if (!eb_rewin) {
1288                         btrfs_tree_read_unlock(eb);
1289                         free_extent_buffer(eb);
1290                         return NULL;
1291                 }
1292                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1293                 btrfs_set_header_backref_rev(eb_rewin,
1294                                              btrfs_header_backref_rev(eb));
1295                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1296                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1297         } else {
1298                 eb_rewin = btrfs_clone_extent_buffer(eb);
1299                 if (!eb_rewin) {
1300                         btrfs_tree_read_unlock(eb);
1301                         free_extent_buffer(eb);
1302                         return NULL;
1303                 }
1304         }
1305
1306         btrfs_tree_read_unlock(eb);
1307         free_extent_buffer(eb);
1308
1309         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
1310                                        eb_rewin, btrfs_header_level(eb_rewin));
1311         btrfs_tree_read_lock(eb_rewin);
1312         __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1313         WARN_ON(btrfs_header_nritems(eb_rewin) >
1314                 BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1315
1316         return eb_rewin;
1317 }
1318
1319 /*
1320  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1321  * value. If there are no changes, the current root->root_node is returned. If
1322  * anything changed in between, there's a fresh buffer allocated on which the
1323  * rewind operations are done. In any case, the returned buffer is read locked.
1324  * Returns NULL on error (with no locks held).
1325  */
1326 static inline struct extent_buffer *
1327 get_old_root(struct btrfs_root *root, u64 time_seq)
1328 {
1329         struct btrfs_fs_info *fs_info = root->fs_info;
1330         struct tree_mod_elem *tm;
1331         struct extent_buffer *eb = NULL;
1332         struct extent_buffer *eb_root;
1333         u64 eb_root_owner = 0;
1334         struct extent_buffer *old;
1335         struct tree_mod_root *old_root = NULL;
1336         u64 old_generation = 0;
1337         u64 logical;
1338         int level;
1339
1340         eb_root = btrfs_read_lock_root_node(root);
1341         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1342         if (!tm)
1343                 return eb_root;
1344
1345         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1346                 old_root = &tm->old_root;
1347                 old_generation = tm->generation;
1348                 logical = old_root->logical;
1349                 level = old_root->level;
1350         } else {
1351                 logical = eb_root->start;
1352                 level = btrfs_header_level(eb_root);
1353         }
1354
1355         tm = tree_mod_log_search(fs_info, logical, time_seq);
1356         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1357                 btrfs_tree_read_unlock(eb_root);
1358                 free_extent_buffer(eb_root);
1359                 old = read_tree_block(fs_info, logical, root->root_key.objectid,
1360                                       0, level, NULL);
1361                 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1362                         if (!IS_ERR(old))
1363                                 free_extent_buffer(old);
1364                         btrfs_warn(fs_info,
1365                                    "failed to read tree block %llu from get_old_root",
1366                                    logical);
1367                 } else {
1368                         btrfs_tree_read_lock(old);
1369                         eb = btrfs_clone_extent_buffer(old);
1370                         btrfs_tree_read_unlock(old);
1371                         free_extent_buffer(old);
1372                 }
1373         } else if (old_root) {
1374                 eb_root_owner = btrfs_header_owner(eb_root);
1375                 btrfs_tree_read_unlock(eb_root);
1376                 free_extent_buffer(eb_root);
1377                 eb = alloc_dummy_extent_buffer(fs_info, logical);
1378         } else {
1379                 eb = btrfs_clone_extent_buffer(eb_root);
1380                 btrfs_tree_read_unlock(eb_root);
1381                 free_extent_buffer(eb_root);
1382         }
1383
1384         if (!eb)
1385                 return NULL;
1386         if (old_root) {
1387                 btrfs_set_header_bytenr(eb, eb->start);
1388                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1389                 btrfs_set_header_owner(eb, eb_root_owner);
1390                 btrfs_set_header_level(eb, old_root->level);
1391                 btrfs_set_header_generation(eb, old_generation);
1392         }
1393         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
1394                                        btrfs_header_level(eb));
1395         btrfs_tree_read_lock(eb);
1396         if (tm)
1397                 __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1398         else
1399                 WARN_ON(btrfs_header_level(eb) != 0);
1400         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1401
1402         return eb;
1403 }
1404
1405 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1406 {
1407         struct tree_mod_elem *tm;
1408         int level;
1409         struct extent_buffer *eb_root = btrfs_root_node(root);
1410
1411         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1412         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1413                 level = tm->old_root.level;
1414         } else {
1415                 level = btrfs_header_level(eb_root);
1416         }
1417         free_extent_buffer(eb_root);
1418
1419         return level;
1420 }
1421
1422 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1423                                    struct btrfs_root *root,
1424                                    struct extent_buffer *buf)
1425 {
1426         if (btrfs_is_testing(root->fs_info))
1427                 return 0;
1428
1429         /* Ensure we can see the FORCE_COW bit */
1430         smp_mb__before_atomic();
1431
1432         /*
1433          * We do not need to cow a block if
1434          * 1) this block is not created or changed in this transaction;
1435          * 2) this block does not belong to TREE_RELOC tree;
1436          * 3) the root is not forced COW.
1437          *
1438          * What is forced COW:
1439          *    when we create snapshot during committing the transaction,
1440          *    after we've finished copying src root, we must COW the shared
1441          *    block to ensure the metadata consistency.
1442          */
1443         if (btrfs_header_generation(buf) == trans->transid &&
1444             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1445             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1446               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1447             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1448                 return 0;
1449         return 1;
1450 }
1451
1452 /*
1453  * cows a single block, see __btrfs_cow_block for the real work.
1454  * This version of it has extra checks so that a block isn't COWed more than
1455  * once per transaction, as long as it hasn't been written yet
1456  */
1457 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1458                     struct btrfs_root *root, struct extent_buffer *buf,
1459                     struct extent_buffer *parent, int parent_slot,
1460                     struct extent_buffer **cow_ret,
1461                     enum btrfs_lock_nesting nest)
1462 {
1463         struct btrfs_fs_info *fs_info = root->fs_info;
1464         u64 search_start;
1465         int ret;
1466
1467         if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1468                 btrfs_err(fs_info,
1469                         "COW'ing blocks on a fs root that's being dropped");
1470
1471         if (trans->transaction != fs_info->running_transaction)
1472                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1473                        trans->transid,
1474                        fs_info->running_transaction->transid);
1475
1476         if (trans->transid != fs_info->generation)
1477                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1478                        trans->transid, fs_info->generation);
1479
1480         if (!should_cow_block(trans, root, buf)) {
1481                 trans->dirty = true;
1482                 *cow_ret = buf;
1483                 return 0;
1484         }
1485
1486         search_start = buf->start & ~((u64)SZ_1G - 1);
1487
1488         /*
1489          * Before CoWing this block for later modification, check if it's
1490          * the subtree root and do the delayed subtree trace if needed.
1491          *
1492          * Also We don't care about the error, as it's handled internally.
1493          */
1494         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1495         ret = __btrfs_cow_block(trans, root, buf, parent,
1496                                  parent_slot, cow_ret, search_start, 0, nest);
1497
1498         trace_btrfs_cow_block(root, buf, *cow_ret);
1499
1500         return ret;
1501 }
1502 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
1503
1504 /*
1505  * helper function for defrag to decide if two blocks pointed to by a
1506  * node are actually close by
1507  */
1508 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1509 {
1510         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1511                 return 1;
1512         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1513                 return 1;
1514         return 0;
1515 }
1516
1517 #ifdef __LITTLE_ENDIAN
1518
1519 /*
1520  * Compare two keys, on little-endian the disk order is same as CPU order and
1521  * we can avoid the conversion.
1522  */
1523 static int comp_keys(const struct btrfs_disk_key *disk_key,
1524                      const struct btrfs_key *k2)
1525 {
1526         const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
1527
1528         return btrfs_comp_cpu_keys(k1, k2);
1529 }
1530
1531 #else
1532
1533 /*
1534  * compare two keys in a memcmp fashion
1535  */
1536 static int comp_keys(const struct btrfs_disk_key *disk,
1537                      const struct btrfs_key *k2)
1538 {
1539         struct btrfs_key k1;
1540
1541         btrfs_disk_key_to_cpu(&k1, disk);
1542
1543         return btrfs_comp_cpu_keys(&k1, k2);
1544 }
1545 #endif
1546
1547 /*
1548  * same as comp_keys only with two btrfs_key's
1549  */
1550 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1551 {
1552         if (k1->objectid > k2->objectid)
1553                 return 1;
1554         if (k1->objectid < k2->objectid)
1555                 return -1;
1556         if (k1->type > k2->type)
1557                 return 1;
1558         if (k1->type < k2->type)
1559                 return -1;
1560         if (k1->offset > k2->offset)
1561                 return 1;
1562         if (k1->offset < k2->offset)
1563                 return -1;
1564         return 0;
1565 }
1566
1567 /*
1568  * this is used by the defrag code to go through all the
1569  * leaves pointed to by a node and reallocate them so that
1570  * disk order is close to key order
1571  */
1572 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1573                        struct btrfs_root *root, struct extent_buffer *parent,
1574                        int start_slot, u64 *last_ret,
1575                        struct btrfs_key *progress)
1576 {
1577         struct btrfs_fs_info *fs_info = root->fs_info;
1578         struct extent_buffer *cur;
1579         u64 blocknr;
1580         u64 search_start = *last_ret;
1581         u64 last_block = 0;
1582         u64 other;
1583         u32 parent_nritems;
1584         int end_slot;
1585         int i;
1586         int err = 0;
1587         u32 blocksize;
1588         int progress_passed = 0;
1589         struct btrfs_disk_key disk_key;
1590
1591         WARN_ON(trans->transaction != fs_info->running_transaction);
1592         WARN_ON(trans->transid != fs_info->generation);
1593
1594         parent_nritems = btrfs_header_nritems(parent);
1595         blocksize = fs_info->nodesize;
1596         end_slot = parent_nritems - 1;
1597
1598         if (parent_nritems <= 1)
1599                 return 0;
1600
1601         for (i = start_slot; i <= end_slot; i++) {
1602                 int close = 1;
1603
1604                 btrfs_node_key(parent, &disk_key, i);
1605                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1606                         continue;
1607
1608                 progress_passed = 1;
1609                 blocknr = btrfs_node_blockptr(parent, i);
1610                 if (last_block == 0)
1611                         last_block = blocknr;
1612
1613                 if (i > 0) {
1614                         other = btrfs_node_blockptr(parent, i - 1);
1615                         close = close_blocks(blocknr, other, blocksize);
1616                 }
1617                 if (!close && i < end_slot) {
1618                         other = btrfs_node_blockptr(parent, i + 1);
1619                         close = close_blocks(blocknr, other, blocksize);
1620                 }
1621                 if (close) {
1622                         last_block = blocknr;
1623                         continue;
1624                 }
1625
1626                 cur = btrfs_read_node_slot(parent, i);
1627                 if (IS_ERR(cur))
1628                         return PTR_ERR(cur);
1629                 if (search_start == 0)
1630                         search_start = last_block;
1631
1632                 btrfs_tree_lock(cur);
1633                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1634                                         &cur, search_start,
1635                                         min(16 * blocksize,
1636                                             (end_slot - i) * blocksize),
1637                                         BTRFS_NESTING_COW);
1638                 if (err) {
1639                         btrfs_tree_unlock(cur);
1640                         free_extent_buffer(cur);
1641                         break;
1642                 }
1643                 search_start = cur->start;
1644                 last_block = cur->start;
1645                 *last_ret = search_start;
1646                 btrfs_tree_unlock(cur);
1647                 free_extent_buffer(cur);
1648         }
1649         return err;
1650 }
1651
1652 /*
1653  * search for key in the extent_buffer.  The items start at offset p,
1654  * and they are item_size apart.  There are 'max' items in p.
1655  *
1656  * the slot in the array is returned via slot, and it points to
1657  * the place where you would insert key if it is not found in
1658  * the array.
1659  *
1660  * slot may point to max if the key is bigger than all of the keys
1661  */
1662 static noinline int generic_bin_search(struct extent_buffer *eb,
1663                                        unsigned long p, int item_size,
1664                                        const struct btrfs_key *key,
1665                                        int max, int *slot)
1666 {
1667         int low = 0;
1668         int high = max;
1669         int ret;
1670         const int key_size = sizeof(struct btrfs_disk_key);
1671
1672         if (low > high) {
1673                 btrfs_err(eb->fs_info,
1674                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1675                           __func__, low, high, eb->start,
1676                           btrfs_header_owner(eb), btrfs_header_level(eb));
1677                 return -EINVAL;
1678         }
1679
1680         while (low < high) {
1681                 unsigned long oip;
1682                 unsigned long offset;
1683                 struct btrfs_disk_key *tmp;
1684                 struct btrfs_disk_key unaligned;
1685                 int mid;
1686
1687                 mid = (low + high) / 2;
1688                 offset = p + mid * item_size;
1689                 oip = offset_in_page(offset);
1690
1691                 if (oip + key_size <= PAGE_SIZE) {
1692                         const unsigned long idx = get_eb_page_index(offset);
1693                         char *kaddr = page_address(eb->pages[idx]);
1694
1695                         oip = get_eb_offset_in_page(eb, offset);
1696                         tmp = (struct btrfs_disk_key *)(kaddr + oip);
1697                 } else {
1698                         read_extent_buffer(eb, &unaligned, offset, key_size);
1699                         tmp = &unaligned;
1700                 }
1701
1702                 ret = comp_keys(tmp, key);
1703
1704                 if (ret < 0)
1705                         low = mid + 1;
1706                 else if (ret > 0)
1707                         high = mid;
1708                 else {
1709                         *slot = mid;
1710                         return 0;
1711                 }
1712         }
1713         *slot = low;
1714         return 1;
1715 }
1716
1717 /*
1718  * simple bin_search frontend that does the right thing for
1719  * leaves vs nodes
1720  */
1721 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1722                      int *slot)
1723 {
1724         if (btrfs_header_level(eb) == 0)
1725                 return generic_bin_search(eb,
1726                                           offsetof(struct btrfs_leaf, items),
1727                                           sizeof(struct btrfs_item),
1728                                           key, btrfs_header_nritems(eb),
1729                                           slot);
1730         else
1731                 return generic_bin_search(eb,
1732                                           offsetof(struct btrfs_node, ptrs),
1733                                           sizeof(struct btrfs_key_ptr),
1734                                           key, btrfs_header_nritems(eb),
1735                                           slot);
1736 }
1737
1738 static void root_add_used(struct btrfs_root *root, u32 size)
1739 {
1740         spin_lock(&root->accounting_lock);
1741         btrfs_set_root_used(&root->root_item,
1742                             btrfs_root_used(&root->root_item) + size);
1743         spin_unlock(&root->accounting_lock);
1744 }
1745
1746 static void root_sub_used(struct btrfs_root *root, u32 size)
1747 {
1748         spin_lock(&root->accounting_lock);
1749         btrfs_set_root_used(&root->root_item,
1750                             btrfs_root_used(&root->root_item) - size);
1751         spin_unlock(&root->accounting_lock);
1752 }
1753
1754 /* given a node and slot number, this reads the blocks it points to.  The
1755  * extent buffer is returned with a reference taken (but unlocked).
1756  */
1757 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1758                                            int slot)
1759 {
1760         int level = btrfs_header_level(parent);
1761         struct extent_buffer *eb;
1762         struct btrfs_key first_key;
1763
1764         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1765                 return ERR_PTR(-ENOENT);
1766
1767         BUG_ON(level == 0);
1768
1769         btrfs_node_key_to_cpu(parent, &first_key, slot);
1770         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1771                              btrfs_header_owner(parent),
1772                              btrfs_node_ptr_generation(parent, slot),
1773                              level - 1, &first_key);
1774         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1775                 free_extent_buffer(eb);
1776                 eb = ERR_PTR(-EIO);
1777         }
1778
1779         return eb;
1780 }
1781
1782 /*
1783  * node level balancing, used to make sure nodes are in proper order for
1784  * item deletion.  We balance from the top down, so we have to make sure
1785  * that a deletion won't leave an node completely empty later on.
1786  */
1787 static noinline int balance_level(struct btrfs_trans_handle *trans,
1788                          struct btrfs_root *root,
1789                          struct btrfs_path *path, int level)
1790 {
1791         struct btrfs_fs_info *fs_info = root->fs_info;
1792         struct extent_buffer *right = NULL;
1793         struct extent_buffer *mid;
1794         struct extent_buffer *left = NULL;
1795         struct extent_buffer *parent = NULL;
1796         int ret = 0;
1797         int wret;
1798         int pslot;
1799         int orig_slot = path->slots[level];
1800         u64 orig_ptr;
1801
1802         ASSERT(level > 0);
1803
1804         mid = path->nodes[level];
1805
1806         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
1807         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1808
1809         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1810
1811         if (level < BTRFS_MAX_LEVEL - 1) {
1812                 parent = path->nodes[level + 1];
1813                 pslot = path->slots[level + 1];
1814         }
1815
1816         /*
1817          * deal with the case where there is only one pointer in the root
1818          * by promoting the node below to a root
1819          */
1820         if (!parent) {
1821                 struct extent_buffer *child;
1822
1823                 if (btrfs_header_nritems(mid) != 1)
1824                         return 0;
1825
1826                 /* promote the child to a root */
1827                 child = btrfs_read_node_slot(mid, 0);
1828                 if (IS_ERR(child)) {
1829                         ret = PTR_ERR(child);
1830                         btrfs_handle_fs_error(fs_info, ret, NULL);
1831                         goto enospc;
1832                 }
1833
1834                 btrfs_tree_lock(child);
1835                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1836                                       BTRFS_NESTING_COW);
1837                 if (ret) {
1838                         btrfs_tree_unlock(child);
1839                         free_extent_buffer(child);
1840                         goto enospc;
1841                 }
1842
1843                 ret = tree_mod_log_insert_root(root->node, child, 1);
1844                 BUG_ON(ret < 0);
1845                 rcu_assign_pointer(root->node, child);
1846
1847                 add_root_to_dirty_list(root);
1848                 btrfs_tree_unlock(child);
1849
1850                 path->locks[level] = 0;
1851                 path->nodes[level] = NULL;
1852                 btrfs_clean_tree_block(mid);
1853                 btrfs_tree_unlock(mid);
1854                 /* once for the path */
1855                 free_extent_buffer(mid);
1856
1857                 root_sub_used(root, mid->len);
1858                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1859                 /* once for the root ptr */
1860                 free_extent_buffer_stale(mid);
1861                 return 0;
1862         }
1863         if (btrfs_header_nritems(mid) >
1864             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1865                 return 0;
1866
1867         left = btrfs_read_node_slot(parent, pslot - 1);
1868         if (IS_ERR(left))
1869                 left = NULL;
1870
1871         if (left) {
1872                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1873                 wret = btrfs_cow_block(trans, root, left,
1874                                        parent, pslot - 1, &left,
1875                                        BTRFS_NESTING_LEFT_COW);
1876                 if (wret) {
1877                         ret = wret;
1878                         goto enospc;
1879                 }
1880         }
1881
1882         right = btrfs_read_node_slot(parent, pslot + 1);
1883         if (IS_ERR(right))
1884                 right = NULL;
1885
1886         if (right) {
1887                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1888                 wret = btrfs_cow_block(trans, root, right,
1889                                        parent, pslot + 1, &right,
1890                                        BTRFS_NESTING_RIGHT_COW);
1891                 if (wret) {
1892                         ret = wret;
1893                         goto enospc;
1894                 }
1895         }
1896
1897         /* first, try to make some room in the middle buffer */
1898         if (left) {
1899                 orig_slot += btrfs_header_nritems(left);
1900                 wret = push_node_left(trans, left, mid, 1);
1901                 if (wret < 0)
1902                         ret = wret;
1903         }
1904
1905         /*
1906          * then try to empty the right most buffer into the middle
1907          */
1908         if (right) {
1909                 wret = push_node_left(trans, mid, right, 1);
1910                 if (wret < 0 && wret != -ENOSPC)
1911                         ret = wret;
1912                 if (btrfs_header_nritems(right) == 0) {
1913                         btrfs_clean_tree_block(right);
1914                         btrfs_tree_unlock(right);
1915                         del_ptr(root, path, level + 1, pslot + 1);
1916                         root_sub_used(root, right->len);
1917                         btrfs_free_tree_block(trans, root, right, 0, 1);
1918                         free_extent_buffer_stale(right);
1919                         right = NULL;
1920                 } else {
1921                         struct btrfs_disk_key right_key;
1922                         btrfs_node_key(right, &right_key, 0);
1923                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1924                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1925                         BUG_ON(ret < 0);
1926                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1927                         btrfs_mark_buffer_dirty(parent);
1928                 }
1929         }
1930         if (btrfs_header_nritems(mid) == 1) {
1931                 /*
1932                  * we're not allowed to leave a node with one item in the
1933                  * tree during a delete.  A deletion from lower in the tree
1934                  * could try to delete the only pointer in this node.
1935                  * So, pull some keys from the left.
1936                  * There has to be a left pointer at this point because
1937                  * otherwise we would have pulled some pointers from the
1938                  * right
1939                  */
1940                 if (!left) {
1941                         ret = -EROFS;
1942                         btrfs_handle_fs_error(fs_info, ret, NULL);
1943                         goto enospc;
1944                 }
1945                 wret = balance_node_right(trans, mid, left);
1946                 if (wret < 0) {
1947                         ret = wret;
1948                         goto enospc;
1949                 }
1950                 if (wret == 1) {
1951                         wret = push_node_left(trans, left, mid, 1);
1952                         if (wret < 0)
1953                                 ret = wret;
1954                 }
1955                 BUG_ON(wret == 1);
1956         }
1957         if (btrfs_header_nritems(mid) == 0) {
1958                 btrfs_clean_tree_block(mid);
1959                 btrfs_tree_unlock(mid);
1960                 del_ptr(root, path, level + 1, pslot);
1961                 root_sub_used(root, mid->len);
1962                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1963                 free_extent_buffer_stale(mid);
1964                 mid = NULL;
1965         } else {
1966                 /* update the parent key to reflect our changes */
1967                 struct btrfs_disk_key mid_key;
1968                 btrfs_node_key(mid, &mid_key, 0);
1969                 ret = tree_mod_log_insert_key(parent, pslot,
1970                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1971                 BUG_ON(ret < 0);
1972                 btrfs_set_node_key(parent, &mid_key, pslot);
1973                 btrfs_mark_buffer_dirty(parent);
1974         }
1975
1976         /* update the path */
1977         if (left) {
1978                 if (btrfs_header_nritems(left) > orig_slot) {
1979                         atomic_inc(&left->refs);
1980                         /* left was locked after cow */
1981                         path->nodes[level] = left;
1982                         path->slots[level + 1] -= 1;
1983                         path->slots[level] = orig_slot;
1984                         if (mid) {
1985                                 btrfs_tree_unlock(mid);
1986                                 free_extent_buffer(mid);
1987                         }
1988                 } else {
1989                         orig_slot -= btrfs_header_nritems(left);
1990                         path->slots[level] = orig_slot;
1991                 }
1992         }
1993         /* double check we haven't messed things up */
1994         if (orig_ptr !=
1995             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1996                 BUG();
1997 enospc:
1998         if (right) {
1999                 btrfs_tree_unlock(right);
2000                 free_extent_buffer(right);
2001         }
2002         if (left) {
2003                 if (path->nodes[level] != left)
2004                         btrfs_tree_unlock(left);
2005                 free_extent_buffer(left);
2006         }
2007         return ret;
2008 }
2009
2010 /* Node balancing for insertion.  Here we only split or push nodes around
2011  * when they are completely full.  This is also done top down, so we
2012  * have to be pessimistic.
2013  */
2014 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2015                                           struct btrfs_root *root,
2016                                           struct btrfs_path *path, int level)
2017 {
2018         struct btrfs_fs_info *fs_info = root->fs_info;
2019         struct extent_buffer *right = NULL;
2020         struct extent_buffer *mid;
2021         struct extent_buffer *left = NULL;
2022         struct extent_buffer *parent = NULL;
2023         int ret = 0;
2024         int wret;
2025         int pslot;
2026         int orig_slot = path->slots[level];
2027
2028         if (level == 0)
2029                 return 1;
2030
2031         mid = path->nodes[level];
2032         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2033
2034         if (level < BTRFS_MAX_LEVEL - 1) {
2035                 parent = path->nodes[level + 1];
2036                 pslot = path->slots[level + 1];
2037         }
2038
2039         if (!parent)
2040                 return 1;
2041
2042         left = btrfs_read_node_slot(parent, pslot - 1);
2043         if (IS_ERR(left))
2044                 left = NULL;
2045
2046         /* first, try to make some room in the middle buffer */
2047         if (left) {
2048                 u32 left_nr;
2049
2050                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
2051
2052                 left_nr = btrfs_header_nritems(left);
2053                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2054                         wret = 1;
2055                 } else {
2056                         ret = btrfs_cow_block(trans, root, left, parent,
2057                                               pslot - 1, &left,
2058                                               BTRFS_NESTING_LEFT_COW);
2059                         if (ret)
2060                                 wret = 1;
2061                         else {
2062                                 wret = push_node_left(trans, left, mid, 0);
2063                         }
2064                 }
2065                 if (wret < 0)
2066                         ret = wret;
2067                 if (wret == 0) {
2068                         struct btrfs_disk_key disk_key;
2069                         orig_slot += left_nr;
2070                         btrfs_node_key(mid, &disk_key, 0);
2071                         ret = tree_mod_log_insert_key(parent, pslot,
2072                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2073                         BUG_ON(ret < 0);
2074                         btrfs_set_node_key(parent, &disk_key, pslot);
2075                         btrfs_mark_buffer_dirty(parent);
2076                         if (btrfs_header_nritems(left) > orig_slot) {
2077                                 path->nodes[level] = left;
2078                                 path->slots[level + 1] -= 1;
2079                                 path->slots[level] = orig_slot;
2080                                 btrfs_tree_unlock(mid);
2081                                 free_extent_buffer(mid);
2082                         } else {
2083                                 orig_slot -=
2084                                         btrfs_header_nritems(left);
2085                                 path->slots[level] = orig_slot;
2086                                 btrfs_tree_unlock(left);
2087                                 free_extent_buffer(left);
2088                         }
2089                         return 0;
2090                 }
2091                 btrfs_tree_unlock(left);
2092                 free_extent_buffer(left);
2093         }
2094         right = btrfs_read_node_slot(parent, pslot + 1);
2095         if (IS_ERR(right))
2096                 right = NULL;
2097
2098         /*
2099          * then try to empty the right most buffer into the middle
2100          */
2101         if (right) {
2102                 u32 right_nr;
2103
2104                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2105
2106                 right_nr = btrfs_header_nritems(right);
2107                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2108                         wret = 1;
2109                 } else {
2110                         ret = btrfs_cow_block(trans, root, right,
2111                                               parent, pslot + 1,
2112                                               &right, BTRFS_NESTING_RIGHT_COW);
2113                         if (ret)
2114                                 wret = 1;
2115                         else {
2116                                 wret = balance_node_right(trans, right, mid);
2117                         }
2118                 }
2119                 if (wret < 0)
2120                         ret = wret;
2121                 if (wret == 0) {
2122                         struct btrfs_disk_key disk_key;
2123
2124                         btrfs_node_key(right, &disk_key, 0);
2125                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2126                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2127                         BUG_ON(ret < 0);
2128                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2129                         btrfs_mark_buffer_dirty(parent);
2130
2131                         if (btrfs_header_nritems(mid) <= orig_slot) {
2132                                 path->nodes[level] = right;
2133                                 path->slots[level + 1] += 1;
2134                                 path->slots[level] = orig_slot -
2135                                         btrfs_header_nritems(mid);
2136                                 btrfs_tree_unlock(mid);
2137                                 free_extent_buffer(mid);
2138                         } else {
2139                                 btrfs_tree_unlock(right);
2140                                 free_extent_buffer(right);
2141                         }
2142                         return 0;
2143                 }
2144                 btrfs_tree_unlock(right);
2145                 free_extent_buffer(right);
2146         }
2147         return 1;
2148 }
2149
2150 /*
2151  * readahead one full node of leaves, finding things that are close
2152  * to the block in 'slot', and triggering ra on them.
2153  */
2154 static void reada_for_search(struct btrfs_fs_info *fs_info,
2155                              struct btrfs_path *path,
2156                              int level, int slot, u64 objectid)
2157 {
2158         struct extent_buffer *node;
2159         struct btrfs_disk_key disk_key;
2160         u32 nritems;
2161         u64 search;
2162         u64 target;
2163         u64 nread = 0;
2164         struct extent_buffer *eb;
2165         u32 nr;
2166         u32 blocksize;
2167         u32 nscan = 0;
2168
2169         if (level != 1)
2170                 return;
2171
2172         if (!path->nodes[level])
2173                 return;
2174
2175         node = path->nodes[level];
2176
2177         search = btrfs_node_blockptr(node, slot);
2178         blocksize = fs_info->nodesize;
2179         eb = find_extent_buffer(fs_info, search);
2180         if (eb) {
2181                 free_extent_buffer(eb);
2182                 return;
2183         }
2184
2185         target = search;
2186
2187         nritems = btrfs_header_nritems(node);
2188         nr = slot;
2189
2190         while (1) {
2191                 if (path->reada == READA_BACK) {
2192                         if (nr == 0)
2193                                 break;
2194                         nr--;
2195                 } else if (path->reada == READA_FORWARD) {
2196                         nr++;
2197                         if (nr >= nritems)
2198                                 break;
2199                 }
2200                 if (path->reada == READA_BACK && objectid) {
2201                         btrfs_node_key(node, &disk_key, nr);
2202                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2203                                 break;
2204                 }
2205                 search = btrfs_node_blockptr(node, nr);
2206                 if ((search <= target && target - search <= 65536) ||
2207                     (search > target && search - target <= 65536)) {
2208                         btrfs_readahead_node_child(node, nr);
2209                         nread += blocksize;
2210                 }
2211                 nscan++;
2212                 if ((nread > 65536 || nscan > 32))
2213                         break;
2214         }
2215 }
2216
2217 static noinline void reada_for_balance(struct btrfs_path *path, int level)
2218 {
2219         struct extent_buffer *parent;
2220         int slot;
2221         int nritems;
2222
2223         parent = path->nodes[level + 1];
2224         if (!parent)
2225                 return;
2226
2227         nritems = btrfs_header_nritems(parent);
2228         slot = path->slots[level + 1];
2229
2230         if (slot > 0)
2231                 btrfs_readahead_node_child(parent, slot - 1);
2232         if (slot + 1 < nritems)
2233                 btrfs_readahead_node_child(parent, slot + 1);
2234 }
2235
2236
2237 /*
2238  * when we walk down the tree, it is usually safe to unlock the higher layers
2239  * in the tree.  The exceptions are when our path goes through slot 0, because
2240  * operations on the tree might require changing key pointers higher up in the
2241  * tree.
2242  *
2243  * callers might also have set path->keep_locks, which tells this code to keep
2244  * the lock if the path points to the last slot in the block.  This is part of
2245  * walking through the tree, and selecting the next slot in the higher block.
2246  *
2247  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2248  * if lowest_unlock is 1, level 0 won't be unlocked
2249  */
2250 static noinline void unlock_up(struct btrfs_path *path, int level,
2251                                int lowest_unlock, int min_write_lock_level,
2252                                int *write_lock_level)
2253 {
2254         int i;
2255         int skip_level = level;
2256         int no_skips = 0;
2257         struct extent_buffer *t;
2258
2259         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2260                 if (!path->nodes[i])
2261                         break;
2262                 if (!path->locks[i])
2263                         break;
2264                 if (!no_skips && path->slots[i] == 0) {
2265                         skip_level = i + 1;
2266                         continue;
2267                 }
2268                 if (!no_skips && path->keep_locks) {
2269                         u32 nritems;
2270                         t = path->nodes[i];
2271                         nritems = btrfs_header_nritems(t);
2272                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2273                                 skip_level = i + 1;
2274                                 continue;
2275                         }
2276                 }
2277                 if (skip_level < i && i >= lowest_unlock)
2278                         no_skips = 1;
2279
2280                 t = path->nodes[i];
2281                 if (i >= lowest_unlock && i > skip_level) {
2282                         btrfs_tree_unlock_rw(t, path->locks[i]);
2283                         path->locks[i] = 0;
2284                         if (write_lock_level &&
2285                             i > min_write_lock_level &&
2286                             i <= *write_lock_level) {
2287                                 *write_lock_level = i - 1;
2288                         }
2289                 }
2290         }
2291 }
2292
2293 /*
2294  * helper function for btrfs_search_slot.  The goal is to find a block
2295  * in cache without setting the path to blocking.  If we find the block
2296  * we return zero and the path is unchanged.
2297  *
2298  * If we can't find the block, we set the path blocking and do some
2299  * reada.  -EAGAIN is returned and the search must be repeated.
2300  */
2301 static int
2302 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2303                       struct extent_buffer **eb_ret, int level, int slot,
2304                       const struct btrfs_key *key)
2305 {
2306         struct btrfs_fs_info *fs_info = root->fs_info;
2307         u64 blocknr;
2308         u64 gen;
2309         struct extent_buffer *tmp;
2310         struct btrfs_key first_key;
2311         int ret;
2312         int parent_level;
2313
2314         blocknr = btrfs_node_blockptr(*eb_ret, slot);
2315         gen = btrfs_node_ptr_generation(*eb_ret, slot);
2316         parent_level = btrfs_header_level(*eb_ret);
2317         btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2318
2319         tmp = find_extent_buffer(fs_info, blocknr);
2320         if (tmp) {
2321                 /* first we do an atomic uptodate check */
2322                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2323                         /*
2324                          * Do extra check for first_key, eb can be stale due to
2325                          * being cached, read from scrub, or have multiple
2326                          * parents (shared tree blocks).
2327                          */
2328                         if (btrfs_verify_level_key(tmp,
2329                                         parent_level - 1, &first_key, gen)) {
2330                                 free_extent_buffer(tmp);
2331                                 return -EUCLEAN;
2332                         }
2333                         *eb_ret = tmp;
2334                         return 0;
2335                 }
2336
2337                 /* now we're allowed to do a blocking uptodate check */
2338                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2339                 if (!ret) {
2340                         *eb_ret = tmp;
2341                         return 0;
2342                 }
2343                 free_extent_buffer(tmp);
2344                 btrfs_release_path(p);
2345                 return -EIO;
2346         }
2347
2348         /*
2349          * reduce lock contention at high levels
2350          * of the btree by dropping locks before
2351          * we read.  Don't release the lock on the current
2352          * level because we need to walk this node to figure
2353          * out which blocks to read.
2354          */
2355         btrfs_unlock_up_safe(p, level + 1);
2356
2357         if (p->reada != READA_NONE)
2358                 reada_for_search(fs_info, p, level, slot, key->objectid);
2359
2360         ret = -EAGAIN;
2361         tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
2362                               gen, parent_level - 1, &first_key);
2363         if (!IS_ERR(tmp)) {
2364                 /*
2365                  * If the read above didn't mark this buffer up to date,
2366                  * it will never end up being up to date.  Set ret to EIO now
2367                  * and give up so that our caller doesn't loop forever
2368                  * on our EAGAINs.
2369                  */
2370                 if (!extent_buffer_uptodate(tmp))
2371                         ret = -EIO;
2372                 free_extent_buffer(tmp);
2373         } else {
2374                 ret = PTR_ERR(tmp);
2375         }
2376
2377         btrfs_release_path(p);
2378         return ret;
2379 }
2380
2381 /*
2382  * helper function for btrfs_search_slot.  This does all of the checks
2383  * for node-level blocks and does any balancing required based on
2384  * the ins_len.
2385  *
2386  * If no extra work was required, zero is returned.  If we had to
2387  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2388  * start over
2389  */
2390 static int
2391 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2392                        struct btrfs_root *root, struct btrfs_path *p,
2393                        struct extent_buffer *b, int level, int ins_len,
2394                        int *write_lock_level)
2395 {
2396         struct btrfs_fs_info *fs_info = root->fs_info;
2397         int ret = 0;
2398
2399         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2400             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2401
2402                 if (*write_lock_level < level + 1) {
2403                         *write_lock_level = level + 1;
2404                         btrfs_release_path(p);
2405                         return -EAGAIN;
2406                 }
2407
2408                 reada_for_balance(p, level);
2409                 ret = split_node(trans, root, p, level);
2410
2411                 b = p->nodes[level];
2412         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2413                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2414
2415                 if (*write_lock_level < level + 1) {
2416                         *write_lock_level = level + 1;
2417                         btrfs_release_path(p);
2418                         return -EAGAIN;
2419                 }
2420
2421                 reada_for_balance(p, level);
2422                 ret = balance_level(trans, root, p, level);
2423                 if (ret)
2424                         return ret;
2425
2426                 b = p->nodes[level];
2427                 if (!b) {
2428                         btrfs_release_path(p);
2429                         return -EAGAIN;
2430                 }
2431                 BUG_ON(btrfs_header_nritems(b) == 1);
2432         }
2433         return ret;
2434 }
2435
2436 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2437                 u64 iobjectid, u64 ioff, u8 key_type,
2438                 struct btrfs_key *found_key)
2439 {
2440         int ret;
2441         struct btrfs_key key;
2442         struct extent_buffer *eb;
2443
2444         ASSERT(path);
2445         ASSERT(found_key);
2446
2447         key.type = key_type;
2448         key.objectid = iobjectid;
2449         key.offset = ioff;
2450
2451         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2452         if (ret < 0)
2453                 return ret;
2454
2455         eb = path->nodes[0];
2456         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2457                 ret = btrfs_next_leaf(fs_root, path);
2458                 if (ret)
2459                         return ret;
2460                 eb = path->nodes[0];
2461         }
2462
2463         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2464         if (found_key->type != key.type ||
2465                         found_key->objectid != key.objectid)
2466                 return 1;
2467
2468         return 0;
2469 }
2470
2471 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2472                                                         struct btrfs_path *p,
2473                                                         int write_lock_level)
2474 {
2475         struct btrfs_fs_info *fs_info = root->fs_info;
2476         struct extent_buffer *b;
2477         int root_lock;
2478         int level = 0;
2479
2480         /* We try very hard to do read locks on the root */
2481         root_lock = BTRFS_READ_LOCK;
2482
2483         if (p->search_commit_root) {
2484                 /*
2485                  * The commit roots are read only so we always do read locks,
2486                  * and we always must hold the commit_root_sem when doing
2487                  * searches on them, the only exception is send where we don't
2488                  * want to block transaction commits for a long time, so
2489                  * we need to clone the commit root in order to avoid races
2490                  * with transaction commits that create a snapshot of one of
2491                  * the roots used by a send operation.
2492                  */
2493                 if (p->need_commit_sem) {
2494                         down_read(&fs_info->commit_root_sem);
2495                         b = btrfs_clone_extent_buffer(root->commit_root);
2496                         up_read(&fs_info->commit_root_sem);
2497                         if (!b)
2498                                 return ERR_PTR(-ENOMEM);
2499
2500                 } else {
2501                         b = root->commit_root;
2502                         atomic_inc(&b->refs);
2503                 }
2504                 level = btrfs_header_level(b);
2505                 /*
2506                  * Ensure that all callers have set skip_locking when
2507                  * p->search_commit_root = 1.
2508                  */
2509                 ASSERT(p->skip_locking == 1);
2510
2511                 goto out;
2512         }
2513
2514         if (p->skip_locking) {
2515                 b = btrfs_root_node(root);
2516                 level = btrfs_header_level(b);
2517                 goto out;
2518         }
2519
2520         /*
2521          * If the level is set to maximum, we can skip trying to get the read
2522          * lock.
2523          */
2524         if (write_lock_level < BTRFS_MAX_LEVEL) {
2525                 /*
2526                  * We don't know the level of the root node until we actually
2527                  * have it read locked
2528                  */
2529                 b = btrfs_read_lock_root_node(root);
2530                 level = btrfs_header_level(b);
2531                 if (level > write_lock_level)
2532                         goto out;
2533
2534                 /* Whoops, must trade for write lock */
2535                 btrfs_tree_read_unlock(b);
2536                 free_extent_buffer(b);
2537         }
2538
2539         b = btrfs_lock_root_node(root);
2540         root_lock = BTRFS_WRITE_LOCK;
2541
2542         /* The level might have changed, check again */
2543         level = btrfs_header_level(b);
2544
2545 out:
2546         p->nodes[level] = b;
2547         if (!p->skip_locking)
2548                 p->locks[level] = root_lock;
2549         /*
2550          * Callers are responsible for dropping b's references.
2551          */
2552         return b;
2553 }
2554
2555
2556 /*
2557  * btrfs_search_slot - look for a key in a tree and perform necessary
2558  * modifications to preserve tree invariants.
2559  *
2560  * @trans:      Handle of transaction, used when modifying the tree
2561  * @p:          Holds all btree nodes along the search path
2562  * @root:       The root node of the tree
2563  * @key:        The key we are looking for
2564  * @ins_len:    Indicates purpose of search:
2565  *              >0  for inserts it's size of item inserted (*)
2566  *              <0  for deletions
2567  *               0  for plain searches, not modifying the tree
2568  *
2569  *              (*) If size of item inserted doesn't include
2570  *              sizeof(struct btrfs_item), then p->search_for_extension must
2571  *              be set.
2572  * @cow:        boolean should CoW operations be performed. Must always be 1
2573  *              when modifying the tree.
2574  *
2575  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2576  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2577  *
2578  * If @key is found, 0 is returned and you can find the item in the leaf level
2579  * of the path (level 0)
2580  *
2581  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2582  * points to the slot where it should be inserted
2583  *
2584  * If an error is encountered while searching the tree a negative error number
2585  * is returned
2586  */
2587 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2588                       const struct btrfs_key *key, struct btrfs_path *p,
2589                       int ins_len, int cow)
2590 {
2591         struct extent_buffer *b;
2592         int slot;
2593         int ret;
2594         int err;
2595         int level;
2596         int lowest_unlock = 1;
2597         /* everything at write_lock_level or lower must be write locked */
2598         int write_lock_level = 0;
2599         u8 lowest_level = 0;
2600         int min_write_lock_level;
2601         int prev_cmp;
2602
2603         lowest_level = p->lowest_level;
2604         WARN_ON(lowest_level && ins_len > 0);
2605         WARN_ON(p->nodes[0] != NULL);
2606         BUG_ON(!cow && ins_len);
2607
2608         if (ins_len < 0) {
2609                 lowest_unlock = 2;
2610
2611                 /* when we are removing items, we might have to go up to level
2612                  * two as we update tree pointers  Make sure we keep write
2613                  * for those levels as well
2614                  */
2615                 write_lock_level = 2;
2616         } else if (ins_len > 0) {
2617                 /*
2618                  * for inserting items, make sure we have a write lock on
2619                  * level 1 so we can update keys
2620                  */
2621                 write_lock_level = 1;
2622         }
2623
2624         if (!cow)
2625                 write_lock_level = -1;
2626
2627         if (cow && (p->keep_locks || p->lowest_level))
2628                 write_lock_level = BTRFS_MAX_LEVEL;
2629
2630         min_write_lock_level = write_lock_level;
2631
2632 again:
2633         prev_cmp = -1;
2634         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2635         if (IS_ERR(b)) {
2636                 ret = PTR_ERR(b);
2637                 goto done;
2638         }
2639
2640         while (b) {
2641                 int dec = 0;
2642
2643                 level = btrfs_header_level(b);
2644
2645                 if (cow) {
2646                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2647
2648                         /*
2649                          * if we don't really need to cow this block
2650                          * then we don't want to set the path blocking,
2651                          * so we test it here
2652                          */
2653                         if (!should_cow_block(trans, root, b)) {
2654                                 trans->dirty = true;
2655                                 goto cow_done;
2656                         }
2657
2658                         /*
2659                          * must have write locks on this node and the
2660                          * parent
2661                          */
2662                         if (level > write_lock_level ||
2663                             (level + 1 > write_lock_level &&
2664                             level + 1 < BTRFS_MAX_LEVEL &&
2665                             p->nodes[level + 1])) {
2666                                 write_lock_level = level + 1;
2667                                 btrfs_release_path(p);
2668                                 goto again;
2669                         }
2670
2671                         if (last_level)
2672                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2673                                                       &b,
2674                                                       BTRFS_NESTING_COW);
2675                         else
2676                                 err = btrfs_cow_block(trans, root, b,
2677                                                       p->nodes[level + 1],
2678                                                       p->slots[level + 1], &b,
2679                                                       BTRFS_NESTING_COW);
2680                         if (err) {
2681                                 ret = err;
2682                                 goto done;
2683                         }
2684                 }
2685 cow_done:
2686                 p->nodes[level] = b;
2687                 /*
2688                  * Leave path with blocking locks to avoid massive
2689                  * lock context switch, this is made on purpose.
2690                  */
2691
2692                 /*
2693                  * we have a lock on b and as long as we aren't changing
2694                  * the tree, there is no way to for the items in b to change.
2695                  * It is safe to drop the lock on our parent before we
2696                  * go through the expensive btree search on b.
2697                  *
2698                  * If we're inserting or deleting (ins_len != 0), then we might
2699                  * be changing slot zero, which may require changing the parent.
2700                  * So, we can't drop the lock until after we know which slot
2701                  * we're operating on.
2702                  */
2703                 if (!ins_len && !p->keep_locks) {
2704                         int u = level + 1;
2705
2706                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2707                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2708                                 p->locks[u] = 0;
2709                         }
2710                 }
2711
2712                 /*
2713                  * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2714                  * we can safely assume the target key will always be in slot 0
2715                  * on lower levels due to the invariants BTRFS' btree provides,
2716                  * namely that a btrfs_key_ptr entry always points to the
2717                  * lowest key in the child node, thus we can skip searching
2718                  * lower levels
2719                  */
2720                 if (prev_cmp == 0) {
2721                         slot = 0;
2722                         ret = 0;
2723                 } else {
2724                         ret = btrfs_bin_search(b, key, &slot);
2725                         prev_cmp = ret;
2726                         if (ret < 0)
2727                                 goto done;
2728                 }
2729
2730                 if (level == 0) {
2731                         p->slots[level] = slot;
2732                         /*
2733                          * Item key already exists. In this case, if we are
2734                          * allowed to insert the item (for example, in dir_item
2735                          * case, item key collision is allowed), it will be
2736                          * merged with the original item. Only the item size
2737                          * grows, no new btrfs item will be added. If
2738                          * search_for_extension is not set, ins_len already
2739                          * accounts the size btrfs_item, deduct it here so leaf
2740                          * space check will be correct.
2741                          */
2742                         if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
2743                                 ASSERT(ins_len >= sizeof(struct btrfs_item));
2744                                 ins_len -= sizeof(struct btrfs_item);
2745                         }
2746                         if (ins_len > 0 &&
2747                             btrfs_leaf_free_space(b) < ins_len) {
2748                                 if (write_lock_level < 1) {
2749                                         write_lock_level = 1;
2750                                         btrfs_release_path(p);
2751                                         goto again;
2752                                 }
2753
2754                                 err = split_leaf(trans, root, key,
2755                                                  p, ins_len, ret == 0);
2756
2757                                 BUG_ON(err > 0);
2758                                 if (err) {
2759                                         ret = err;
2760                                         goto done;
2761                                 }
2762                         }
2763                         if (!p->search_for_split)
2764                                 unlock_up(p, level, lowest_unlock,
2765                                           min_write_lock_level, NULL);
2766                         goto done;
2767                 }
2768                 if (ret && slot > 0) {
2769                         dec = 1;
2770                         slot--;
2771                 }
2772                 p->slots[level] = slot;
2773                 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2774                                              &write_lock_level);
2775                 if (err == -EAGAIN)
2776                         goto again;
2777                 if (err) {
2778                         ret = err;
2779                         goto done;
2780                 }
2781                 b = p->nodes[level];
2782                 slot = p->slots[level];
2783
2784                 /*
2785                  * Slot 0 is special, if we change the key we have to update
2786                  * the parent pointer which means we must have a write lock on
2787                  * the parent
2788                  */
2789                 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2790                         write_lock_level = level + 1;
2791                         btrfs_release_path(p);
2792                         goto again;
2793                 }
2794
2795                 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2796                           &write_lock_level);
2797
2798                 if (level == lowest_level) {
2799                         if (dec)
2800                                 p->slots[level]++;
2801                         goto done;
2802                 }
2803
2804                 err = read_block_for_search(root, p, &b, level, slot, key);
2805                 if (err == -EAGAIN)
2806                         goto again;
2807                 if (err) {
2808                         ret = err;
2809                         goto done;
2810                 }
2811
2812                 if (!p->skip_locking) {
2813                         level = btrfs_header_level(b);
2814                         if (level <= write_lock_level) {
2815                                 btrfs_tree_lock(b);
2816                                 p->locks[level] = BTRFS_WRITE_LOCK;
2817                         } else {
2818                                 btrfs_tree_read_lock(b);
2819                                 p->locks[level] = BTRFS_READ_LOCK;
2820                         }
2821                         p->nodes[level] = b;
2822                 }
2823         }
2824         ret = 1;
2825 done:
2826         if (ret < 0 && !p->skip_release_on_error)
2827                 btrfs_release_path(p);
2828         return ret;
2829 }
2830 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2831
2832 /*
2833  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2834  * current state of the tree together with the operations recorded in the tree
2835  * modification log to search for the key in a previous version of this tree, as
2836  * denoted by the time_seq parameter.
2837  *
2838  * Naturally, there is no support for insert, delete or cow operations.
2839  *
2840  * The resulting path and return value will be set up as if we called
2841  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2842  */
2843 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2844                           struct btrfs_path *p, u64 time_seq)
2845 {
2846         struct btrfs_fs_info *fs_info = root->fs_info;
2847         struct extent_buffer *b;
2848         int slot;
2849         int ret;
2850         int err;
2851         int level;
2852         int lowest_unlock = 1;
2853         u8 lowest_level = 0;
2854
2855         lowest_level = p->lowest_level;
2856         WARN_ON(p->nodes[0] != NULL);
2857
2858         if (p->search_commit_root) {
2859                 BUG_ON(time_seq);
2860                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2861         }
2862
2863 again:
2864         b = get_old_root(root, time_seq);
2865         if (!b) {
2866                 ret = -EIO;
2867                 goto done;
2868         }
2869         level = btrfs_header_level(b);
2870         p->locks[level] = BTRFS_READ_LOCK;
2871
2872         while (b) {
2873                 int dec = 0;
2874
2875                 level = btrfs_header_level(b);
2876                 p->nodes[level] = b;
2877
2878                 /*
2879                  * we have a lock on b and as long as we aren't changing
2880                  * the tree, there is no way to for the items in b to change.
2881                  * It is safe to drop the lock on our parent before we
2882                  * go through the expensive btree search on b.
2883                  */
2884                 btrfs_unlock_up_safe(p, level + 1);
2885
2886                 ret = btrfs_bin_search(b, key, &slot);
2887                 if (ret < 0)
2888                         goto done;
2889
2890                 if (level == 0) {
2891                         p->slots[level] = slot;
2892                         unlock_up(p, level, lowest_unlock, 0, NULL);
2893                         goto done;
2894                 }
2895
2896                 if (ret && slot > 0) {
2897                         dec = 1;
2898                         slot--;
2899                 }
2900                 p->slots[level] = slot;
2901                 unlock_up(p, level, lowest_unlock, 0, NULL);
2902
2903                 if (level == lowest_level) {
2904                         if (dec)
2905                                 p->slots[level]++;
2906                         goto done;
2907                 }
2908
2909                 err = read_block_for_search(root, p, &b, level, slot, key);
2910                 if (err == -EAGAIN)
2911                         goto again;
2912                 if (err) {
2913                         ret = err;
2914                         goto done;
2915                 }
2916
2917                 level = btrfs_header_level(b);
2918                 btrfs_tree_read_lock(b);
2919                 b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2920                 if (!b) {
2921                         ret = -ENOMEM;
2922                         goto done;
2923                 }
2924                 p->locks[level] = BTRFS_READ_LOCK;
2925                 p->nodes[level] = b;
2926         }
2927         ret = 1;
2928 done:
2929         if (ret < 0)
2930                 btrfs_release_path(p);
2931
2932         return ret;
2933 }
2934
2935 /*
2936  * helper to use instead of search slot if no exact match is needed but
2937  * instead the next or previous item should be returned.
2938  * When find_higher is true, the next higher item is returned, the next lower
2939  * otherwise.
2940  * When return_any and find_higher are both true, and no higher item is found,
2941  * return the next lower instead.
2942  * When return_any is true and find_higher is false, and no lower item is found,
2943  * return the next higher instead.
2944  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2945  * < 0 on error
2946  */
2947 int btrfs_search_slot_for_read(struct btrfs_root *root,
2948                                const struct btrfs_key *key,
2949                                struct btrfs_path *p, int find_higher,
2950                                int return_any)
2951 {
2952         int ret;
2953         struct extent_buffer *leaf;
2954
2955 again:
2956         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2957         if (ret <= 0)
2958                 return ret;
2959         /*
2960          * a return value of 1 means the path is at the position where the
2961          * item should be inserted. Normally this is the next bigger item,
2962          * but in case the previous item is the last in a leaf, path points
2963          * to the first free slot in the previous leaf, i.e. at an invalid
2964          * item.
2965          */
2966         leaf = p->nodes[0];
2967
2968         if (find_higher) {
2969                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2970                         ret = btrfs_next_leaf(root, p);
2971                         if (ret <= 0)
2972                                 return ret;
2973                         if (!return_any)
2974                                 return 1;
2975                         /*
2976                          * no higher item found, return the next
2977                          * lower instead
2978                          */
2979                         return_any = 0;
2980                         find_higher = 0;
2981                         btrfs_release_path(p);
2982                         goto again;
2983                 }
2984         } else {
2985                 if (p->slots[0] == 0) {
2986                         ret = btrfs_prev_leaf(root, p);
2987                         if (ret < 0)
2988                                 return ret;
2989                         if (!ret) {
2990                                 leaf = p->nodes[0];
2991                                 if (p->slots[0] == btrfs_header_nritems(leaf))
2992                                         p->slots[0]--;
2993                                 return 0;
2994                         }
2995                         if (!return_any)
2996                                 return 1;
2997                         /*
2998                          * no lower item found, return the next
2999                          * higher instead
3000                          */
3001                         return_any = 0;
3002                         find_higher = 1;
3003                         btrfs_release_path(p);
3004                         goto again;
3005                 } else {
3006                         --p->slots[0];
3007                 }
3008         }
3009         return 0;
3010 }
3011
3012 /*
3013  * adjust the pointers going up the tree, starting at level
3014  * making sure the right key of each node is points to 'key'.
3015  * This is used after shifting pointers to the left, so it stops
3016  * fixing up pointers when a given leaf/node is not in slot 0 of the
3017  * higher levels
3018  *
3019  */
3020 static void fixup_low_keys(struct btrfs_path *path,
3021                            struct btrfs_disk_key *key, int level)
3022 {
3023         int i;
3024         struct extent_buffer *t;
3025         int ret;
3026
3027         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3028                 int tslot = path->slots[i];
3029
3030                 if (!path->nodes[i])
3031                         break;
3032                 t = path->nodes[i];
3033                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3034                                 GFP_ATOMIC);
3035                 BUG_ON(ret < 0);
3036                 btrfs_set_node_key(t, key, tslot);
3037                 btrfs_mark_buffer_dirty(path->nodes[i]);
3038                 if (tslot != 0)
3039                         break;
3040         }
3041 }
3042
3043 /*
3044  * update item key.
3045  *
3046  * This function isn't completely safe. It's the caller's responsibility
3047  * that the new key won't break the order
3048  */
3049 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3050                              struct btrfs_path *path,
3051                              const struct btrfs_key *new_key)
3052 {
3053         struct btrfs_disk_key disk_key;
3054         struct extent_buffer *eb;
3055         int slot;
3056
3057         eb = path->nodes[0];
3058         slot = path->slots[0];
3059         if (slot > 0) {
3060                 btrfs_item_key(eb, &disk_key, slot - 1);
3061                 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3062                         btrfs_crit(fs_info,
3063                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3064                                    slot, btrfs_disk_key_objectid(&disk_key),
3065                                    btrfs_disk_key_type(&disk_key),
3066                                    btrfs_disk_key_offset(&disk_key),
3067                                    new_key->objectid, new_key->type,
3068                                    new_key->offset);
3069                         btrfs_print_leaf(eb);
3070                         BUG();
3071                 }
3072         }
3073         if (slot < btrfs_header_nritems(eb) - 1) {
3074                 btrfs_item_key(eb, &disk_key, slot + 1);
3075                 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3076                         btrfs_crit(fs_info,
3077                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3078                                    slot, btrfs_disk_key_objectid(&disk_key),
3079                                    btrfs_disk_key_type(&disk_key),
3080                                    btrfs_disk_key_offset(&disk_key),
3081                                    new_key->objectid, new_key->type,
3082                                    new_key->offset);
3083                         btrfs_print_leaf(eb);
3084                         BUG();
3085                 }
3086         }
3087
3088         btrfs_cpu_key_to_disk(&disk_key, new_key);
3089         btrfs_set_item_key(eb, &disk_key, slot);
3090         btrfs_mark_buffer_dirty(eb);
3091         if (slot == 0)
3092                 fixup_low_keys(path, &disk_key, 1);
3093 }
3094
3095 /*
3096  * Check key order of two sibling extent buffers.
3097  *
3098  * Return true if something is wrong.
3099  * Return false if everything is fine.
3100  *
3101  * Tree-checker only works inside one tree block, thus the following
3102  * corruption can not be detected by tree-checker:
3103  *
3104  * Leaf @left                   | Leaf @right
3105  * --------------------------------------------------------------
3106  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
3107  *
3108  * Key f6 in leaf @left itself is valid, but not valid when the next
3109  * key in leaf @right is 7.
3110  * This can only be checked at tree block merge time.
3111  * And since tree checker has ensured all key order in each tree block
3112  * is correct, we only need to bother the last key of @left and the first
3113  * key of @right.
3114  */
3115 static bool check_sibling_keys(struct extent_buffer *left,
3116                                struct extent_buffer *right)
3117 {
3118         struct btrfs_key left_last;
3119         struct btrfs_key right_first;
3120         int level = btrfs_header_level(left);
3121         int nr_left = btrfs_header_nritems(left);
3122         int nr_right = btrfs_header_nritems(right);
3123
3124         /* No key to check in one of the tree blocks */
3125         if (!nr_left || !nr_right)
3126                 return false;
3127
3128         if (level) {
3129                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
3130                 btrfs_node_key_to_cpu(right, &right_first, 0);
3131         } else {
3132                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
3133                 btrfs_item_key_to_cpu(right, &right_first, 0);
3134         }
3135
3136         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
3137                 btrfs_crit(left->fs_info,
3138 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
3139                            left_last.objectid, left_last.type,
3140                            left_last.offset, right_first.objectid,
3141                            right_first.type, right_first.offset);
3142                 return true;
3143         }
3144         return false;
3145 }
3146
3147 /*
3148  * try to push data from one node into the next node left in the
3149  * tree.
3150  *
3151  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3152  * error, and > 0 if there was no room in the left hand block.
3153  */
3154 static int push_node_left(struct btrfs_trans_handle *trans,
3155                           struct extent_buffer *dst,
3156                           struct extent_buffer *src, int empty)
3157 {
3158         struct btrfs_fs_info *fs_info = trans->fs_info;
3159         int push_items = 0;
3160         int src_nritems;
3161         int dst_nritems;
3162         int ret = 0;
3163
3164         src_nritems = btrfs_header_nritems(src);
3165         dst_nritems = btrfs_header_nritems(dst);
3166         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3167         WARN_ON(btrfs_header_generation(src) != trans->transid);
3168         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3169
3170         if (!empty && src_nritems <= 8)
3171                 return 1;
3172
3173         if (push_items <= 0)
3174                 return 1;
3175
3176         if (empty) {
3177                 push_items = min(src_nritems, push_items);
3178                 if (push_items < src_nritems) {
3179                         /* leave at least 8 pointers in the node if
3180                          * we aren't going to empty it
3181                          */
3182                         if (src_nritems - push_items < 8) {
3183                                 if (push_items <= 8)
3184                                         return 1;
3185                                 push_items -= 8;
3186                         }
3187                 }
3188         } else
3189                 push_items = min(src_nritems - 8, push_items);
3190
3191         /* dst is the left eb, src is the middle eb */
3192         if (check_sibling_keys(dst, src)) {
3193                 ret = -EUCLEAN;
3194                 btrfs_abort_transaction(trans, ret);
3195                 return ret;
3196         }
3197         ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3198         if (ret) {
3199                 btrfs_abort_transaction(trans, ret);
3200                 return ret;
3201         }
3202         copy_extent_buffer(dst, src,
3203                            btrfs_node_key_ptr_offset(dst_nritems),
3204                            btrfs_node_key_ptr_offset(0),
3205                            push_items * sizeof(struct btrfs_key_ptr));
3206
3207         if (push_items < src_nritems) {
3208                 /*
3209                  * Don't call tree_mod_log_insert_move here, key removal was
3210                  * already fully logged by tree_mod_log_eb_copy above.
3211                  */
3212                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3213                                       btrfs_node_key_ptr_offset(push_items),
3214                                       (src_nritems - push_items) *
3215                                       sizeof(struct btrfs_key_ptr));
3216         }
3217         btrfs_set_header_nritems(src, src_nritems - push_items);
3218         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3219         btrfs_mark_buffer_dirty(src);
3220         btrfs_mark_buffer_dirty(dst);
3221
3222         return ret;
3223 }
3224
3225 /*
3226  * try to push data from one node into the next node right in the
3227  * tree.
3228  *
3229  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3230  * error, and > 0 if there was no room in the right hand block.
3231  *
3232  * this will  only push up to 1/2 the contents of the left node over
3233  */
3234 static int balance_node_right(struct btrfs_trans_handle *trans,
3235                               struct extent_buffer *dst,
3236                               struct extent_buffer *src)
3237 {
3238         struct btrfs_fs_info *fs_info = trans->fs_info;
3239         int push_items = 0;
3240         int max_push;
3241         int src_nritems;
3242         int dst_nritems;
3243         int ret = 0;
3244
3245         WARN_ON(btrfs_header_generation(src) != trans->transid);
3246         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3247
3248         src_nritems = btrfs_header_nritems(src);
3249         dst_nritems = btrfs_header_nritems(dst);
3250         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3251         if (push_items <= 0)
3252                 return 1;
3253
3254         if (src_nritems < 4)
3255                 return 1;
3256
3257         max_push = src_nritems / 2 + 1;
3258         /* don't try to empty the node */
3259         if (max_push >= src_nritems)
3260                 return 1;
3261
3262         if (max_push < push_items)
3263                 push_items = max_push;
3264
3265         /* dst is the right eb, src is the middle eb */
3266         if (check_sibling_keys(src, dst)) {
3267                 ret = -EUCLEAN;
3268                 btrfs_abort_transaction(trans, ret);
3269                 return ret;
3270         }
3271         ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3272         BUG_ON(ret < 0);
3273         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3274                                       btrfs_node_key_ptr_offset(0),
3275                                       (dst_nritems) *
3276                                       sizeof(struct btrfs_key_ptr));
3277
3278         ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3279                                    push_items);
3280         if (ret) {
3281                 btrfs_abort_transaction(trans, ret);
3282                 return ret;
3283         }
3284         copy_extent_buffer(dst, src,
3285                            btrfs_node_key_ptr_offset(0),
3286                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3287                            push_items * sizeof(struct btrfs_key_ptr));
3288
3289         btrfs_set_header_nritems(src, src_nritems - push_items);
3290         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3291
3292         btrfs_mark_buffer_dirty(src);
3293         btrfs_mark_buffer_dirty(dst);
3294
3295         return ret;
3296 }
3297
3298 /*
3299  * helper function to insert a new root level in the tree.
3300  * A new node is allocated, and a single item is inserted to
3301  * point to the existing root
3302  *
3303  * returns zero on success or < 0 on failure.
3304  */
3305 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3306                            struct btrfs_root *root,
3307                            struct btrfs_path *path, int level)
3308 {
3309         struct btrfs_fs_info *fs_info = root->fs_info;
3310         u64 lower_gen;
3311         struct extent_buffer *lower;
3312         struct extent_buffer *c;
3313         struct extent_buffer *old;
3314         struct btrfs_disk_key lower_key;
3315         int ret;
3316
3317         BUG_ON(path->nodes[level]);
3318         BUG_ON(path->nodes[level-1] != root->node);
3319
3320         lower = path->nodes[level-1];
3321         if (level == 1)
3322                 btrfs_item_key(lower, &lower_key, 0);
3323         else
3324                 btrfs_node_key(lower, &lower_key, 0);
3325
3326         c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3327                                          root->node->start, 0,
3328                                          BTRFS_NESTING_NEW_ROOT);
3329         if (IS_ERR(c))
3330                 return PTR_ERR(c);
3331
3332         root_add_used(root, fs_info->nodesize);
3333
3334         btrfs_set_header_nritems(c, 1);
3335         btrfs_set_node_key(c, &lower_key, 0);
3336         btrfs_set_node_blockptr(c, 0, lower->start);
3337         lower_gen = btrfs_header_generation(lower);
3338         WARN_ON(lower_gen != trans->transid);
3339
3340         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3341
3342         btrfs_mark_buffer_dirty(c);
3343
3344         old = root->node;
3345         ret = tree_mod_log_insert_root(root->node, c, 0);
3346         BUG_ON(ret < 0);
3347         rcu_assign_pointer(root->node, c);
3348
3349         /* the super has an extra ref to root->node */
3350         free_extent_buffer(old);
3351
3352         add_root_to_dirty_list(root);
3353         atomic_inc(&c->refs);
3354         path->nodes[level] = c;
3355         path->locks[level] = BTRFS_WRITE_LOCK;
3356         path->slots[level] = 0;
3357         return 0;
3358 }
3359
3360 /*
3361  * worker function to insert a single pointer in a node.
3362  * the node should have enough room for the pointer already
3363  *
3364  * slot and level indicate where you want the key to go, and
3365  * blocknr is the block the key points to.
3366  */
3367 static void insert_ptr(struct btrfs_trans_handle *trans,
3368                        struct btrfs_path *path,
3369                        struct btrfs_disk_key *key, u64 bytenr,
3370                        int slot, int level)
3371 {
3372         struct extent_buffer *lower;
3373         int nritems;
3374         int ret;
3375
3376         BUG_ON(!path->nodes[level]);
3377         btrfs_assert_tree_locked(path->nodes[level]);
3378         lower = path->nodes[level];
3379         nritems = btrfs_header_nritems(lower);
3380         BUG_ON(slot > nritems);
3381         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3382         if (slot != nritems) {
3383                 if (level) {
3384                         ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3385                                         nritems - slot);
3386                         BUG_ON(ret < 0);
3387                 }
3388                 memmove_extent_buffer(lower,
3389                               btrfs_node_key_ptr_offset(slot + 1),
3390                               btrfs_node_key_ptr_offset(slot),
3391                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3392         }
3393         if (level) {
3394                 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3395                                 GFP_NOFS);
3396                 BUG_ON(ret < 0);
3397         }
3398         btrfs_set_node_key(lower, key, slot);
3399         btrfs_set_node_blockptr(lower, slot, bytenr);
3400         WARN_ON(trans->transid == 0);
3401         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3402         btrfs_set_header_nritems(lower, nritems + 1);
3403         btrfs_mark_buffer_dirty(lower);
3404 }
3405
3406 /*
3407  * split the node at the specified level in path in two.
3408  * The path is corrected to point to the appropriate node after the split
3409  *
3410  * Before splitting this tries to make some room in the node by pushing
3411  * left and right, if either one works, it returns right away.
3412  *
3413  * returns 0 on success and < 0 on failure
3414  */
3415 static noinline int split_node(struct btrfs_trans_handle *trans,
3416                                struct btrfs_root *root,
3417                                struct btrfs_path *path, int level)
3418 {
3419         struct btrfs_fs_info *fs_info = root->fs_info;
3420         struct extent_buffer *c;
3421         struct extent_buffer *split;
3422         struct btrfs_disk_key disk_key;
3423         int mid;
3424         int ret;
3425         u32 c_nritems;
3426
3427         c = path->nodes[level];
3428         WARN_ON(btrfs_header_generation(c) != trans->transid);
3429         if (c == root->node) {
3430                 /*
3431                  * trying to split the root, lets make a new one
3432                  *
3433                  * tree mod log: We don't log_removal old root in
3434                  * insert_new_root, because that root buffer will be kept as a
3435                  * normal node. We are going to log removal of half of the
3436                  * elements below with tree_mod_log_eb_copy. We're holding a
3437                  * tree lock on the buffer, which is why we cannot race with
3438                  * other tree_mod_log users.
3439                  */
3440                 ret = insert_new_root(trans, root, path, level + 1);
3441                 if (ret)
3442                         return ret;
3443         } else {
3444                 ret = push_nodes_for_insert(trans, root, path, level);
3445                 c = path->nodes[level];
3446                 if (!ret && btrfs_header_nritems(c) <
3447                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3448                         return 0;
3449                 if (ret < 0)
3450                         return ret;
3451         }
3452
3453         c_nritems = btrfs_header_nritems(c);
3454         mid = (c_nritems + 1) / 2;
3455         btrfs_node_key(c, &disk_key, mid);
3456
3457         split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3458                                              c->start, 0, BTRFS_NESTING_SPLIT);
3459         if (IS_ERR(split))
3460                 return PTR_ERR(split);
3461
3462         root_add_used(root, fs_info->nodesize);
3463         ASSERT(btrfs_header_level(c) == level);
3464
3465         ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3466         if (ret) {
3467                 btrfs_abort_transaction(trans, ret);
3468                 return ret;
3469         }
3470         copy_extent_buffer(split, c,
3471                            btrfs_node_key_ptr_offset(0),
3472                            btrfs_node_key_ptr_offset(mid),
3473                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3474         btrfs_set_header_nritems(split, c_nritems - mid);
3475         btrfs_set_header_nritems(c, mid);
3476
3477         btrfs_mark_buffer_dirty(c);
3478         btrfs_mark_buffer_dirty(split);
3479
3480         insert_ptr(trans, path, &disk_key, split->start,
3481                    path->slots[level + 1] + 1, level + 1);
3482
3483         if (path->slots[level] >= mid) {
3484                 path->slots[level] -= mid;
3485                 btrfs_tree_unlock(c);
3486                 free_extent_buffer(c);
3487                 path->nodes[level] = split;
3488                 path->slots[level + 1] += 1;
3489         } else {
3490                 btrfs_tree_unlock(split);
3491                 free_extent_buffer(split);
3492         }
3493         return 0;
3494 }
3495
3496 /*
3497  * how many bytes are required to store the items in a leaf.  start
3498  * and nr indicate which items in the leaf to check.  This totals up the
3499  * space used both by the item structs and the item data
3500  */
3501 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3502 {
3503         struct btrfs_item *start_item;
3504         struct btrfs_item *end_item;
3505         int data_len;
3506         int nritems = btrfs_header_nritems(l);
3507         int end = min(nritems, start + nr) - 1;
3508
3509         if (!nr)
3510                 return 0;
3511         start_item = btrfs_item_nr(start);
3512         end_item = btrfs_item_nr(end);
3513         data_len = btrfs_item_offset(l, start_item) +
3514                    btrfs_item_size(l, start_item);
3515         data_len = data_len - btrfs_item_offset(l, end_item);
3516         data_len += sizeof(struct btrfs_item) * nr;
3517         WARN_ON(data_len < 0);
3518         return data_len;
3519 }
3520
3521 /*
3522  * The space between the end of the leaf items and
3523  * the start of the leaf data.  IOW, how much room
3524  * the leaf has left for both items and data
3525  */
3526 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3527 {
3528         struct btrfs_fs_info *fs_info = leaf->fs_info;
3529         int nritems = btrfs_header_nritems(leaf);
3530         int ret;
3531
3532         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3533         if (ret < 0) {
3534                 btrfs_crit(fs_info,
3535                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3536                            ret,
3537                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3538                            leaf_space_used(leaf, 0, nritems), nritems);
3539         }
3540         return ret;
3541 }
3542
3543 /*
3544  * min slot controls the lowest index we're willing to push to the
3545  * right.  We'll push up to and including min_slot, but no lower
3546  */
3547 static noinline int __push_leaf_right(struct btrfs_path *path,
3548                                       int data_size, int empty,
3549                                       struct extent_buffer *right,
3550                                       int free_space, u32 left_nritems,
3551                                       u32 min_slot)
3552 {
3553         struct btrfs_fs_info *fs_info = right->fs_info;
3554         struct extent_buffer *left = path->nodes[0];
3555         struct extent_buffer *upper = path->nodes[1];
3556         struct btrfs_map_token token;
3557         struct btrfs_disk_key disk_key;
3558         int slot;
3559         u32 i;
3560         int push_space = 0;
3561         int push_items = 0;
3562         struct btrfs_item *item;
3563         u32 nr;
3564         u32 right_nritems;
3565         u32 data_end;
3566         u32 this_item_size;
3567
3568         if (empty)
3569                 nr = 0;
3570         else
3571                 nr = max_t(u32, 1, min_slot);
3572
3573         if (path->slots[0] >= left_nritems)
3574                 push_space += data_size;
3575
3576         slot = path->slots[1];
3577         i = left_nritems - 1;
3578         while (i >= nr) {
3579                 item = btrfs_item_nr(i);
3580
3581                 if (!empty && push_items > 0) {
3582                         if (path->slots[0] > i)
3583                                 break;
3584                         if (path->slots[0] == i) {
3585                                 int space = btrfs_leaf_free_space(left);
3586
3587                                 if (space + push_space * 2 > free_space)
3588                                         break;
3589                         }
3590                 }
3591
3592                 if (path->slots[0] == i)
3593                         push_space += data_size;
3594
3595                 this_item_size = btrfs_item_size(left, item);
3596                 if (this_item_size + sizeof(*item) + push_space > free_space)
3597                         break;
3598
3599                 push_items++;
3600                 push_space += this_item_size + sizeof(*item);
3601                 if (i == 0)
3602                         break;
3603                 i--;
3604         }
3605
3606         if (push_items == 0)
3607                 goto out_unlock;
3608
3609         WARN_ON(!empty && push_items == left_nritems);
3610
3611         /* push left to right */
3612         right_nritems = btrfs_header_nritems(right);
3613
3614         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3615         push_space -= leaf_data_end(left);
3616
3617         /* make room in the right data area */
3618         data_end = leaf_data_end(right);
3619         memmove_extent_buffer(right,
3620                               BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3621                               BTRFS_LEAF_DATA_OFFSET + data_end,
3622                               BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3623
3624         /* copy from the left data area */
3625         copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3626                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3627                      BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3628                      push_space);
3629
3630         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3631                               btrfs_item_nr_offset(0),
3632                               right_nritems * sizeof(struct btrfs_item));
3633
3634         /* copy the items from left to right */
3635         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3636                    btrfs_item_nr_offset(left_nritems - push_items),
3637                    push_items * sizeof(struct btrfs_item));
3638
3639         /* update the item pointers */
3640         btrfs_init_map_token(&token, right);
3641         right_nritems += push_items;
3642         btrfs_set_header_nritems(right, right_nritems);
3643         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3644         for (i = 0; i < right_nritems; i++) {
3645                 item = btrfs_item_nr(i);
3646                 push_space -= btrfs_token_item_size(&token, item);
3647                 btrfs_set_token_item_offset(&token, item, push_space);
3648         }
3649
3650         left_nritems -= push_items;
3651         btrfs_set_header_nritems(left, left_nritems);
3652
3653         if (left_nritems)
3654                 btrfs_mark_buffer_dirty(left);
3655         else
3656                 btrfs_clean_tree_block(left);
3657
3658         btrfs_mark_buffer_dirty(right);
3659
3660         btrfs_item_key(right, &disk_key, 0);
3661         btrfs_set_node_key(upper, &disk_key, slot + 1);
3662         btrfs_mark_buffer_dirty(upper);
3663
3664         /* then fixup the leaf pointer in the path */
3665         if (path->slots[0] >= left_nritems) {
3666                 path->slots[0] -= left_nritems;
3667                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3668                         btrfs_clean_tree_block(path->nodes[0]);
3669                 btrfs_tree_unlock(path->nodes[0]);
3670                 free_extent_buffer(path->nodes[0]);
3671                 path->nodes[0] = right;
3672                 path->slots[1] += 1;
3673         } else {
3674                 btrfs_tree_unlock(right);
3675                 free_extent_buffer(right);
3676         }
3677         return 0;
3678
3679 out_unlock:
3680         btrfs_tree_unlock(right);
3681         free_extent_buffer(right);
3682         return 1;
3683 }
3684
3685 /*
3686  * push some data in the path leaf to the right, trying to free up at
3687  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3688  *
3689  * returns 1 if the push failed because the other node didn't have enough
3690  * room, 0 if everything worked out and < 0 if there were major errors.
3691  *
3692  * this will push starting from min_slot to the end of the leaf.  It won't
3693  * push any slot lower than min_slot
3694  */
3695 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3696                            *root, struct btrfs_path *path,
3697                            int min_data_size, int data_size,
3698                            int empty, u32 min_slot)
3699 {
3700         struct extent_buffer *left = path->nodes[0];
3701         struct extent_buffer *right;
3702         struct extent_buffer *upper;
3703         int slot;
3704         int free_space;
3705         u32 left_nritems;
3706         int ret;
3707
3708         if (!path->nodes[1])
3709                 return 1;
3710
3711         slot = path->slots[1];
3712         upper = path->nodes[1];
3713         if (slot >= btrfs_header_nritems(upper) - 1)
3714                 return 1;
3715
3716         btrfs_assert_tree_locked(path->nodes[1]);
3717
3718         right = btrfs_read_node_slot(upper, slot + 1);
3719         /*
3720          * slot + 1 is not valid or we fail to read the right node,
3721          * no big deal, just return.
3722          */
3723         if (IS_ERR(right))
3724                 return 1;
3725
3726         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3727
3728         free_space = btrfs_leaf_free_space(right);
3729         if (free_space < data_size)
3730                 goto out_unlock;
3731
3732         /* cow and double check */
3733         ret = btrfs_cow_block(trans, root, right, upper,
3734                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3735         if (ret)
3736                 goto out_unlock;
3737
3738         free_space = btrfs_leaf_free_space(right);
3739         if (free_space < data_size)
3740                 goto out_unlock;
3741
3742         left_nritems = btrfs_header_nritems(left);
3743         if (left_nritems == 0)
3744                 goto out_unlock;
3745
3746         if (check_sibling_keys(left, right)) {
3747                 ret = -EUCLEAN;
3748                 btrfs_tree_unlock(right);
3749                 free_extent_buffer(right);
3750                 return ret;
3751         }
3752         if (path->slots[0] == left_nritems && !empty) {
3753                 /* Key greater than all keys in the leaf, right neighbor has
3754                  * enough room for it and we're not emptying our leaf to delete
3755                  * it, therefore use right neighbor to insert the new item and
3756                  * no need to touch/dirty our left leaf. */
3757                 btrfs_tree_unlock(left);
3758                 free_extent_buffer(left);
3759                 path->nodes[0] = right;
3760                 path->slots[0] = 0;
3761                 path->slots[1]++;
3762                 return 0;
3763         }
3764
3765         return __push_leaf_right(path, min_data_size, empty,
3766                                 right, free_space, left_nritems, min_slot);
3767 out_unlock:
3768         btrfs_tree_unlock(right);
3769         free_extent_buffer(right);
3770         return 1;
3771 }
3772
3773 /*
3774  * push some data in the path leaf to the left, trying to free up at
3775  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3776  *
3777  * max_slot can put a limit on how far into the leaf we'll push items.  The
3778  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3779  * items
3780  */
3781 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3782                                      int empty, struct extent_buffer *left,
3783                                      int free_space, u32 right_nritems,
3784                                      u32 max_slot)
3785 {
3786         struct btrfs_fs_info *fs_info = left->fs_info;
3787         struct btrfs_disk_key disk_key;
3788         struct extent_buffer *right = path->nodes[0];
3789         int i;
3790         int push_space = 0;
3791         int push_items = 0;
3792         struct btrfs_item *item;
3793         u32 old_left_nritems;
3794         u32 nr;
3795         int ret = 0;
3796         u32 this_item_size;
3797         u32 old_left_item_size;
3798         struct btrfs_map_token token;
3799
3800         if (empty)
3801                 nr = min(right_nritems, max_slot);
3802         else
3803                 nr = min(right_nritems - 1, max_slot);
3804
3805         for (i = 0; i < nr; i++) {
3806                 item = btrfs_item_nr(i);
3807
3808                 if (!empty && push_items > 0) {
3809                         if (path->slots[0] < i)
3810                                 break;
3811                         if (path->slots[0] == i) {
3812                                 int space = btrfs_leaf_free_space(right);
3813
3814                                 if (space + push_space * 2 > free_space)
3815                                         break;
3816                         }
3817                 }
3818
3819                 if (path->slots[0] == i)
3820                         push_space += data_size;
3821
3822                 this_item_size = btrfs_item_size(right, item);
3823                 if (this_item_size + sizeof(*item) + push_space > free_space)
3824                         break;
3825
3826                 push_items++;
3827                 push_space += this_item_size + sizeof(*item);
3828         }
3829
3830         if (push_items == 0) {
3831                 ret = 1;
3832                 goto out;
3833         }
3834         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3835
3836         /* push data from right to left */
3837         copy_extent_buffer(left, right,
3838                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3839                            btrfs_item_nr_offset(0),
3840                            push_items * sizeof(struct btrfs_item));
3841
3842         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3843                      btrfs_item_offset_nr(right, push_items - 1);
3844
3845         copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3846                      leaf_data_end(left) - push_space,
3847                      BTRFS_LEAF_DATA_OFFSET +
3848                      btrfs_item_offset_nr(right, push_items - 1),
3849                      push_space);
3850         old_left_nritems = btrfs_header_nritems(left);
3851         BUG_ON(old_left_nritems <= 0);
3852
3853         btrfs_init_map_token(&token, left);
3854         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3855         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3856                 u32 ioff;
3857
3858                 item = btrfs_item_nr(i);
3859
3860                 ioff = btrfs_token_item_offset(&token, item);
3861                 btrfs_set_token_item_offset(&token, item,
3862                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3863         }
3864         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3865
3866         /* fixup right node */
3867         if (push_items > right_nritems)
3868                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3869                        right_nritems);
3870
3871         if (push_items < right_nritems) {
3872                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3873                                                   leaf_data_end(right);
3874                 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3875                                       BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3876                                       BTRFS_LEAF_DATA_OFFSET +
3877                                       leaf_data_end(right), push_space);
3878
3879                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3880                               btrfs_item_nr_offset(push_items),
3881                              (btrfs_header_nritems(right) - push_items) *
3882                              sizeof(struct btrfs_item));
3883         }
3884
3885         btrfs_init_map_token(&token, right);
3886         right_nritems -= push_items;
3887         btrfs_set_header_nritems(right, right_nritems);
3888         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3889         for (i = 0; i < right_nritems; i++) {
3890                 item = btrfs_item_nr(i);
3891
3892                 push_space = push_space - btrfs_token_item_size(&token, item);
3893                 btrfs_set_token_item_offset(&token, item, push_space);
3894         }
3895
3896         btrfs_mark_buffer_dirty(left);
3897         if (right_nritems)
3898                 btrfs_mark_buffer_dirty(right);
3899         else
3900                 btrfs_clean_tree_block(right);
3901
3902         btrfs_item_key(right, &disk_key, 0);
3903         fixup_low_keys(path, &disk_key, 1);
3904
3905         /* then fixup the leaf pointer in the path */
3906         if (path->slots[0] < push_items) {
3907                 path->slots[0] += old_left_nritems;
3908                 btrfs_tree_unlock(path->nodes[0]);
3909                 free_extent_buffer(path->nodes[0]);
3910                 path->nodes[0] = left;
3911                 path->slots[1] -= 1;
3912         } else {
3913                 btrfs_tree_unlock(left);
3914                 free_extent_buffer(left);
3915                 path->slots[0] -= push_items;
3916         }
3917         BUG_ON(path->slots[0] < 0);
3918         return ret;
3919 out:
3920         btrfs_tree_unlock(left);
3921         free_extent_buffer(left);
3922         return ret;
3923 }
3924
3925 /*
3926  * push some data in the path leaf to the left, trying to free up at
3927  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3928  *
3929  * max_slot can put a limit on how far into the leaf we'll push items.  The
3930  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3931  * items
3932  */
3933 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3934                           *root, struct btrfs_path *path, int min_data_size,
3935                           int data_size, int empty, u32 max_slot)
3936 {
3937         struct extent_buffer *right = path->nodes[0];
3938         struct extent_buffer *left;
3939         int slot;
3940         int free_space;
3941         u32 right_nritems;
3942         int ret = 0;
3943
3944         slot = path->slots[1];
3945         if (slot == 0)
3946                 return 1;
3947         if (!path->nodes[1])
3948                 return 1;
3949
3950         right_nritems = btrfs_header_nritems(right);
3951         if (right_nritems == 0)
3952                 return 1;
3953
3954         btrfs_assert_tree_locked(path->nodes[1]);
3955
3956         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3957         /*
3958          * slot - 1 is not valid or we fail to read the left node,
3959          * no big deal, just return.
3960          */
3961         if (IS_ERR(left))
3962                 return 1;
3963
3964         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3965
3966         free_space = btrfs_leaf_free_space(left);
3967         if (free_space < data_size) {
3968                 ret = 1;
3969                 goto out;
3970         }
3971
3972         /* cow and double check */
3973         ret = btrfs_cow_block(trans, root, left,
3974                               path->nodes[1], slot - 1, &left,
3975                               BTRFS_NESTING_LEFT_COW);
3976         if (ret) {
3977                 /* we hit -ENOSPC, but it isn't fatal here */
3978                 if (ret == -ENOSPC)
3979                         ret = 1;
3980                 goto out;
3981         }
3982
3983         free_space = btrfs_leaf_free_space(left);
3984         if (free_space < data_size) {
3985                 ret = 1;
3986                 goto out;
3987         }
3988
3989         if (check_sibling_keys(left, right)) {
3990                 ret = -EUCLEAN;
3991                 goto out;
3992         }
3993         return __push_leaf_left(path, min_data_size,
3994                                empty, left, free_space, right_nritems,
3995                                max_slot);
3996 out:
3997         btrfs_tree_unlock(left);
3998         free_extent_buffer(left);
3999         return ret;
4000 }
4001
4002 /*
4003  * split the path's leaf in two, making sure there is at least data_size
4004  * available for the resulting leaf level of the path.
4005  */
4006 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4007                                     struct btrfs_path *path,
4008                                     struct extent_buffer *l,
4009                                     struct extent_buffer *right,
4010                                     int slot, int mid, int nritems)
4011 {
4012         struct btrfs_fs_info *fs_info = trans->fs_info;
4013         int data_copy_size;
4014         int rt_data_off;
4015         int i;
4016         struct btrfs_disk_key disk_key;
4017         struct btrfs_map_token token;
4018
4019         nritems = nritems - mid;
4020         btrfs_set_header_nritems(right, nritems);
4021         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
4022
4023         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4024                            btrfs_item_nr_offset(mid),
4025                            nritems * sizeof(struct btrfs_item));
4026
4027         copy_extent_buffer(right, l,
4028                      BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4029                      data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4030                      leaf_data_end(l), data_copy_size);
4031
4032         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4033
4034         btrfs_init_map_token(&token, right);
4035         for (i = 0; i < nritems; i++) {
4036                 struct btrfs_item *item = btrfs_item_nr(i);
4037                 u32 ioff;
4038
4039                 ioff = btrfs_token_item_offset(&token, item);
4040                 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
4041         }
4042
4043         btrfs_set_header_nritems(l, mid);
4044         btrfs_item_key(right, &disk_key, 0);
4045         insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4046
4047         btrfs_mark_buffer_dirty(right);
4048         btrfs_mark_buffer_dirty(l);
4049         BUG_ON(path->slots[0] != slot);
4050
4051         if (mid <= slot) {
4052                 btrfs_tree_unlock(path->nodes[0]);
4053                 free_extent_buffer(path->nodes[0]);
4054                 path->nodes[0] = right;
4055                 path->slots[0] -= mid;
4056                 path->slots[1] += 1;
4057         } else {
4058                 btrfs_tree_unlock(right);
4059                 free_extent_buffer(right);
4060         }
4061
4062         BUG_ON(path->slots[0] < 0);
4063 }
4064
4065 /*
4066  * double splits happen when we need to insert a big item in the middle
4067  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4068  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4069  *          A                 B                 C
4070  *
4071  * We avoid this by trying to push the items on either side of our target
4072  * into the adjacent leaves.  If all goes well we can avoid the double split
4073  * completely.
4074  */
4075 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4076                                           struct btrfs_root *root,
4077                                           struct btrfs_path *path,
4078                                           int data_size)
4079 {
4080         int ret;
4081         int progress = 0;
4082         int slot;
4083         u32 nritems;
4084         int space_needed = data_size;
4085
4086         slot = path->slots[0];
4087         if (slot < btrfs_header_nritems(path->nodes[0]))
4088                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4089
4090         /*
4091          * try to push all the items after our slot into the
4092          * right leaf
4093          */
4094         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4095         if (ret < 0)
4096                 return ret;
4097
4098         if (ret == 0)
4099                 progress++;
4100
4101         nritems = btrfs_header_nritems(path->nodes[0]);
4102         /*
4103          * our goal is to get our slot at the start or end of a leaf.  If
4104          * we've done so we're done
4105          */
4106         if (path->slots[0] == 0 || path->slots[0] == nritems)
4107                 return 0;
4108
4109         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4110                 return 0;
4111
4112         /* try to push all the items before our slot into the next leaf */
4113         slot = path->slots[0];
4114         space_needed = data_size;
4115         if (slot > 0)
4116                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4117         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4118         if (ret < 0)
4119                 return ret;
4120
4121         if (ret == 0)
4122                 progress++;
4123
4124         if (progress)
4125                 return 0;
4126         return 1;
4127 }
4128
4129 /*
4130  * split the path's leaf in two, making sure there is at least data_size
4131  * available for the resulting leaf level of the path.
4132  *
4133  * returns 0 if all went well and < 0 on failure.
4134  */
4135 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4136                                struct btrfs_root *root,
4137                                const struct btrfs_key *ins_key,
4138                                struct btrfs_path *path, int data_size,
4139                                int extend)
4140 {
4141         struct btrfs_disk_key disk_key;
4142         struct extent_buffer *l;
4143         u32 nritems;
4144         int mid;
4145         int slot;
4146         struct extent_buffer *right;
4147         struct btrfs_fs_info *fs_info = root->fs_info;
4148         int ret = 0;
4149         int wret;
4150         int split;
4151         int num_doubles = 0;
4152         int tried_avoid_double = 0;
4153
4154         l = path->nodes[0];
4155         slot = path->slots[0];
4156         if (extend && data_size + btrfs_item_size_nr(l, slot) +
4157             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4158                 return -EOVERFLOW;
4159
4160         /* first try to make some room by pushing left and right */
4161         if (data_size && path->nodes[1]) {
4162                 int space_needed = data_size;
4163
4164                 if (slot < btrfs_header_nritems(l))
4165                         space_needed -= btrfs_leaf_free_space(l);
4166
4167                 wret = push_leaf_right(trans, root, path, space_needed,
4168                                        space_needed, 0, 0);
4169                 if (wret < 0)
4170                         return wret;
4171                 if (wret) {
4172                         space_needed = data_size;
4173                         if (slot > 0)
4174                                 space_needed -= btrfs_leaf_free_space(l);
4175                         wret = push_leaf_left(trans, root, path, space_needed,
4176                                               space_needed, 0, (u32)-1);
4177                         if (wret < 0)
4178                                 return wret;
4179                 }
4180                 l = path->nodes[0];
4181
4182                 /* did the pushes work? */
4183                 if (btrfs_leaf_free_space(l) >= data_size)
4184                         return 0;
4185         }
4186
4187         if (!path->nodes[1]) {
4188                 ret = insert_new_root(trans, root, path, 1);
4189                 if (ret)
4190                         return ret;
4191         }
4192 again:
4193         split = 1;
4194         l = path->nodes[0];
4195         slot = path->slots[0];
4196         nritems = btrfs_header_nritems(l);
4197         mid = (nritems + 1) / 2;
4198
4199         if (mid <= slot) {
4200                 if (nritems == 1 ||
4201                     leaf_space_used(l, mid, nritems - mid) + data_size >
4202                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4203                         if (slot >= nritems) {
4204                                 split = 0;
4205                         } else {
4206                                 mid = slot;
4207                                 if (mid != nritems &&
4208                                     leaf_space_used(l, mid, nritems - mid) +
4209                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4210                                         if (data_size && !tried_avoid_double)
4211                                                 goto push_for_double;
4212                                         split = 2;
4213                                 }
4214                         }
4215                 }
4216         } else {
4217                 if (leaf_space_used(l, 0, mid) + data_size >
4218                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4219                         if (!extend && data_size && slot == 0) {
4220                                 split = 0;
4221                         } else if ((extend || !data_size) && slot == 0) {
4222                                 mid = 1;
4223                         } else {
4224                                 mid = slot;
4225                                 if (mid != nritems &&
4226                                     leaf_space_used(l, mid, nritems - mid) +
4227                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4228                                         if (data_size && !tried_avoid_double)
4229                                                 goto push_for_double;
4230                                         split = 2;
4231                                 }
4232                         }
4233                 }
4234         }
4235
4236         if (split == 0)
4237                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4238         else
4239                 btrfs_item_key(l, &disk_key, mid);
4240
4241         /*
4242          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
4243          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
4244          * subclasses, which is 8 at the time of this patch, and we've maxed it
4245          * out.  In the future we could add a
4246          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
4247          * use BTRFS_NESTING_NEW_ROOT.
4248          */
4249         right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4250                                              l->start, 0, num_doubles ?
4251                                              BTRFS_NESTING_NEW_ROOT :
4252                                              BTRFS_NESTING_SPLIT);
4253         if (IS_ERR(right))
4254                 return PTR_ERR(right);
4255
4256         root_add_used(root, fs_info->nodesize);
4257
4258         if (split == 0) {
4259                 if (mid <= slot) {
4260                         btrfs_set_header_nritems(right, 0);
4261                         insert_ptr(trans, path, &disk_key,
4262                                    right->start, path->slots[1] + 1, 1);
4263                         btrfs_tree_unlock(path->nodes[0]);
4264                         free_extent_buffer(path->nodes[0]);
4265                         path->nodes[0] = right;
4266                         path->slots[0] = 0;
4267                         path->slots[1] += 1;
4268                 } else {
4269                         btrfs_set_header_nritems(right, 0);
4270                         insert_ptr(trans, path, &disk_key,
4271                                    right->start, path->slots[1], 1);
4272                         btrfs_tree_unlock(path->nodes[0]);
4273                         free_extent_buffer(path->nodes[0]);
4274                         path->nodes[0] = right;
4275                         path->slots[0] = 0;
4276                         if (path->slots[1] == 0)
4277                                 fixup_low_keys(path, &disk_key, 1);
4278                 }
4279                 /*
4280                  * We create a new leaf 'right' for the required ins_len and
4281                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4282                  * the content of ins_len to 'right'.
4283                  */
4284                 return ret;
4285         }
4286
4287         copy_for_split(trans, path, l, right, slot, mid, nritems);
4288
4289         if (split == 2) {
4290                 BUG_ON(num_doubles != 0);
4291                 num_doubles++;
4292                 goto again;
4293         }
4294
4295         return 0;
4296
4297 push_for_double:
4298         push_for_double_split(trans, root, path, data_size);
4299         tried_avoid_double = 1;
4300         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4301                 return 0;
4302         goto again;
4303 }
4304
4305 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4306                                          struct btrfs_root *root,
4307                                          struct btrfs_path *path, int ins_len)
4308 {
4309         struct btrfs_key key;
4310         struct extent_buffer *leaf;
4311         struct btrfs_file_extent_item *fi;
4312         u64 extent_len = 0;
4313         u32 item_size;
4314         int ret;
4315
4316         leaf = path->nodes[0];
4317         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4318
4319         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4320                key.type != BTRFS_EXTENT_CSUM_KEY);
4321
4322         if (btrfs_leaf_free_space(leaf) >= ins_len)
4323                 return 0;
4324
4325         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4326         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4327                 fi = btrfs_item_ptr(leaf, path->slots[0],
4328                                     struct btrfs_file_extent_item);
4329                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4330         }
4331         btrfs_release_path(path);
4332
4333         path->keep_locks = 1;
4334         path->search_for_split = 1;
4335         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4336         path->search_for_split = 0;
4337         if (ret > 0)
4338                 ret = -EAGAIN;
4339         if (ret < 0)
4340                 goto err;
4341
4342         ret = -EAGAIN;
4343         leaf = path->nodes[0];
4344         /* if our item isn't there, return now */
4345         if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4346                 goto err;
4347
4348         /* the leaf has  changed, it now has room.  return now */
4349         if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4350                 goto err;
4351
4352         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4353                 fi = btrfs_item_ptr(leaf, path->slots[0],
4354                                     struct btrfs_file_extent_item);
4355                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4356                         goto err;
4357         }
4358
4359         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4360         if (ret)
4361                 goto err;
4362
4363         path->keep_locks = 0;
4364         btrfs_unlock_up_safe(path, 1);
4365         return 0;
4366 err:
4367         path->keep_locks = 0;
4368         return ret;
4369 }
4370
4371 static noinline int split_item(struct btrfs_path *path,
4372                                const struct btrfs_key *new_key,
4373                                unsigned long split_offset)
4374 {
4375         struct extent_buffer *leaf;
4376         struct btrfs_item *item;
4377         struct btrfs_item *new_item;
4378         int slot;
4379         char *buf;
4380         u32 nritems;
4381         u32 item_size;
4382         u32 orig_offset;
4383         struct btrfs_disk_key disk_key;
4384
4385         leaf = path->nodes[0];
4386         BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4387
4388         item = btrfs_item_nr(path->slots[0]);
4389         orig_offset = btrfs_item_offset(leaf, item);
4390         item_size = btrfs_item_size(leaf, item);
4391
4392         buf = kmalloc(item_size, GFP_NOFS);
4393         if (!buf)
4394                 return -ENOMEM;
4395
4396         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4397                             path->slots[0]), item_size);
4398
4399         slot = path->slots[0] + 1;
4400         nritems = btrfs_header_nritems(leaf);
4401         if (slot != nritems) {
4402                 /* shift the items */
4403                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4404                                 btrfs_item_nr_offset(slot),
4405                                 (nritems - slot) * sizeof(struct btrfs_item));
4406         }
4407
4408         btrfs_cpu_key_to_disk(&disk_key, new_key);
4409         btrfs_set_item_key(leaf, &disk_key, slot);
4410
4411         new_item = btrfs_item_nr(slot);
4412
4413         btrfs_set_item_offset(leaf, new_item, orig_offset);
4414         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4415
4416         btrfs_set_item_offset(leaf, item,
4417                               orig_offset + item_size - split_offset);
4418         btrfs_set_item_size(leaf, item, split_offset);
4419
4420         btrfs_set_header_nritems(leaf, nritems + 1);
4421
4422         /* write the data for the start of the original item */
4423         write_extent_buffer(leaf, buf,
4424                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4425                             split_offset);
4426
4427         /* write the data for the new item */
4428         write_extent_buffer(leaf, buf + split_offset,
4429                             btrfs_item_ptr_offset(leaf, slot),
4430                             item_size - split_offset);
4431         btrfs_mark_buffer_dirty(leaf);
4432
4433         BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4434         kfree(buf);
4435         return 0;
4436 }
4437
4438 /*
4439  * This function splits a single item into two items,
4440  * giving 'new_key' to the new item and splitting the
4441  * old one at split_offset (from the start of the item).
4442  *
4443  * The path may be released by this operation.  After
4444  * the split, the path is pointing to the old item.  The
4445  * new item is going to be in the same node as the old one.
4446  *
4447  * Note, the item being split must be smaller enough to live alone on
4448  * a tree block with room for one extra struct btrfs_item
4449  *
4450  * This allows us to split the item in place, keeping a lock on the
4451  * leaf the entire time.
4452  */
4453 int btrfs_split_item(struct btrfs_trans_handle *trans,
4454                      struct btrfs_root *root,
4455                      struct btrfs_path *path,
4456                      const struct btrfs_key *new_key,
4457                      unsigned long split_offset)
4458 {
4459         int ret;
4460         ret = setup_leaf_for_split(trans, root, path,
4461                                    sizeof(struct btrfs_item));
4462         if (ret)
4463                 return ret;
4464
4465         ret = split_item(path, new_key, split_offset);
4466         return ret;
4467 }
4468
4469 /*
4470  * This function duplicate a item, giving 'new_key' to the new item.
4471  * It guarantees both items live in the same tree leaf and the new item
4472  * is contiguous with the original item.
4473  *
4474  * This allows us to split file extent in place, keeping a lock on the
4475  * leaf the entire time.
4476  */
4477 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4478                          struct btrfs_root *root,
4479                          struct btrfs_path *path,
4480                          const struct btrfs_key *new_key)
4481 {
4482         struct extent_buffer *leaf;
4483         int ret;
4484         u32 item_size;
4485
4486         leaf = path->nodes[0];
4487         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4488         ret = setup_leaf_for_split(trans, root, path,
4489                                    item_size + sizeof(struct btrfs_item));
4490         if (ret)
4491                 return ret;
4492
4493         path->slots[0]++;
4494         setup_items_for_insert(root, path, new_key, &item_size, 1);
4495         leaf = path->nodes[0];
4496         memcpy_extent_buffer(leaf,
4497                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4498                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4499                              item_size);
4500         return 0;
4501 }
4502
4503 /*
4504  * make the item pointed to by the path smaller.  new_size indicates
4505  * how small to make it, and from_end tells us if we just chop bytes
4506  * off the end of the item or if we shift the item to chop bytes off
4507  * the front.
4508  */
4509 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4510 {
4511         int slot;
4512         struct extent_buffer *leaf;
4513         struct btrfs_item *item;
4514         u32 nritems;
4515         unsigned int data_end;
4516         unsigned int old_data_start;
4517         unsigned int old_size;
4518         unsigned int size_diff;
4519         int i;
4520         struct btrfs_map_token token;
4521
4522         leaf = path->nodes[0];
4523         slot = path->slots[0];
4524
4525         old_size = btrfs_item_size_nr(leaf, slot);
4526         if (old_size == new_size)
4527                 return;
4528
4529         nritems = btrfs_header_nritems(leaf);
4530         data_end = leaf_data_end(leaf);
4531
4532         old_data_start = btrfs_item_offset_nr(leaf, slot);
4533
4534         size_diff = old_size - new_size;
4535
4536         BUG_ON(slot < 0);
4537         BUG_ON(slot >= nritems);
4538
4539         /*
4540          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4541          */
4542         /* first correct the data pointers */
4543         btrfs_init_map_token(&token, leaf);
4544         for (i = slot; i < nritems; i++) {
4545                 u32 ioff;
4546                 item = btrfs_item_nr(i);
4547
4548                 ioff = btrfs_token_item_offset(&token, item);
4549                 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
4550         }
4551
4552         /* shift the data */
4553         if (from_end) {
4554                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4555                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4556                               data_end, old_data_start + new_size - data_end);
4557         } else {
4558                 struct btrfs_disk_key disk_key;
4559                 u64 offset;
4560
4561                 btrfs_item_key(leaf, &disk_key, slot);
4562
4563                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4564                         unsigned long ptr;
4565                         struct btrfs_file_extent_item *fi;
4566
4567                         fi = btrfs_item_ptr(leaf, slot,
4568                                             struct btrfs_file_extent_item);
4569                         fi = (struct btrfs_file_extent_item *)(
4570                              (unsigned long)fi - size_diff);
4571
4572                         if (btrfs_file_extent_type(leaf, fi) ==
4573                             BTRFS_FILE_EXTENT_INLINE) {
4574                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4575                                 memmove_extent_buffer(leaf, ptr,
4576                                       (unsigned long)fi,
4577                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
4578                         }
4579                 }
4580
4581                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4582                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4583                               data_end, old_data_start - data_end);
4584
4585                 offset = btrfs_disk_key_offset(&disk_key);
4586                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4587                 btrfs_set_item_key(leaf, &disk_key, slot);
4588                 if (slot == 0)
4589                         fixup_low_keys(path, &disk_key, 1);
4590         }
4591
4592         item = btrfs_item_nr(slot);
4593         btrfs_set_item_size(leaf, item, new_size);
4594         btrfs_mark_buffer_dirty(leaf);
4595
4596         if (btrfs_leaf_free_space(leaf) < 0) {
4597                 btrfs_print_leaf(leaf);
4598                 BUG();
4599         }
4600 }
4601
4602 /*
4603  * make the item pointed to by the path bigger, data_size is the added size.
4604  */
4605 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4606 {
4607         int slot;
4608         struct extent_buffer *leaf;
4609         struct btrfs_item *item;
4610         u32 nritems;
4611         unsigned int data_end;
4612         unsigned int old_data;
4613         unsigned int old_size;
4614         int i;
4615         struct btrfs_map_token token;
4616
4617         leaf = path->nodes[0];
4618
4619         nritems = btrfs_header_nritems(leaf);
4620         data_end = leaf_data_end(leaf);
4621
4622         if (btrfs_leaf_free_space(leaf) < data_size) {
4623                 btrfs_print_leaf(leaf);
4624                 BUG();
4625         }
4626         slot = path->slots[0];
4627         old_data = btrfs_item_end_nr(leaf, slot);
4628
4629         BUG_ON(slot < 0);
4630         if (slot >= nritems) {
4631                 btrfs_print_leaf(leaf);
4632                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4633                            slot, nritems);
4634                 BUG();
4635         }
4636
4637         /*
4638          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4639          */
4640         /* first correct the data pointers */
4641         btrfs_init_map_token(&token, leaf);
4642         for (i = slot; i < nritems; i++) {
4643                 u32 ioff;
4644                 item = btrfs_item_nr(i);
4645
4646                 ioff = btrfs_token_item_offset(&token, item);
4647                 btrfs_set_token_item_offset(&token, item, ioff - data_size);
4648         }
4649
4650         /* shift the data */
4651         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4652                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4653                       data_end, old_data - data_end);
4654
4655         data_end = old_data;
4656         old_size = btrfs_item_size_nr(leaf, slot);
4657         item = btrfs_item_nr(slot);
4658         btrfs_set_item_size(leaf, item, old_size + data_size);
4659         btrfs_mark_buffer_dirty(leaf);
4660
4661         if (btrfs_leaf_free_space(leaf) < 0) {
4662                 btrfs_print_leaf(leaf);
4663                 BUG();
4664         }
4665 }
4666
4667 /**
4668  * setup_items_for_insert - Helper called before inserting one or more items
4669  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
4670  * in a function that doesn't call btrfs_search_slot
4671  *
4672  * @root:       root we are inserting items to
4673  * @path:       points to the leaf/slot where we are going to insert new items
4674  * @cpu_key:    array of keys for items to be inserted
4675  * @data_size:  size of the body of each item we are going to insert
4676  * @nr:         size of @cpu_key/@data_size arrays
4677  */
4678 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4679                             const struct btrfs_key *cpu_key, u32 *data_size,
4680                             int nr)
4681 {
4682         struct btrfs_fs_info *fs_info = root->fs_info;
4683         struct btrfs_item *item;
4684         int i;
4685         u32 nritems;
4686         unsigned int data_end;
4687         struct btrfs_disk_key disk_key;
4688         struct extent_buffer *leaf;
4689         int slot;
4690         struct btrfs_map_token token;
4691         u32 total_size;
4692         u32 total_data = 0;
4693
4694         for (i = 0; i < nr; i++)
4695                 total_data += data_size[i];
4696         total_size = total_data + (nr * sizeof(struct btrfs_item));
4697
4698         if (path->slots[0] == 0) {
4699                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4700                 fixup_low_keys(path, &disk_key, 1);
4701         }
4702         btrfs_unlock_up_safe(path, 1);
4703
4704         leaf = path->nodes[0];
4705         slot = path->slots[0];
4706
4707         nritems = btrfs_header_nritems(leaf);
4708         data_end = leaf_data_end(leaf);
4709
4710         if (btrfs_leaf_free_space(leaf) < total_size) {
4711                 btrfs_print_leaf(leaf);
4712                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4713                            total_size, btrfs_leaf_free_space(leaf));
4714                 BUG();
4715         }
4716
4717         btrfs_init_map_token(&token, leaf);
4718         if (slot != nritems) {
4719                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4720
4721                 if (old_data < data_end) {
4722                         btrfs_print_leaf(leaf);
4723                         btrfs_crit(fs_info,
4724                 "item at slot %d with data offset %u beyond data end of leaf %u",
4725                                    slot, old_data, data_end);
4726                         BUG();
4727                 }
4728                 /*
4729                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4730                  */
4731                 /* first correct the data pointers */
4732                 for (i = slot; i < nritems; i++) {
4733                         u32 ioff;
4734
4735                         item = btrfs_item_nr(i);
4736                         ioff = btrfs_token_item_offset(&token, item);
4737                         btrfs_set_token_item_offset(&token, item,
4738                                                     ioff - total_data);
4739                 }
4740                 /* shift the items */
4741                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4742                               btrfs_item_nr_offset(slot),
4743                               (nritems - slot) * sizeof(struct btrfs_item));
4744
4745                 /* shift the data */
4746                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4747                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4748                               data_end, old_data - data_end);
4749                 data_end = old_data;
4750         }
4751
4752         /* setup the item for the new data */
4753         for (i = 0; i < nr; i++) {
4754                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4755                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4756                 item = btrfs_item_nr(slot + i);
4757                 data_end -= data_size[i];
4758                 btrfs_set_token_item_offset(&token, item, data_end);
4759                 btrfs_set_token_item_size(&token, item, data_size[i]);
4760         }
4761
4762         btrfs_set_header_nritems(leaf, nritems + nr);
4763         btrfs_mark_buffer_dirty(leaf);
4764
4765         if (btrfs_leaf_free_space(leaf) < 0) {
4766                 btrfs_print_leaf(leaf);
4767                 BUG();
4768         }
4769 }
4770
4771 /*
4772  * Given a key and some data, insert items into the tree.
4773  * This does all the path init required, making room in the tree if needed.
4774  */
4775 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4776                             struct btrfs_root *root,
4777                             struct btrfs_path *path,
4778                             const struct btrfs_key *cpu_key, u32 *data_size,
4779                             int nr)
4780 {
4781         int ret = 0;
4782         int slot;
4783         int i;
4784         u32 total_size = 0;
4785         u32 total_data = 0;
4786
4787         for (i = 0; i < nr; i++)
4788                 total_data += data_size[i];
4789
4790         total_size = total_data + (nr * sizeof(struct btrfs_item));
4791         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4792         if (ret == 0)
4793                 return -EEXIST;
4794         if (ret < 0)
4795                 return ret;
4796
4797         slot = path->slots[0];
4798         BUG_ON(slot < 0);
4799
4800         setup_items_for_insert(root, path, cpu_key, data_size, nr);
4801         return 0;
4802 }
4803
4804 /*
4805  * Given a key and some data, insert an item into the tree.
4806  * This does all the path init required, making room in the tree if needed.
4807  */
4808 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4809                       const struct btrfs_key *cpu_key, void *data,
4810                       u32 data_size)
4811 {
4812         int ret = 0;
4813         struct btrfs_path *path;
4814         struct extent_buffer *leaf;
4815         unsigned long ptr;
4816
4817         path = btrfs_alloc_path();
4818         if (!path)
4819                 return -ENOMEM;
4820         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4821         if (!ret) {
4822                 leaf = path->nodes[0];
4823                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4824                 write_extent_buffer(leaf, data, ptr, data_size);
4825                 btrfs_mark_buffer_dirty(leaf);
4826         }
4827         btrfs_free_path(path);
4828         return ret;
4829 }
4830
4831 /*
4832  * delete the pointer from a given node.
4833  *
4834  * the tree should have been previously balanced so the deletion does not
4835  * empty a node.
4836  */
4837 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4838                     int level, int slot)
4839 {
4840         struct extent_buffer *parent = path->nodes[level];
4841         u32 nritems;
4842         int ret;
4843
4844         nritems = btrfs_header_nritems(parent);
4845         if (slot != nritems - 1) {
4846                 if (level) {
4847                         ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4848                                         nritems - slot - 1);
4849                         BUG_ON(ret < 0);
4850                 }
4851                 memmove_extent_buffer(parent,
4852                               btrfs_node_key_ptr_offset(slot),
4853                               btrfs_node_key_ptr_offset(slot + 1),
4854                               sizeof(struct btrfs_key_ptr) *
4855                               (nritems - slot - 1));
4856         } else if (level) {
4857                 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4858                                 GFP_NOFS);
4859                 BUG_ON(ret < 0);
4860         }
4861
4862         nritems--;
4863         btrfs_set_header_nritems(parent, nritems);
4864         if (nritems == 0 && parent == root->node) {
4865                 BUG_ON(btrfs_header_level(root->node) != 1);
4866                 /* just turn the root into a leaf and break */
4867                 btrfs_set_header_level(root->node, 0);
4868         } else if (slot == 0) {
4869                 struct btrfs_disk_key disk_key;
4870
4871                 btrfs_node_key(parent, &disk_key, 0);
4872                 fixup_low_keys(path, &disk_key, level + 1);
4873         }
4874         btrfs_mark_buffer_dirty(parent);
4875 }
4876
4877 /*
4878  * a helper function to delete the leaf pointed to by path->slots[1] and
4879  * path->nodes[1].
4880  *
4881  * This deletes the pointer in path->nodes[1] and frees the leaf
4882  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4883  *
4884  * The path must have already been setup for deleting the leaf, including
4885  * all the proper balancing.  path->nodes[1] must be locked.
4886  */
4887 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4888                                     struct btrfs_root *root,
4889                                     struct btrfs_path *path,
4890                                     struct extent_buffer *leaf)
4891 {
4892         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4893         del_ptr(root, path, 1, path->slots[1]);
4894
4895         /*
4896          * btrfs_free_extent is expensive, we want to make sure we
4897          * aren't holding any locks when we call it
4898          */
4899         btrfs_unlock_up_safe(path, 0);
4900
4901         root_sub_used(root, leaf->len);
4902
4903         atomic_inc(&leaf->refs);
4904         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4905         free_extent_buffer_stale(leaf);
4906 }
4907 /*
4908  * delete the item at the leaf level in path.  If that empties
4909  * the leaf, remove it from the tree
4910  */
4911 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4912                     struct btrfs_path *path, int slot, int nr)
4913 {
4914         struct btrfs_fs_info *fs_info = root->fs_info;
4915         struct extent_buffer *leaf;
4916         struct btrfs_item *item;
4917         u32 last_off;
4918         u32 dsize = 0;
4919         int ret = 0;
4920         int wret;
4921         int i;
4922         u32 nritems;
4923
4924         leaf = path->nodes[0];
4925         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4926
4927         for (i = 0; i < nr; i++)
4928                 dsize += btrfs_item_size_nr(leaf, slot + i);
4929
4930         nritems = btrfs_header_nritems(leaf);
4931
4932         if (slot + nr != nritems) {
4933                 int data_end = leaf_data_end(leaf);
4934                 struct btrfs_map_token token;
4935
4936                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4937                               data_end + dsize,
4938                               BTRFS_LEAF_DATA_OFFSET + data_end,
4939                               last_off - data_end);
4940
4941                 btrfs_init_map_token(&token, leaf);
4942                 for (i = slot + nr; i < nritems; i++) {
4943                         u32 ioff;
4944
4945                         item = btrfs_item_nr(i);
4946                         ioff = btrfs_token_item_offset(&token, item);
4947                         btrfs_set_token_item_offset(&token, item, ioff + dsize);
4948                 }
4949
4950                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4951                               btrfs_item_nr_offset(slot + nr),
4952                               sizeof(struct btrfs_item) *
4953                               (nritems - slot - nr));
4954         }
4955         btrfs_set_header_nritems(leaf, nritems - nr);
4956         nritems -= nr;
4957
4958         /* delete the leaf if we've emptied it */
4959         if (nritems == 0) {
4960                 if (leaf == root->node) {
4961                         btrfs_set_header_level(leaf, 0);
4962                 } else {
4963                         btrfs_clean_tree_block(leaf);
4964                         btrfs_del_leaf(trans, root, path, leaf);
4965                 }
4966         } else {
4967                 int used = leaf_space_used(leaf, 0, nritems);
4968                 if (slot == 0) {
4969                         struct btrfs_disk_key disk_key;
4970
4971                         btrfs_item_key(leaf, &disk_key, 0);
4972                         fixup_low_keys(path, &disk_key, 1);
4973                 }
4974
4975                 /* delete the leaf if it is mostly empty */
4976                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4977                         /* push_leaf_left fixes the path.
4978                          * make sure the path still points to our leaf
4979                          * for possible call to del_ptr below
4980                          */
4981                         slot = path->slots[1];
4982                         atomic_inc(&leaf->refs);
4983
4984                         wret = push_leaf_left(trans, root, path, 1, 1,
4985                                               1, (u32)-1);
4986                         if (wret < 0 && wret != -ENOSPC)
4987                                 ret = wret;
4988
4989                         if (path->nodes[0] == leaf &&
4990                             btrfs_header_nritems(leaf)) {
4991                                 wret = push_leaf_right(trans, root, path, 1,
4992                                                        1, 1, 0);
4993                                 if (wret < 0 && wret != -ENOSPC)
4994                                         ret = wret;
4995                         }
4996
4997                         if (btrfs_header_nritems(leaf) == 0) {
4998                                 path->slots[1] = slot;
4999                                 btrfs_del_leaf(trans, root, path, leaf);
5000                                 free_extent_buffer(leaf);
5001                                 ret = 0;
5002                         } else {
5003                                 /* if we're still in the path, make sure
5004                                  * we're dirty.  Otherwise, one of the
5005                                  * push_leaf functions must have already
5006                                  * dirtied this buffer
5007                                  */
5008                                 if (path->nodes[0] == leaf)
5009                                         btrfs_mark_buffer_dirty(leaf);
5010                                 free_extent_buffer(leaf);
5011                         }
5012                 } else {
5013                         btrfs_mark_buffer_dirty(leaf);
5014                 }
5015         }
5016         return ret;
5017 }
5018
5019 /*
5020  * search the tree again to find a leaf with lesser keys
5021  * returns 0 if it found something or 1 if there are no lesser leaves.
5022  * returns < 0 on io errors.
5023  *
5024  * This may release the path, and so you may lose any locks held at the
5025  * time you call it.
5026  */
5027 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5028 {
5029         struct btrfs_key key;
5030         struct btrfs_disk_key found_key;
5031         int ret;
5032
5033         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5034
5035         if (key.offset > 0) {
5036                 key.offset--;
5037         } else if (key.type > 0) {
5038                 key.type--;
5039                 key.offset = (u64)-1;
5040         } else if (key.objectid > 0) {
5041                 key.objectid--;
5042                 key.type = (u8)-1;
5043                 key.offset = (u64)-1;
5044         } else {
5045                 return 1;
5046         }
5047
5048         btrfs_release_path(path);
5049         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5050         if (ret < 0)
5051                 return ret;
5052         btrfs_item_key(path->nodes[0], &found_key, 0);
5053         ret = comp_keys(&found_key, &key);
5054         /*
5055          * We might have had an item with the previous key in the tree right
5056          * before we released our path. And after we released our path, that
5057          * item might have been pushed to the first slot (0) of the leaf we
5058          * were holding due to a tree balance. Alternatively, an item with the
5059          * previous key can exist as the only element of a leaf (big fat item).
5060          * Therefore account for these 2 cases, so that our callers (like
5061          * btrfs_previous_item) don't miss an existing item with a key matching
5062          * the previous key we computed above.
5063          */
5064         if (ret <= 0)
5065                 return 0;
5066         return 1;
5067 }
5068
5069 /*
5070  * A helper function to walk down the tree starting at min_key, and looking
5071  * for nodes or leaves that are have a minimum transaction id.
5072  * This is used by the btree defrag code, and tree logging
5073  *
5074  * This does not cow, but it does stuff the starting key it finds back
5075  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5076  * key and get a writable path.
5077  *
5078  * This honors path->lowest_level to prevent descent past a given level
5079  * of the tree.
5080  *
5081  * min_trans indicates the oldest transaction that you are interested
5082  * in walking through.  Any nodes or leaves older than min_trans are
5083  * skipped over (without reading them).
5084  *
5085  * returns zero if something useful was found, < 0 on error and 1 if there
5086  * was nothing in the tree that matched the search criteria.
5087  */
5088 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5089                          struct btrfs_path *path,
5090                          u64 min_trans)
5091 {
5092         struct extent_buffer *cur;
5093         struct btrfs_key found_key;
5094         int slot;
5095         int sret;
5096         u32 nritems;
5097         int level;
5098         int ret = 1;
5099         int keep_locks = path->keep_locks;
5100
5101         path->keep_locks = 1;
5102 again:
5103         cur = btrfs_read_lock_root_node(root);
5104         level = btrfs_header_level(cur);
5105         WARN_ON(path->nodes[level]);
5106         path->nodes[level] = cur;
5107         path->locks[level] = BTRFS_READ_LOCK;
5108
5109         if (btrfs_header_generation(cur) < min_trans) {
5110                 ret = 1;
5111                 goto out;
5112         }
5113         while (1) {
5114                 nritems = btrfs_header_nritems(cur);
5115                 level = btrfs_header_level(cur);
5116                 sret = btrfs_bin_search(cur, min_key, &slot);
5117                 if (sret < 0) {
5118                         ret = sret;
5119                         goto out;
5120                 }
5121
5122                 /* at the lowest level, we're done, setup the path and exit */
5123                 if (level == path->lowest_level) {
5124                         if (slot >= nritems)
5125                                 goto find_next_key;
5126                         ret = 0;
5127                         path->slots[level] = slot;
5128                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5129                         goto out;
5130                 }
5131                 if (sret && slot > 0)
5132                         slot--;
5133                 /*
5134                  * check this node pointer against the min_trans parameters.
5135                  * If it is too old, skip to the next one.
5136                  */
5137                 while (slot < nritems) {
5138                         u64 gen;
5139
5140                         gen = btrfs_node_ptr_generation(cur, slot);
5141                         if (gen < min_trans) {
5142                                 slot++;
5143                                 continue;
5144                         }
5145                         break;
5146                 }
5147 find_next_key:
5148                 /*
5149                  * we didn't find a candidate key in this node, walk forward
5150                  * and find another one
5151                  */
5152                 if (slot >= nritems) {
5153                         path->slots[level] = slot;
5154                         sret = btrfs_find_next_key(root, path, min_key, level,
5155                                                   min_trans);
5156                         if (sret == 0) {
5157                                 btrfs_release_path(path);
5158                                 goto again;
5159                         } else {
5160                                 goto out;
5161                         }
5162                 }
5163                 /* save our key for returning back */
5164                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5165                 path->slots[level] = slot;
5166                 if (level == path->lowest_level) {
5167                         ret = 0;
5168                         goto out;
5169                 }
5170                 cur = btrfs_read_node_slot(cur, slot);
5171                 if (IS_ERR(cur)) {
5172                         ret = PTR_ERR(cur);
5173                         goto out;
5174                 }
5175
5176                 btrfs_tree_read_lock(cur);
5177
5178                 path->locks[level - 1] = BTRFS_READ_LOCK;
5179                 path->nodes[level - 1] = cur;
5180                 unlock_up(path, level, 1, 0, NULL);
5181         }
5182 out:
5183         path->keep_locks = keep_locks;
5184         if (ret == 0) {
5185                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5186                 memcpy(min_key, &found_key, sizeof(found_key));
5187         }
5188         return ret;
5189 }
5190
5191 /*
5192  * this is similar to btrfs_next_leaf, but does not try to preserve
5193  * and fixup the path.  It looks for and returns the next key in the
5194  * tree based on the current path and the min_trans parameters.
5195  *
5196  * 0 is returned if another key is found, < 0 if there are any errors
5197  * and 1 is returned if there are no higher keys in the tree
5198  *
5199  * path->keep_locks should be set to 1 on the search made before
5200  * calling this function.
5201  */
5202 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5203                         struct btrfs_key *key, int level, u64 min_trans)
5204 {
5205         int slot;
5206         struct extent_buffer *c;
5207
5208         WARN_ON(!path->keep_locks && !path->skip_locking);
5209         while (level < BTRFS_MAX_LEVEL) {
5210                 if (!path->nodes[level])
5211                         return 1;
5212
5213                 slot = path->slots[level] + 1;
5214                 c = path->nodes[level];
5215 next:
5216                 if (slot >= btrfs_header_nritems(c)) {
5217                         int ret;
5218                         int orig_lowest;
5219                         struct btrfs_key cur_key;
5220                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5221                             !path->nodes[level + 1])
5222                                 return 1;
5223
5224                         if (path->locks[level + 1] || path->skip_locking) {
5225                                 level++;
5226                                 continue;
5227                         }
5228
5229                         slot = btrfs_header_nritems(c) - 1;
5230                         if (level == 0)
5231                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5232                         else
5233                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5234
5235                         orig_lowest = path->lowest_level;
5236                         btrfs_release_path(path);
5237                         path->lowest_level = level;
5238                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5239                                                 0, 0);
5240                         path->lowest_level = orig_lowest;
5241                         if (ret < 0)
5242                                 return ret;
5243
5244                         c = path->nodes[level];
5245                         slot = path->slots[level];
5246                         if (ret == 0)
5247                                 slot++;
5248                         goto next;
5249                 }
5250
5251                 if (level == 0)
5252                         btrfs_item_key_to_cpu(c, key, slot);
5253                 else {
5254                         u64 gen = btrfs_node_ptr_generation(c, slot);
5255
5256                         if (gen < min_trans) {
5257                                 slot++;
5258                                 goto next;
5259                         }
5260                         btrfs_node_key_to_cpu(c, key, slot);
5261                 }
5262                 return 0;
5263         }
5264         return 1;
5265 }
5266
5267 /*
5268  * search the tree again to find a leaf with greater keys
5269  * returns 0 if it found something or 1 if there are no greater leaves.
5270  * returns < 0 on io errors.
5271  */
5272 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5273 {
5274         return btrfs_next_old_leaf(root, path, 0);
5275 }
5276
5277 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5278                         u64 time_seq)
5279 {
5280         int slot;
5281         int level;
5282         struct extent_buffer *c;
5283         struct extent_buffer *next;
5284         struct btrfs_key key;
5285         u32 nritems;
5286         int ret;
5287         int i;
5288
5289         nritems = btrfs_header_nritems(path->nodes[0]);
5290         if (nritems == 0)
5291                 return 1;
5292
5293         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5294 again:
5295         level = 1;
5296         next = NULL;
5297         btrfs_release_path(path);
5298
5299         path->keep_locks = 1;
5300
5301         if (time_seq)
5302                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5303         else
5304                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5305         path->keep_locks = 0;
5306
5307         if (ret < 0)
5308                 return ret;
5309
5310         nritems = btrfs_header_nritems(path->nodes[0]);
5311         /*
5312          * by releasing the path above we dropped all our locks.  A balance
5313          * could have added more items next to the key that used to be
5314          * at the very end of the block.  So, check again here and
5315          * advance the path if there are now more items available.
5316          */
5317         if (nritems > 0 && path->slots[0] < nritems - 1) {
5318                 if (ret == 0)
5319                         path->slots[0]++;
5320                 ret = 0;
5321                 goto done;
5322         }
5323         /*
5324          * So the above check misses one case:
5325          * - after releasing the path above, someone has removed the item that
5326          *   used to be at the very end of the block, and balance between leafs
5327          *   gets another one with bigger key.offset to replace it.
5328          *
5329          * This one should be returned as well, or we can get leaf corruption
5330          * later(esp. in __btrfs_drop_extents()).
5331          *
5332          * And a bit more explanation about this check,
5333          * with ret > 0, the key isn't found, the path points to the slot
5334          * where it should be inserted, so the path->slots[0] item must be the
5335          * bigger one.
5336          */
5337         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5338                 ret = 0;
5339                 goto done;
5340         }
5341
5342         while (level < BTRFS_MAX_LEVEL) {
5343                 if (!path->nodes[level]) {
5344                         ret = 1;
5345                         goto done;
5346                 }
5347
5348                 slot = path->slots[level] + 1;
5349                 c = path->nodes[level];
5350                 if (slot >= btrfs_header_nritems(c)) {
5351                         level++;
5352                         if (level == BTRFS_MAX_LEVEL) {
5353                                 ret = 1;
5354                                 goto done;
5355                         }
5356                         continue;
5357                 }
5358
5359
5360                 /*
5361                  * Our current level is where we're going to start from, and to
5362                  * make sure lockdep doesn't complain we need to drop our locks
5363                  * and nodes from 0 to our current level.
5364                  */
5365                 for (i = 0; i < level; i++) {
5366                         if (path->locks[level]) {
5367                                 btrfs_tree_read_unlock(path->nodes[i]);
5368                                 path->locks[i] = 0;
5369                         }
5370                         free_extent_buffer(path->nodes[i]);
5371                         path->nodes[i] = NULL;
5372                 }
5373
5374                 next = c;
5375                 ret = read_block_for_search(root, path, &next, level,
5376                                             slot, &key);
5377                 if (ret == -EAGAIN)
5378                         goto again;
5379
5380                 if (ret < 0) {
5381                         btrfs_release_path(path);
5382                         goto done;
5383                 }
5384
5385                 if (!path->skip_locking) {
5386                         ret = btrfs_try_tree_read_lock(next);
5387                         if (!ret && time_seq) {
5388                                 /*
5389                                  * If we don't get the lock, we may be racing
5390                                  * with push_leaf_left, holding that lock while
5391                                  * itself waiting for the leaf we've currently
5392                                  * locked. To solve this situation, we give up
5393                                  * on our lock and cycle.
5394                                  */
5395                                 free_extent_buffer(next);
5396                                 btrfs_release_path(path);
5397                                 cond_resched();
5398                                 goto again;
5399                         }
5400                         if (!ret)
5401                                 btrfs_tree_read_lock(next);
5402                 }
5403                 break;
5404         }
5405         path->slots[level] = slot;
5406         while (1) {
5407                 level--;
5408                 path->nodes[level] = next;
5409                 path->slots[level] = 0;
5410                 if (!path->skip_locking)
5411                         path->locks[level] = BTRFS_READ_LOCK;
5412                 if (!level)
5413                         break;
5414
5415                 ret = read_block_for_search(root, path, &next, level,
5416                                             0, &key);
5417                 if (ret == -EAGAIN)
5418                         goto again;
5419
5420                 if (ret < 0) {
5421                         btrfs_release_path(path);
5422                         goto done;
5423                 }
5424
5425                 if (!path->skip_locking)
5426                         btrfs_tree_read_lock(next);
5427         }
5428         ret = 0;
5429 done:
5430         unlock_up(path, 0, 1, 0, NULL);
5431
5432         return ret;
5433 }
5434
5435 /*
5436  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5437  * searching until it gets past min_objectid or finds an item of 'type'
5438  *
5439  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5440  */
5441 int btrfs_previous_item(struct btrfs_root *root,
5442                         struct btrfs_path *path, u64 min_objectid,
5443                         int type)
5444 {
5445         struct btrfs_key found_key;
5446         struct extent_buffer *leaf;
5447         u32 nritems;
5448         int ret;
5449
5450         while (1) {
5451                 if (path->slots[0] == 0) {
5452                         ret = btrfs_prev_leaf(root, path);
5453                         if (ret != 0)
5454                                 return ret;
5455                 } else {
5456                         path->slots[0]--;
5457                 }
5458                 leaf = path->nodes[0];
5459                 nritems = btrfs_header_nritems(leaf);
5460                 if (nritems == 0)
5461                         return 1;
5462                 if (path->slots[0] == nritems)
5463                         path->slots[0]--;
5464
5465                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5466                 if (found_key.objectid < min_objectid)
5467                         break;
5468                 if (found_key.type == type)
5469                         return 0;
5470                 if (found_key.objectid == min_objectid &&
5471                     found_key.type < type)
5472                         break;
5473         }
5474         return 1;
5475 }
5476
5477 /*
5478  * search in extent tree to find a previous Metadata/Data extent item with
5479  * min objecitd.
5480  *
5481  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5482  */
5483 int btrfs_previous_extent_item(struct btrfs_root *root,
5484                         struct btrfs_path *path, u64 min_objectid)
5485 {
5486         struct btrfs_key found_key;
5487         struct extent_buffer *leaf;
5488         u32 nritems;
5489         int ret;
5490
5491         while (1) {
5492                 if (path->slots[0] == 0) {
5493                         ret = btrfs_prev_leaf(root, path);
5494                         if (ret != 0)
5495                                 return ret;
5496                 } else {
5497                         path->slots[0]--;
5498                 }
5499                 leaf = path->nodes[0];
5500                 nritems = btrfs_header_nritems(leaf);
5501                 if (nritems == 0)
5502                         return 1;
5503                 if (path->slots[0] == nritems)
5504                         path->slots[0]--;
5505
5506                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5507                 if (found_key.objectid < min_objectid)
5508                         break;
5509                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5510                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5511                         return 0;
5512                 if (found_key.objectid == min_objectid &&
5513                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5514                         break;
5515         }
5516         return 1;
5517 }