Btrfs: deal with enomem in the rewind path
[platform/adaptation/renesas_rcar/renesas_kernel.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31                       *root, struct btrfs_key *ins_key,
32                       struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34                           struct btrfs_root *root, struct extent_buffer *dst,
35                           struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43                                  struct extent_buffer *eb);
44 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
45
46 struct btrfs_path *btrfs_alloc_path(void)
47 {
48         struct btrfs_path *path;
49         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
50         return path;
51 }
52
53 /*
54  * set all locked nodes in the path to blocking locks.  This should
55  * be done before scheduling
56  */
57 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
58 {
59         int i;
60         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
61                 if (!p->nodes[i] || !p->locks[i])
62                         continue;
63                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
64                 if (p->locks[i] == BTRFS_READ_LOCK)
65                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
66                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
67                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
68         }
69 }
70
71 /*
72  * reset all the locked nodes in the patch to spinning locks.
73  *
74  * held is used to keep lockdep happy, when lockdep is enabled
75  * we set held to a blocking lock before we go around and
76  * retake all the spinlocks in the path.  You can safely use NULL
77  * for held
78  */
79 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
80                                         struct extent_buffer *held, int held_rw)
81 {
82         int i;
83
84 #ifdef CONFIG_DEBUG_LOCK_ALLOC
85         /* lockdep really cares that we take all of these spinlocks
86          * in the right order.  If any of the locks in the path are not
87          * currently blocking, it is going to complain.  So, make really
88          * really sure by forcing the path to blocking before we clear
89          * the path blocking.
90          */
91         if (held) {
92                 btrfs_set_lock_blocking_rw(held, held_rw);
93                 if (held_rw == BTRFS_WRITE_LOCK)
94                         held_rw = BTRFS_WRITE_LOCK_BLOCKING;
95                 else if (held_rw == BTRFS_READ_LOCK)
96                         held_rw = BTRFS_READ_LOCK_BLOCKING;
97         }
98         btrfs_set_path_blocking(p);
99 #endif
100
101         for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
102                 if (p->nodes[i] && p->locks[i]) {
103                         btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
104                         if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
105                                 p->locks[i] = BTRFS_WRITE_LOCK;
106                         else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
107                                 p->locks[i] = BTRFS_READ_LOCK;
108                 }
109         }
110
111 #ifdef CONFIG_DEBUG_LOCK_ALLOC
112         if (held)
113                 btrfs_clear_lock_blocking_rw(held, held_rw);
114 #endif
115 }
116
117 /* this also releases the path */
118 void btrfs_free_path(struct btrfs_path *p)
119 {
120         if (!p)
121                 return;
122         btrfs_release_path(p);
123         kmem_cache_free(btrfs_path_cachep, p);
124 }
125
126 /*
127  * path release drops references on the extent buffers in the path
128  * and it drops any locks held by this path
129  *
130  * It is safe to call this on paths that no locks or extent buffers held.
131  */
132 noinline void btrfs_release_path(struct btrfs_path *p)
133 {
134         int i;
135
136         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
137                 p->slots[i] = 0;
138                 if (!p->nodes[i])
139                         continue;
140                 if (p->locks[i]) {
141                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
142                         p->locks[i] = 0;
143                 }
144                 free_extent_buffer(p->nodes[i]);
145                 p->nodes[i] = NULL;
146         }
147 }
148
149 /*
150  * safely gets a reference on the root node of a tree.  A lock
151  * is not taken, so a concurrent writer may put a different node
152  * at the root of the tree.  See btrfs_lock_root_node for the
153  * looping required.
154  *
155  * The extent buffer returned by this has a reference taken, so
156  * it won't disappear.  It may stop being the root of the tree
157  * at any time because there are no locks held.
158  */
159 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
160 {
161         struct extent_buffer *eb;
162
163         while (1) {
164                 rcu_read_lock();
165                 eb = rcu_dereference(root->node);
166
167                 /*
168                  * RCU really hurts here, we could free up the root node because
169                  * it was cow'ed but we may not get the new root node yet so do
170                  * the inc_not_zero dance and if it doesn't work then
171                  * synchronize_rcu and try again.
172                  */
173                 if (atomic_inc_not_zero(&eb->refs)) {
174                         rcu_read_unlock();
175                         break;
176                 }
177                 rcu_read_unlock();
178                 synchronize_rcu();
179         }
180         return eb;
181 }
182
183 /* loop around taking references on and locking the root node of the
184  * tree until you end up with a lock on the root.  A locked buffer
185  * is returned, with a reference held.
186  */
187 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
188 {
189         struct extent_buffer *eb;
190
191         while (1) {
192                 eb = btrfs_root_node(root);
193                 btrfs_tree_lock(eb);
194                 if (eb == root->node)
195                         break;
196                 btrfs_tree_unlock(eb);
197                 free_extent_buffer(eb);
198         }
199         return eb;
200 }
201
202 /* loop around taking references on and locking the root node of the
203  * tree until you end up with a lock on the root.  A locked buffer
204  * is returned, with a reference held.
205  */
206 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
207 {
208         struct extent_buffer *eb;
209
210         while (1) {
211                 eb = btrfs_root_node(root);
212                 btrfs_tree_read_lock(eb);
213                 if (eb == root->node)
214                         break;
215                 btrfs_tree_read_unlock(eb);
216                 free_extent_buffer(eb);
217         }
218         return eb;
219 }
220
221 /* cowonly root (everything not a reference counted cow subvolume), just get
222  * put onto a simple dirty list.  transaction.c walks this to make sure they
223  * get properly updated on disk.
224  */
225 static void add_root_to_dirty_list(struct btrfs_root *root)
226 {
227         spin_lock(&root->fs_info->trans_lock);
228         if (root->track_dirty && list_empty(&root->dirty_list)) {
229                 list_add(&root->dirty_list,
230                          &root->fs_info->dirty_cowonly_roots);
231         }
232         spin_unlock(&root->fs_info->trans_lock);
233 }
234
235 /*
236  * used by snapshot creation to make a copy of a root for a tree with
237  * a given objectid.  The buffer with the new root node is returned in
238  * cow_ret, and this func returns zero on success or a negative error code.
239  */
240 int btrfs_copy_root(struct btrfs_trans_handle *trans,
241                       struct btrfs_root *root,
242                       struct extent_buffer *buf,
243                       struct extent_buffer **cow_ret, u64 new_root_objectid)
244 {
245         struct extent_buffer *cow;
246         int ret = 0;
247         int level;
248         struct btrfs_disk_key disk_key;
249
250         WARN_ON(root->ref_cows && trans->transid !=
251                 root->fs_info->running_transaction->transid);
252         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
253
254         level = btrfs_header_level(buf);
255         if (level == 0)
256                 btrfs_item_key(buf, &disk_key, 0);
257         else
258                 btrfs_node_key(buf, &disk_key, 0);
259
260         cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
261                                      new_root_objectid, &disk_key, level,
262                                      buf->start, 0);
263         if (IS_ERR(cow))
264                 return PTR_ERR(cow);
265
266         copy_extent_buffer(cow, buf, 0, 0, cow->len);
267         btrfs_set_header_bytenr(cow, cow->start);
268         btrfs_set_header_generation(cow, trans->transid);
269         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
270         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
271                                      BTRFS_HEADER_FLAG_RELOC);
272         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
273                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
274         else
275                 btrfs_set_header_owner(cow, new_root_objectid);
276
277         write_extent_buffer(cow, root->fs_info->fsid,
278                             (unsigned long)btrfs_header_fsid(cow),
279                             BTRFS_FSID_SIZE);
280
281         WARN_ON(btrfs_header_generation(buf) > trans->transid);
282         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
283                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
284         else
285                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
286
287         if (ret)
288                 return ret;
289
290         btrfs_mark_buffer_dirty(cow);
291         *cow_ret = cow;
292         return 0;
293 }
294
295 enum mod_log_op {
296         MOD_LOG_KEY_REPLACE,
297         MOD_LOG_KEY_ADD,
298         MOD_LOG_KEY_REMOVE,
299         MOD_LOG_KEY_REMOVE_WHILE_FREEING,
300         MOD_LOG_KEY_REMOVE_WHILE_MOVING,
301         MOD_LOG_MOVE_KEYS,
302         MOD_LOG_ROOT_REPLACE,
303 };
304
305 struct tree_mod_move {
306         int dst_slot;
307         int nr_items;
308 };
309
310 struct tree_mod_root {
311         u64 logical;
312         u8 level;
313 };
314
315 struct tree_mod_elem {
316         struct rb_node node;
317         u64 index;              /* shifted logical */
318         u64 seq;
319         enum mod_log_op op;
320
321         /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
322         int slot;
323
324         /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
325         u64 generation;
326
327         /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
328         struct btrfs_disk_key key;
329         u64 blockptr;
330
331         /* this is used for op == MOD_LOG_MOVE_KEYS */
332         struct tree_mod_move move;
333
334         /* this is used for op == MOD_LOG_ROOT_REPLACE */
335         struct tree_mod_root old_root;
336 };
337
338 static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
339 {
340         read_lock(&fs_info->tree_mod_log_lock);
341 }
342
343 static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
344 {
345         read_unlock(&fs_info->tree_mod_log_lock);
346 }
347
348 static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
349 {
350         write_lock(&fs_info->tree_mod_log_lock);
351 }
352
353 static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
354 {
355         write_unlock(&fs_info->tree_mod_log_lock);
356 }
357
358 /*
359  * Increment the upper half of tree_mod_seq, set lower half zero.
360  *
361  * Must be called with fs_info->tree_mod_seq_lock held.
362  */
363 static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
364 {
365         u64 seq = atomic64_read(&fs_info->tree_mod_seq);
366         seq &= 0xffffffff00000000ull;
367         seq += 1ull << 32;
368         atomic64_set(&fs_info->tree_mod_seq, seq);
369         return seq;
370 }
371
372 /*
373  * Increment the lower half of tree_mod_seq.
374  *
375  * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
376  * are generated should not technically require a spin lock here. (Rationale:
377  * incrementing the minor while incrementing the major seq number is between its
378  * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
379  * just returns a unique sequence number as usual.) We have decided to leave
380  * that requirement in here and rethink it once we notice it really imposes a
381  * problem on some workload.
382  */
383 static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
384 {
385         return atomic64_inc_return(&fs_info->tree_mod_seq);
386 }
387
388 /*
389  * return the last minor in the previous major tree_mod_seq number
390  */
391 u64 btrfs_tree_mod_seq_prev(u64 seq)
392 {
393         return (seq & 0xffffffff00000000ull) - 1ull;
394 }
395
396 /*
397  * This adds a new blocker to the tree mod log's blocker list if the @elem
398  * passed does not already have a sequence number set. So when a caller expects
399  * to record tree modifications, it should ensure to set elem->seq to zero
400  * before calling btrfs_get_tree_mod_seq.
401  * Returns a fresh, unused tree log modification sequence number, even if no new
402  * blocker was added.
403  */
404 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
405                            struct seq_list *elem)
406 {
407         u64 seq;
408
409         tree_mod_log_write_lock(fs_info);
410         spin_lock(&fs_info->tree_mod_seq_lock);
411         if (!elem->seq) {
412                 elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
413                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
414         }
415         seq = btrfs_inc_tree_mod_seq_minor(fs_info);
416         spin_unlock(&fs_info->tree_mod_seq_lock);
417         tree_mod_log_write_unlock(fs_info);
418
419         return seq;
420 }
421
422 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
423                             struct seq_list *elem)
424 {
425         struct rb_root *tm_root;
426         struct rb_node *node;
427         struct rb_node *next;
428         struct seq_list *cur_elem;
429         struct tree_mod_elem *tm;
430         u64 min_seq = (u64)-1;
431         u64 seq_putting = elem->seq;
432
433         if (!seq_putting)
434                 return;
435
436         spin_lock(&fs_info->tree_mod_seq_lock);
437         list_del(&elem->list);
438         elem->seq = 0;
439
440         list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
441                 if (cur_elem->seq < min_seq) {
442                         if (seq_putting > cur_elem->seq) {
443                                 /*
444                                  * blocker with lower sequence number exists, we
445                                  * cannot remove anything from the log
446                                  */
447                                 spin_unlock(&fs_info->tree_mod_seq_lock);
448                                 return;
449                         }
450                         min_seq = cur_elem->seq;
451                 }
452         }
453         spin_unlock(&fs_info->tree_mod_seq_lock);
454
455         /*
456          * anything that's lower than the lowest existing (read: blocked)
457          * sequence number can be removed from the tree.
458          */
459         tree_mod_log_write_lock(fs_info);
460         tm_root = &fs_info->tree_mod_log;
461         for (node = rb_first(tm_root); node; node = next) {
462                 next = rb_next(node);
463                 tm = container_of(node, struct tree_mod_elem, node);
464                 if (tm->seq > min_seq)
465                         continue;
466                 rb_erase(node, tm_root);
467                 kfree(tm);
468         }
469         tree_mod_log_write_unlock(fs_info);
470 }
471
472 /*
473  * key order of the log:
474  *       index -> sequence
475  *
476  * the index is the shifted logical of the *new* root node for root replace
477  * operations, or the shifted logical of the affected block for all other
478  * operations.
479  */
480 static noinline int
481 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
482 {
483         struct rb_root *tm_root;
484         struct rb_node **new;
485         struct rb_node *parent = NULL;
486         struct tree_mod_elem *cur;
487         int ret = 0;
488
489         BUG_ON(!tm);
490
491         tree_mod_log_write_lock(fs_info);
492         if (list_empty(&fs_info->tree_mod_seq_list)) {
493                 tree_mod_log_write_unlock(fs_info);
494                 /*
495                  * Ok we no longer care about logging modifications, free up tm
496                  * and return 0.  Any callers shouldn't be using tm after
497                  * calling tree_mod_log_insert, but if they do we can just
498                  * change this to return a special error code to let the callers
499                  * do their own thing.
500                  */
501                 kfree(tm);
502                 return 0;
503         }
504
505         spin_lock(&fs_info->tree_mod_seq_lock);
506         tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
507         spin_unlock(&fs_info->tree_mod_seq_lock);
508
509         tm_root = &fs_info->tree_mod_log;
510         new = &tm_root->rb_node;
511         while (*new) {
512                 cur = container_of(*new, struct tree_mod_elem, node);
513                 parent = *new;
514                 if (cur->index < tm->index)
515                         new = &((*new)->rb_left);
516                 else if (cur->index > tm->index)
517                         new = &((*new)->rb_right);
518                 else if (cur->seq < tm->seq)
519                         new = &((*new)->rb_left);
520                 else if (cur->seq > tm->seq)
521                         new = &((*new)->rb_right);
522                 else {
523                         ret = -EEXIST;
524                         kfree(tm);
525                         goto out;
526                 }
527         }
528
529         rb_link_node(&tm->node, parent, new);
530         rb_insert_color(&tm->node, tm_root);
531 out:
532         tree_mod_log_write_unlock(fs_info);
533         return ret;
534 }
535
536 /*
537  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
538  * returns zero with the tree_mod_log_lock acquired. The caller must hold
539  * this until all tree mod log insertions are recorded in the rb tree and then
540  * call tree_mod_log_write_unlock() to release.
541  */
542 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
543                                     struct extent_buffer *eb) {
544         smp_mb();
545         if (list_empty(&(fs_info)->tree_mod_seq_list))
546                 return 1;
547         if (eb && btrfs_header_level(eb) == 0)
548                 return 1;
549         return 0;
550 }
551
552 static inline int
553 __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
554                           struct extent_buffer *eb, int slot,
555                           enum mod_log_op op, gfp_t flags)
556 {
557         struct tree_mod_elem *tm;
558
559         tm = kzalloc(sizeof(*tm), flags);
560         if (!tm)
561                 return -ENOMEM;
562
563         tm->index = eb->start >> PAGE_CACHE_SHIFT;
564         if (op != MOD_LOG_KEY_ADD) {
565                 btrfs_node_key(eb, &tm->key, slot);
566                 tm->blockptr = btrfs_node_blockptr(eb, slot);
567         }
568         tm->op = op;
569         tm->slot = slot;
570         tm->generation = btrfs_node_ptr_generation(eb, slot);
571
572         return __tree_mod_log_insert(fs_info, tm);
573 }
574
575 static noinline int
576 tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
577                         struct extent_buffer *eb, int slot,
578                         enum mod_log_op op, gfp_t flags)
579 {
580         if (tree_mod_dont_log(fs_info, eb))
581                 return 0;
582
583         return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
584 }
585
586 static noinline int
587 tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
588                          struct extent_buffer *eb, int dst_slot, int src_slot,
589                          int nr_items, gfp_t flags)
590 {
591         struct tree_mod_elem *tm;
592         int ret;
593         int i;
594
595         if (tree_mod_dont_log(fs_info, eb))
596                 return 0;
597
598         /*
599          * When we override something during the move, we log these removals.
600          * This can only happen when we move towards the beginning of the
601          * buffer, i.e. dst_slot < src_slot.
602          */
603         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
604                 ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
605                                 MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
606                 BUG_ON(ret < 0);
607         }
608
609         tm = kzalloc(sizeof(*tm), flags);
610         if (!tm)
611                 return -ENOMEM;
612
613         tm->index = eb->start >> PAGE_CACHE_SHIFT;
614         tm->slot = src_slot;
615         tm->move.dst_slot = dst_slot;
616         tm->move.nr_items = nr_items;
617         tm->op = MOD_LOG_MOVE_KEYS;
618
619         return __tree_mod_log_insert(fs_info, tm);
620 }
621
622 static inline void
623 __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
624 {
625         int i;
626         u32 nritems;
627         int ret;
628
629         if (btrfs_header_level(eb) == 0)
630                 return;
631
632         nritems = btrfs_header_nritems(eb);
633         for (i = nritems - 1; i >= 0; i--) {
634                 ret = __tree_mod_log_insert_key(fs_info, eb, i,
635                                 MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
636                 BUG_ON(ret < 0);
637         }
638 }
639
640 static noinline int
641 tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
642                          struct extent_buffer *old_root,
643                          struct extent_buffer *new_root, gfp_t flags,
644                          int log_removal)
645 {
646         struct tree_mod_elem *tm;
647
648         if (tree_mod_dont_log(fs_info, NULL))
649                 return 0;
650
651         if (log_removal)
652                 __tree_mod_log_free_eb(fs_info, old_root);
653
654         tm = kzalloc(sizeof(*tm), flags);
655         if (!tm)
656                 return -ENOMEM;
657
658         tm->index = new_root->start >> PAGE_CACHE_SHIFT;
659         tm->old_root.logical = old_root->start;
660         tm->old_root.level = btrfs_header_level(old_root);
661         tm->generation = btrfs_header_generation(old_root);
662         tm->op = MOD_LOG_ROOT_REPLACE;
663
664         return __tree_mod_log_insert(fs_info, tm);
665 }
666
667 static struct tree_mod_elem *
668 __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
669                       int smallest)
670 {
671         struct rb_root *tm_root;
672         struct rb_node *node;
673         struct tree_mod_elem *cur = NULL;
674         struct tree_mod_elem *found = NULL;
675         u64 index = start >> PAGE_CACHE_SHIFT;
676
677         tree_mod_log_read_lock(fs_info);
678         tm_root = &fs_info->tree_mod_log;
679         node = tm_root->rb_node;
680         while (node) {
681                 cur = container_of(node, struct tree_mod_elem, node);
682                 if (cur->index < index) {
683                         node = node->rb_left;
684                 } else if (cur->index > index) {
685                         node = node->rb_right;
686                 } else if (cur->seq < min_seq) {
687                         node = node->rb_left;
688                 } else if (!smallest) {
689                         /* we want the node with the highest seq */
690                         if (found)
691                                 BUG_ON(found->seq > cur->seq);
692                         found = cur;
693                         node = node->rb_left;
694                 } else if (cur->seq > min_seq) {
695                         /* we want the node with the smallest seq */
696                         if (found)
697                                 BUG_ON(found->seq < cur->seq);
698                         found = cur;
699                         node = node->rb_right;
700                 } else {
701                         found = cur;
702                         break;
703                 }
704         }
705         tree_mod_log_read_unlock(fs_info);
706
707         return found;
708 }
709
710 /*
711  * this returns the element from the log with the smallest time sequence
712  * value that's in the log (the oldest log item). any element with a time
713  * sequence lower than min_seq will be ignored.
714  */
715 static struct tree_mod_elem *
716 tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
717                            u64 min_seq)
718 {
719         return __tree_mod_log_search(fs_info, start, min_seq, 1);
720 }
721
722 /*
723  * this returns the element from the log with the largest time sequence
724  * value that's in the log (the most recent log item). any element with
725  * a time sequence lower than min_seq will be ignored.
726  */
727 static struct tree_mod_elem *
728 tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
729 {
730         return __tree_mod_log_search(fs_info, start, min_seq, 0);
731 }
732
733 static noinline void
734 tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
735                      struct extent_buffer *src, unsigned long dst_offset,
736                      unsigned long src_offset, int nr_items)
737 {
738         int ret;
739         int i;
740
741         if (tree_mod_dont_log(fs_info, NULL))
742                 return;
743
744         if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
745                 return;
746
747         for (i = 0; i < nr_items; i++) {
748                 ret = __tree_mod_log_insert_key(fs_info, src,
749                                                 i + src_offset,
750                                                 MOD_LOG_KEY_REMOVE, GFP_NOFS);
751                 BUG_ON(ret < 0);
752                 ret = __tree_mod_log_insert_key(fs_info, dst,
753                                                      i + dst_offset,
754                                                      MOD_LOG_KEY_ADD,
755                                                      GFP_NOFS);
756                 BUG_ON(ret < 0);
757         }
758 }
759
760 static inline void
761 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
762                      int dst_offset, int src_offset, int nr_items)
763 {
764         int ret;
765         ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
766                                        nr_items, GFP_NOFS);
767         BUG_ON(ret < 0);
768 }
769
770 static noinline void
771 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
772                           struct extent_buffer *eb, int slot, int atomic)
773 {
774         int ret;
775
776         ret = __tree_mod_log_insert_key(fs_info, eb, slot,
777                                         MOD_LOG_KEY_REPLACE,
778                                         atomic ? GFP_ATOMIC : GFP_NOFS);
779         BUG_ON(ret < 0);
780 }
781
782 static noinline void
783 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
784 {
785         if (tree_mod_dont_log(fs_info, eb))
786                 return;
787         __tree_mod_log_free_eb(fs_info, eb);
788 }
789
790 static noinline void
791 tree_mod_log_set_root_pointer(struct btrfs_root *root,
792                               struct extent_buffer *new_root_node,
793                               int log_removal)
794 {
795         int ret;
796         ret = tree_mod_log_insert_root(root->fs_info, root->node,
797                                        new_root_node, GFP_NOFS, log_removal);
798         BUG_ON(ret < 0);
799 }
800
801 /*
802  * check if the tree block can be shared by multiple trees
803  */
804 int btrfs_block_can_be_shared(struct btrfs_root *root,
805                               struct extent_buffer *buf)
806 {
807         /*
808          * Tree blocks not in refernece counted trees and tree roots
809          * are never shared. If a block was allocated after the last
810          * snapshot and the block was not allocated by tree relocation,
811          * we know the block is not shared.
812          */
813         if (root->ref_cows &&
814             buf != root->node && buf != root->commit_root &&
815             (btrfs_header_generation(buf) <=
816              btrfs_root_last_snapshot(&root->root_item) ||
817              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
818                 return 1;
819 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
820         if (root->ref_cows &&
821             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
822                 return 1;
823 #endif
824         return 0;
825 }
826
827 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
828                                        struct btrfs_root *root,
829                                        struct extent_buffer *buf,
830                                        struct extent_buffer *cow,
831                                        int *last_ref)
832 {
833         u64 refs;
834         u64 owner;
835         u64 flags;
836         u64 new_flags = 0;
837         int ret;
838
839         /*
840          * Backrefs update rules:
841          *
842          * Always use full backrefs for extent pointers in tree block
843          * allocated by tree relocation.
844          *
845          * If a shared tree block is no longer referenced by its owner
846          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
847          * use full backrefs for extent pointers in tree block.
848          *
849          * If a tree block is been relocating
850          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
851          * use full backrefs for extent pointers in tree block.
852          * The reason for this is some operations (such as drop tree)
853          * are only allowed for blocks use full backrefs.
854          */
855
856         if (btrfs_block_can_be_shared(root, buf)) {
857                 ret = btrfs_lookup_extent_info(trans, root, buf->start,
858                                                btrfs_header_level(buf), 1,
859                                                &refs, &flags);
860                 if (ret)
861                         return ret;
862                 if (refs == 0) {
863                         ret = -EROFS;
864                         btrfs_std_error(root->fs_info, ret);
865                         return ret;
866                 }
867         } else {
868                 refs = 1;
869                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
870                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
871                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
872                 else
873                         flags = 0;
874         }
875
876         owner = btrfs_header_owner(buf);
877         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
878                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
879
880         if (refs > 1) {
881                 if ((owner == root->root_key.objectid ||
882                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
883                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
884                         ret = btrfs_inc_ref(trans, root, buf, 1, 1);
885                         BUG_ON(ret); /* -ENOMEM */
886
887                         if (root->root_key.objectid ==
888                             BTRFS_TREE_RELOC_OBJECTID) {
889                                 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
890                                 BUG_ON(ret); /* -ENOMEM */
891                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
892                                 BUG_ON(ret); /* -ENOMEM */
893                         }
894                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
895                 } else {
896
897                         if (root->root_key.objectid ==
898                             BTRFS_TREE_RELOC_OBJECTID)
899                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
900                         else
901                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
902                         BUG_ON(ret); /* -ENOMEM */
903                 }
904                 if (new_flags != 0) {
905                         int level = btrfs_header_level(buf);
906
907                         ret = btrfs_set_disk_extent_flags(trans, root,
908                                                           buf->start,
909                                                           buf->len,
910                                                           new_flags, level, 0);
911                         if (ret)
912                                 return ret;
913                 }
914         } else {
915                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
916                         if (root->root_key.objectid ==
917                             BTRFS_TREE_RELOC_OBJECTID)
918                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
919                         else
920                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
921                         BUG_ON(ret); /* -ENOMEM */
922                         ret = btrfs_dec_ref(trans, root, buf, 1, 1);
923                         BUG_ON(ret); /* -ENOMEM */
924                 }
925                 clean_tree_block(trans, root, buf);
926                 *last_ref = 1;
927         }
928         return 0;
929 }
930
931 /*
932  * does the dirty work in cow of a single block.  The parent block (if
933  * supplied) is updated to point to the new cow copy.  The new buffer is marked
934  * dirty and returned locked.  If you modify the block it needs to be marked
935  * dirty again.
936  *
937  * search_start -- an allocation hint for the new block
938  *
939  * empty_size -- a hint that you plan on doing more cow.  This is the size in
940  * bytes the allocator should try to find free next to the block it returns.
941  * This is just a hint and may be ignored by the allocator.
942  */
943 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
944                              struct btrfs_root *root,
945                              struct extent_buffer *buf,
946                              struct extent_buffer *parent, int parent_slot,
947                              struct extent_buffer **cow_ret,
948                              u64 search_start, u64 empty_size)
949 {
950         struct btrfs_disk_key disk_key;
951         struct extent_buffer *cow;
952         int level, ret;
953         int last_ref = 0;
954         int unlock_orig = 0;
955         u64 parent_start;
956
957         if (*cow_ret == buf)
958                 unlock_orig = 1;
959
960         btrfs_assert_tree_locked(buf);
961
962         WARN_ON(root->ref_cows && trans->transid !=
963                 root->fs_info->running_transaction->transid);
964         WARN_ON(root->ref_cows && trans->transid != root->last_trans);
965
966         level = btrfs_header_level(buf);
967
968         if (level == 0)
969                 btrfs_item_key(buf, &disk_key, 0);
970         else
971                 btrfs_node_key(buf, &disk_key, 0);
972
973         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
974                 if (parent)
975                         parent_start = parent->start;
976                 else
977                         parent_start = 0;
978         } else
979                 parent_start = 0;
980
981         cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
982                                      root->root_key.objectid, &disk_key,
983                                      level, search_start, empty_size);
984         if (IS_ERR(cow))
985                 return PTR_ERR(cow);
986
987         /* cow is set to blocking by btrfs_init_new_buffer */
988
989         copy_extent_buffer(cow, buf, 0, 0, cow->len);
990         btrfs_set_header_bytenr(cow, cow->start);
991         btrfs_set_header_generation(cow, trans->transid);
992         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
993         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
994                                      BTRFS_HEADER_FLAG_RELOC);
995         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
996                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
997         else
998                 btrfs_set_header_owner(cow, root->root_key.objectid);
999
1000         write_extent_buffer(cow, root->fs_info->fsid,
1001                             (unsigned long)btrfs_header_fsid(cow),
1002                             BTRFS_FSID_SIZE);
1003
1004         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1005         if (ret) {
1006                 btrfs_abort_transaction(trans, root, ret);
1007                 return ret;
1008         }
1009
1010         if (root->ref_cows)
1011                 btrfs_reloc_cow_block(trans, root, buf, cow);
1012
1013         if (buf == root->node) {
1014                 WARN_ON(parent && parent != buf);
1015                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1016                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1017                         parent_start = buf->start;
1018                 else
1019                         parent_start = 0;
1020
1021                 extent_buffer_get(cow);
1022                 tree_mod_log_set_root_pointer(root, cow, 1);
1023                 rcu_assign_pointer(root->node, cow);
1024
1025                 btrfs_free_tree_block(trans, root, buf, parent_start,
1026                                       last_ref);
1027                 free_extent_buffer(buf);
1028                 add_root_to_dirty_list(root);
1029         } else {
1030                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1031                         parent_start = parent->start;
1032                 else
1033                         parent_start = 0;
1034
1035                 WARN_ON(trans->transid != btrfs_header_generation(parent));
1036                 tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1037                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1038                 btrfs_set_node_blockptr(parent, parent_slot,
1039                                         cow->start);
1040                 btrfs_set_node_ptr_generation(parent, parent_slot,
1041                                               trans->transid);
1042                 btrfs_mark_buffer_dirty(parent);
1043                 if (last_ref)
1044                         tree_mod_log_free_eb(root->fs_info, buf);
1045                 btrfs_free_tree_block(trans, root, buf, parent_start,
1046                                       last_ref);
1047         }
1048         if (unlock_orig)
1049                 btrfs_tree_unlock(buf);
1050         free_extent_buffer_stale(buf);
1051         btrfs_mark_buffer_dirty(cow);
1052         *cow_ret = cow;
1053         return 0;
1054 }
1055
1056 /*
1057  * returns the logical address of the oldest predecessor of the given root.
1058  * entries older than time_seq are ignored.
1059  */
1060 static struct tree_mod_elem *
1061 __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1062                            struct extent_buffer *eb_root, u64 time_seq)
1063 {
1064         struct tree_mod_elem *tm;
1065         struct tree_mod_elem *found = NULL;
1066         u64 root_logical = eb_root->start;
1067         int looped = 0;
1068
1069         if (!time_seq)
1070                 return 0;
1071
1072         /*
1073          * the very last operation that's logged for a root is the replacement
1074          * operation (if it is replaced at all). this has the index of the *new*
1075          * root, making it the very first operation that's logged for this root.
1076          */
1077         while (1) {
1078                 tm = tree_mod_log_search_oldest(fs_info, root_logical,
1079                                                 time_seq);
1080                 if (!looped && !tm)
1081                         return 0;
1082                 /*
1083                  * if there are no tree operation for the oldest root, we simply
1084                  * return it. this should only happen if that (old) root is at
1085                  * level 0.
1086                  */
1087                 if (!tm)
1088                         break;
1089
1090                 /*
1091                  * if there's an operation that's not a root replacement, we
1092                  * found the oldest version of our root. normally, we'll find a
1093                  * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1094                  */
1095                 if (tm->op != MOD_LOG_ROOT_REPLACE)
1096                         break;
1097
1098                 found = tm;
1099                 root_logical = tm->old_root.logical;
1100                 looped = 1;
1101         }
1102
1103         /* if there's no old root to return, return what we found instead */
1104         if (!found)
1105                 found = tm;
1106
1107         return found;
1108 }
1109
1110 /*
1111  * tm is a pointer to the first operation to rewind within eb. then, all
1112  * previous operations will be rewinded (until we reach something older than
1113  * time_seq).
1114  */
1115 static void
1116 __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1117                       u64 time_seq, struct tree_mod_elem *first_tm)
1118 {
1119         u32 n;
1120         struct rb_node *next;
1121         struct tree_mod_elem *tm = first_tm;
1122         unsigned long o_dst;
1123         unsigned long o_src;
1124         unsigned long p_size = sizeof(struct btrfs_key_ptr);
1125
1126         n = btrfs_header_nritems(eb);
1127         tree_mod_log_read_lock(fs_info);
1128         while (tm && tm->seq >= time_seq) {
1129                 /*
1130                  * all the operations are recorded with the operator used for
1131                  * the modification. as we're going backwards, we do the
1132                  * opposite of each operation here.
1133                  */
1134                 switch (tm->op) {
1135                 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1136                         BUG_ON(tm->slot < n);
1137                         /* Fallthrough */
1138                 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1139                 case MOD_LOG_KEY_REMOVE:
1140                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1141                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1142                         btrfs_set_node_ptr_generation(eb, tm->slot,
1143                                                       tm->generation);
1144                         n++;
1145                         break;
1146                 case MOD_LOG_KEY_REPLACE:
1147                         BUG_ON(tm->slot >= n);
1148                         btrfs_set_node_key(eb, &tm->key, tm->slot);
1149                         btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1150                         btrfs_set_node_ptr_generation(eb, tm->slot,
1151                                                       tm->generation);
1152                         break;
1153                 case MOD_LOG_KEY_ADD:
1154                         /* if a move operation is needed it's in the log */
1155                         n--;
1156                         break;
1157                 case MOD_LOG_MOVE_KEYS:
1158                         o_dst = btrfs_node_key_ptr_offset(tm->slot);
1159                         o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1160                         memmove_extent_buffer(eb, o_dst, o_src,
1161                                               tm->move.nr_items * p_size);
1162                         break;
1163                 case MOD_LOG_ROOT_REPLACE:
1164                         /*
1165                          * this operation is special. for roots, this must be
1166                          * handled explicitly before rewinding.
1167                          * for non-roots, this operation may exist if the node
1168                          * was a root: root A -> child B; then A gets empty and
1169                          * B is promoted to the new root. in the mod log, we'll
1170                          * have a root-replace operation for B, a tree block
1171                          * that is no root. we simply ignore that operation.
1172                          */
1173                         break;
1174                 }
1175                 next = rb_next(&tm->node);
1176                 if (!next)
1177                         break;
1178                 tm = container_of(next, struct tree_mod_elem, node);
1179                 if (tm->index != first_tm->index)
1180                         break;
1181         }
1182         tree_mod_log_read_unlock(fs_info);
1183         btrfs_set_header_nritems(eb, n);
1184 }
1185
1186 /*
1187  * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1188  * is returned. If rewind operations happen, a fresh buffer is returned. The
1189  * returned buffer is always read-locked. If the returned buffer is not the
1190  * input buffer, the lock on the input buffer is released and the input buffer
1191  * is freed (its refcount is decremented).
1192  */
1193 static struct extent_buffer *
1194 tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1195                     u64 time_seq)
1196 {
1197         struct extent_buffer *eb_rewin;
1198         struct tree_mod_elem *tm;
1199
1200         if (!time_seq)
1201                 return eb;
1202
1203         if (btrfs_header_level(eb) == 0)
1204                 return eb;
1205
1206         tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1207         if (!tm)
1208                 return eb;
1209
1210         if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1211                 BUG_ON(tm->slot != 0);
1212                 eb_rewin = alloc_dummy_extent_buffer(eb->start,
1213                                                 fs_info->tree_root->nodesize);
1214                 if (!eb_rewin) {
1215                         btrfs_tree_read_unlock(eb);
1216                         free_extent_buffer(eb);
1217                         return NULL;
1218                 }
1219                 btrfs_set_header_bytenr(eb_rewin, eb->start);
1220                 btrfs_set_header_backref_rev(eb_rewin,
1221                                              btrfs_header_backref_rev(eb));
1222                 btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1223                 btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1224         } else {
1225                 eb_rewin = btrfs_clone_extent_buffer(eb);
1226                 if (!eb_rewin) {
1227                         btrfs_tree_read_unlock(eb);
1228                         free_extent_buffer(eb);
1229                         return NULL;
1230                 }
1231         }
1232
1233         btrfs_tree_read_unlock(eb);
1234         free_extent_buffer(eb);
1235
1236         extent_buffer_get(eb_rewin);
1237         btrfs_tree_read_lock(eb_rewin);
1238         __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1239         WARN_ON(btrfs_header_nritems(eb_rewin) >
1240                 BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1241
1242         return eb_rewin;
1243 }
1244
1245 /*
1246  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1247  * value. If there are no changes, the current root->root_node is returned. If
1248  * anything changed in between, there's a fresh buffer allocated on which the
1249  * rewind operations are done. In any case, the returned buffer is read locked.
1250  * Returns NULL on error (with no locks held).
1251  */
1252 static inline struct extent_buffer *
1253 get_old_root(struct btrfs_root *root, u64 time_seq)
1254 {
1255         struct tree_mod_elem *tm;
1256         struct extent_buffer *eb = NULL;
1257         struct extent_buffer *eb_root;
1258         struct extent_buffer *old;
1259         struct tree_mod_root *old_root = NULL;
1260         u64 old_generation = 0;
1261         u64 logical;
1262         u32 blocksize;
1263
1264         eb_root = btrfs_read_lock_root_node(root);
1265         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1266         if (!tm)
1267                 return eb_root;
1268
1269         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1270                 old_root = &tm->old_root;
1271                 old_generation = tm->generation;
1272                 logical = old_root->logical;
1273         } else {
1274                 logical = eb_root->start;
1275         }
1276
1277         tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1278         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1279                 btrfs_tree_read_unlock(eb_root);
1280                 free_extent_buffer(eb_root);
1281                 blocksize = btrfs_level_size(root, old_root->level);
1282                 old = read_tree_block(root, logical, blocksize, 0);
1283                 if (!old || !extent_buffer_uptodate(old)) {
1284                         free_extent_buffer(old);
1285                         pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
1286                                 logical);
1287                         WARN_ON(1);
1288                 } else {
1289                         eb = btrfs_clone_extent_buffer(old);
1290                         free_extent_buffer(old);
1291                 }
1292         } else if (old_root) {
1293                 btrfs_tree_read_unlock(eb_root);
1294                 free_extent_buffer(eb_root);
1295                 eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1296         } else {
1297                 eb = btrfs_clone_extent_buffer(eb_root);
1298                 btrfs_tree_read_unlock(eb_root);
1299                 free_extent_buffer(eb_root);
1300         }
1301
1302         if (!eb)
1303                 return NULL;
1304         extent_buffer_get(eb);
1305         btrfs_tree_read_lock(eb);
1306         if (old_root) {
1307                 btrfs_set_header_bytenr(eb, eb->start);
1308                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1309                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1310                 btrfs_set_header_level(eb, old_root->level);
1311                 btrfs_set_header_generation(eb, old_generation);
1312         }
1313         if (tm)
1314                 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1315         else
1316                 WARN_ON(btrfs_header_level(eb) != 0);
1317         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1318
1319         return eb;
1320 }
1321
1322 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1323 {
1324         struct tree_mod_elem *tm;
1325         int level;
1326         struct extent_buffer *eb_root = btrfs_root_node(root);
1327
1328         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1329         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1330                 level = tm->old_root.level;
1331         } else {
1332                 level = btrfs_header_level(eb_root);
1333         }
1334         free_extent_buffer(eb_root);
1335
1336         return level;
1337 }
1338
1339 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1340                                    struct btrfs_root *root,
1341                                    struct extent_buffer *buf)
1342 {
1343         /* ensure we can see the force_cow */
1344         smp_rmb();
1345
1346         /*
1347          * We do not need to cow a block if
1348          * 1) this block is not created or changed in this transaction;
1349          * 2) this block does not belong to TREE_RELOC tree;
1350          * 3) the root is not forced COW.
1351          *
1352          * What is forced COW:
1353          *    when we create snapshot during commiting the transaction,
1354          *    after we've finished coping src root, we must COW the shared
1355          *    block to ensure the metadata consistency.
1356          */
1357         if (btrfs_header_generation(buf) == trans->transid &&
1358             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1359             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1360               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1361             !root->force_cow)
1362                 return 0;
1363         return 1;
1364 }
1365
1366 /*
1367  * cows a single block, see __btrfs_cow_block for the real work.
1368  * This version of it has extra checks so that a block isn't cow'd more than
1369  * once per transaction, as long as it hasn't been written yet
1370  */
1371 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1372                     struct btrfs_root *root, struct extent_buffer *buf,
1373                     struct extent_buffer *parent, int parent_slot,
1374                     struct extent_buffer **cow_ret)
1375 {
1376         u64 search_start;
1377         int ret;
1378
1379         if (trans->transaction != root->fs_info->running_transaction)
1380                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1381                        (unsigned long long)trans->transid,
1382                        (unsigned long long)
1383                        root->fs_info->running_transaction->transid);
1384
1385         if (trans->transid != root->fs_info->generation)
1386                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1387                        (unsigned long long)trans->transid,
1388                        (unsigned long long)root->fs_info->generation);
1389
1390         if (!should_cow_block(trans, root, buf)) {
1391                 *cow_ret = buf;
1392                 return 0;
1393         }
1394
1395         search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1396
1397         if (parent)
1398                 btrfs_set_lock_blocking(parent);
1399         btrfs_set_lock_blocking(buf);
1400
1401         ret = __btrfs_cow_block(trans, root, buf, parent,
1402                                  parent_slot, cow_ret, search_start, 0);
1403
1404         trace_btrfs_cow_block(root, buf, *cow_ret);
1405
1406         return ret;
1407 }
1408
1409 /*
1410  * helper function for defrag to decide if two blocks pointed to by a
1411  * node are actually close by
1412  */
1413 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1414 {
1415         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1416                 return 1;
1417         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1418                 return 1;
1419         return 0;
1420 }
1421
1422 /*
1423  * compare two keys in a memcmp fashion
1424  */
1425 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1426 {
1427         struct btrfs_key k1;
1428
1429         btrfs_disk_key_to_cpu(&k1, disk);
1430
1431         return btrfs_comp_cpu_keys(&k1, k2);
1432 }
1433
1434 /*
1435  * same as comp_keys only with two btrfs_key's
1436  */
1437 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1438 {
1439         if (k1->objectid > k2->objectid)
1440                 return 1;
1441         if (k1->objectid < k2->objectid)
1442                 return -1;
1443         if (k1->type > k2->type)
1444                 return 1;
1445         if (k1->type < k2->type)
1446                 return -1;
1447         if (k1->offset > k2->offset)
1448                 return 1;
1449         if (k1->offset < k2->offset)
1450                 return -1;
1451         return 0;
1452 }
1453
1454 /*
1455  * this is used by the defrag code to go through all the
1456  * leaves pointed to by a node and reallocate them so that
1457  * disk order is close to key order
1458  */
1459 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1460                        struct btrfs_root *root, struct extent_buffer *parent,
1461                        int start_slot, u64 *last_ret,
1462                        struct btrfs_key *progress)
1463 {
1464         struct extent_buffer *cur;
1465         u64 blocknr;
1466         u64 gen;
1467         u64 search_start = *last_ret;
1468         u64 last_block = 0;
1469         u64 other;
1470         u32 parent_nritems;
1471         int end_slot;
1472         int i;
1473         int err = 0;
1474         int parent_level;
1475         int uptodate;
1476         u32 blocksize;
1477         int progress_passed = 0;
1478         struct btrfs_disk_key disk_key;
1479
1480         parent_level = btrfs_header_level(parent);
1481
1482         WARN_ON(trans->transaction != root->fs_info->running_transaction);
1483         WARN_ON(trans->transid != root->fs_info->generation);
1484
1485         parent_nritems = btrfs_header_nritems(parent);
1486         blocksize = btrfs_level_size(root, parent_level - 1);
1487         end_slot = parent_nritems;
1488
1489         if (parent_nritems == 1)
1490                 return 0;
1491
1492         btrfs_set_lock_blocking(parent);
1493
1494         for (i = start_slot; i < end_slot; i++) {
1495                 int close = 1;
1496
1497                 btrfs_node_key(parent, &disk_key, i);
1498                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1499                         continue;
1500
1501                 progress_passed = 1;
1502                 blocknr = btrfs_node_blockptr(parent, i);
1503                 gen = btrfs_node_ptr_generation(parent, i);
1504                 if (last_block == 0)
1505                         last_block = blocknr;
1506
1507                 if (i > 0) {
1508                         other = btrfs_node_blockptr(parent, i - 1);
1509                         close = close_blocks(blocknr, other, blocksize);
1510                 }
1511                 if (!close && i < end_slot - 2) {
1512                         other = btrfs_node_blockptr(parent, i + 1);
1513                         close = close_blocks(blocknr, other, blocksize);
1514                 }
1515                 if (close) {
1516                         last_block = blocknr;
1517                         continue;
1518                 }
1519
1520                 cur = btrfs_find_tree_block(root, blocknr, blocksize);
1521                 if (cur)
1522                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1523                 else
1524                         uptodate = 0;
1525                 if (!cur || !uptodate) {
1526                         if (!cur) {
1527                                 cur = read_tree_block(root, blocknr,
1528                                                          blocksize, gen);
1529                                 if (!cur || !extent_buffer_uptodate(cur)) {
1530                                         free_extent_buffer(cur);
1531                                         return -EIO;
1532                                 }
1533                         } else if (!uptodate) {
1534                                 err = btrfs_read_buffer(cur, gen);
1535                                 if (err) {
1536                                         free_extent_buffer(cur);
1537                                         return err;
1538                                 }
1539                         }
1540                 }
1541                 if (search_start == 0)
1542                         search_start = last_block;
1543
1544                 btrfs_tree_lock(cur);
1545                 btrfs_set_lock_blocking(cur);
1546                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1547                                         &cur, search_start,
1548                                         min(16 * blocksize,
1549                                             (end_slot - i) * blocksize));
1550                 if (err) {
1551                         btrfs_tree_unlock(cur);
1552                         free_extent_buffer(cur);
1553                         break;
1554                 }
1555                 search_start = cur->start;
1556                 last_block = cur->start;
1557                 *last_ret = search_start;
1558                 btrfs_tree_unlock(cur);
1559                 free_extent_buffer(cur);
1560         }
1561         return err;
1562 }
1563
1564 /*
1565  * The leaf data grows from end-to-front in the node.
1566  * this returns the address of the start of the last item,
1567  * which is the stop of the leaf data stack
1568  */
1569 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1570                                          struct extent_buffer *leaf)
1571 {
1572         u32 nr = btrfs_header_nritems(leaf);
1573         if (nr == 0)
1574                 return BTRFS_LEAF_DATA_SIZE(root);
1575         return btrfs_item_offset_nr(leaf, nr - 1);
1576 }
1577
1578
1579 /*
1580  * search for key in the extent_buffer.  The items start at offset p,
1581  * and they are item_size apart.  There are 'max' items in p.
1582  *
1583  * the slot in the array is returned via slot, and it points to
1584  * the place where you would insert key if it is not found in
1585  * the array.
1586  *
1587  * slot may point to max if the key is bigger than all of the keys
1588  */
1589 static noinline int generic_bin_search(struct extent_buffer *eb,
1590                                        unsigned long p,
1591                                        int item_size, struct btrfs_key *key,
1592                                        int max, int *slot)
1593 {
1594         int low = 0;
1595         int high = max;
1596         int mid;
1597         int ret;
1598         struct btrfs_disk_key *tmp = NULL;
1599         struct btrfs_disk_key unaligned;
1600         unsigned long offset;
1601         char *kaddr = NULL;
1602         unsigned long map_start = 0;
1603         unsigned long map_len = 0;
1604         int err;
1605
1606         while (low < high) {
1607                 mid = (low + high) / 2;
1608                 offset = p + mid * item_size;
1609
1610                 if (!kaddr || offset < map_start ||
1611                     (offset + sizeof(struct btrfs_disk_key)) >
1612                     map_start + map_len) {
1613
1614                         err = map_private_extent_buffer(eb, offset,
1615                                                 sizeof(struct btrfs_disk_key),
1616                                                 &kaddr, &map_start, &map_len);
1617
1618                         if (!err) {
1619                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1620                                                         map_start);
1621                         } else {
1622                                 read_extent_buffer(eb, &unaligned,
1623                                                    offset, sizeof(unaligned));
1624                                 tmp = &unaligned;
1625                         }
1626
1627                 } else {
1628                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1629                                                         map_start);
1630                 }
1631                 ret = comp_keys(tmp, key);
1632
1633                 if (ret < 0)
1634                         low = mid + 1;
1635                 else if (ret > 0)
1636                         high = mid;
1637                 else {
1638                         *slot = mid;
1639                         return 0;
1640                 }
1641         }
1642         *slot = low;
1643         return 1;
1644 }
1645
1646 /*
1647  * simple bin_search frontend that does the right thing for
1648  * leaves vs nodes
1649  */
1650 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1651                       int level, int *slot)
1652 {
1653         if (level == 0)
1654                 return generic_bin_search(eb,
1655                                           offsetof(struct btrfs_leaf, items),
1656                                           sizeof(struct btrfs_item),
1657                                           key, btrfs_header_nritems(eb),
1658                                           slot);
1659         else
1660                 return generic_bin_search(eb,
1661                                           offsetof(struct btrfs_node, ptrs),
1662                                           sizeof(struct btrfs_key_ptr),
1663                                           key, btrfs_header_nritems(eb),
1664                                           slot);
1665 }
1666
1667 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1668                      int level, int *slot)
1669 {
1670         return bin_search(eb, key, level, slot);
1671 }
1672
1673 static void root_add_used(struct btrfs_root *root, u32 size)
1674 {
1675         spin_lock(&root->accounting_lock);
1676         btrfs_set_root_used(&root->root_item,
1677                             btrfs_root_used(&root->root_item) + size);
1678         spin_unlock(&root->accounting_lock);
1679 }
1680
1681 static void root_sub_used(struct btrfs_root *root, u32 size)
1682 {
1683         spin_lock(&root->accounting_lock);
1684         btrfs_set_root_used(&root->root_item,
1685                             btrfs_root_used(&root->root_item) - size);
1686         spin_unlock(&root->accounting_lock);
1687 }
1688
1689 /* given a node and slot number, this reads the blocks it points to.  The
1690  * extent buffer is returned with a reference taken (but unlocked).
1691  * NULL is returned on error.
1692  */
1693 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1694                                    struct extent_buffer *parent, int slot)
1695 {
1696         int level = btrfs_header_level(parent);
1697         struct extent_buffer *eb;
1698
1699         if (slot < 0)
1700                 return NULL;
1701         if (slot >= btrfs_header_nritems(parent))
1702                 return NULL;
1703
1704         BUG_ON(level == 0);
1705
1706         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1707                              btrfs_level_size(root, level - 1),
1708                              btrfs_node_ptr_generation(parent, slot));
1709         if (eb && !extent_buffer_uptodate(eb)) {
1710                 free_extent_buffer(eb);
1711                 eb = NULL;
1712         }
1713
1714         return eb;
1715 }
1716
1717 /*
1718  * node level balancing, used to make sure nodes are in proper order for
1719  * item deletion.  We balance from the top down, so we have to make sure
1720  * that a deletion won't leave an node completely empty later on.
1721  */
1722 static noinline int balance_level(struct btrfs_trans_handle *trans,
1723                          struct btrfs_root *root,
1724                          struct btrfs_path *path, int level)
1725 {
1726         struct extent_buffer *right = NULL;
1727         struct extent_buffer *mid;
1728         struct extent_buffer *left = NULL;
1729         struct extent_buffer *parent = NULL;
1730         int ret = 0;
1731         int wret;
1732         int pslot;
1733         int orig_slot = path->slots[level];
1734         u64 orig_ptr;
1735
1736         if (level == 0)
1737                 return 0;
1738
1739         mid = path->nodes[level];
1740
1741         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1742                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1743         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1744
1745         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1746
1747         if (level < BTRFS_MAX_LEVEL - 1) {
1748                 parent = path->nodes[level + 1];
1749                 pslot = path->slots[level + 1];
1750         }
1751
1752         /*
1753          * deal with the case where there is only one pointer in the root
1754          * by promoting the node below to a root
1755          */
1756         if (!parent) {
1757                 struct extent_buffer *child;
1758
1759                 if (btrfs_header_nritems(mid) != 1)
1760                         return 0;
1761
1762                 /* promote the child to a root */
1763                 child = read_node_slot(root, mid, 0);
1764                 if (!child) {
1765                         ret = -EROFS;
1766                         btrfs_std_error(root->fs_info, ret);
1767                         goto enospc;
1768                 }
1769
1770                 btrfs_tree_lock(child);
1771                 btrfs_set_lock_blocking(child);
1772                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1773                 if (ret) {
1774                         btrfs_tree_unlock(child);
1775                         free_extent_buffer(child);
1776                         goto enospc;
1777                 }
1778
1779                 tree_mod_log_set_root_pointer(root, child, 1);
1780                 rcu_assign_pointer(root->node, child);
1781
1782                 add_root_to_dirty_list(root);
1783                 btrfs_tree_unlock(child);
1784
1785                 path->locks[level] = 0;
1786                 path->nodes[level] = NULL;
1787                 clean_tree_block(trans, root, mid);
1788                 btrfs_tree_unlock(mid);
1789                 /* once for the path */
1790                 free_extent_buffer(mid);
1791
1792                 root_sub_used(root, mid->len);
1793                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1794                 /* once for the root ptr */
1795                 free_extent_buffer_stale(mid);
1796                 return 0;
1797         }
1798         if (btrfs_header_nritems(mid) >
1799             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1800                 return 0;
1801
1802         left = read_node_slot(root, parent, pslot - 1);
1803         if (left) {
1804                 btrfs_tree_lock(left);
1805                 btrfs_set_lock_blocking(left);
1806                 wret = btrfs_cow_block(trans, root, left,
1807                                        parent, pslot - 1, &left);
1808                 if (wret) {
1809                         ret = wret;
1810                         goto enospc;
1811                 }
1812         }
1813         right = read_node_slot(root, parent, pslot + 1);
1814         if (right) {
1815                 btrfs_tree_lock(right);
1816                 btrfs_set_lock_blocking(right);
1817                 wret = btrfs_cow_block(trans, root, right,
1818                                        parent, pslot + 1, &right);
1819                 if (wret) {
1820                         ret = wret;
1821                         goto enospc;
1822                 }
1823         }
1824
1825         /* first, try to make some room in the middle buffer */
1826         if (left) {
1827                 orig_slot += btrfs_header_nritems(left);
1828                 wret = push_node_left(trans, root, left, mid, 1);
1829                 if (wret < 0)
1830                         ret = wret;
1831         }
1832
1833         /*
1834          * then try to empty the right most buffer into the middle
1835          */
1836         if (right) {
1837                 wret = push_node_left(trans, root, mid, right, 1);
1838                 if (wret < 0 && wret != -ENOSPC)
1839                         ret = wret;
1840                 if (btrfs_header_nritems(right) == 0) {
1841                         clean_tree_block(trans, root, right);
1842                         btrfs_tree_unlock(right);
1843                         del_ptr(root, path, level + 1, pslot + 1);
1844                         root_sub_used(root, right->len);
1845                         btrfs_free_tree_block(trans, root, right, 0, 1);
1846                         free_extent_buffer_stale(right);
1847                         right = NULL;
1848                 } else {
1849                         struct btrfs_disk_key right_key;
1850                         btrfs_node_key(right, &right_key, 0);
1851                         tree_mod_log_set_node_key(root->fs_info, parent,
1852                                                   pslot + 1, 0);
1853                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1854                         btrfs_mark_buffer_dirty(parent);
1855                 }
1856         }
1857         if (btrfs_header_nritems(mid) == 1) {
1858                 /*
1859                  * we're not allowed to leave a node with one item in the
1860                  * tree during a delete.  A deletion from lower in the tree
1861                  * could try to delete the only pointer in this node.
1862                  * So, pull some keys from the left.
1863                  * There has to be a left pointer at this point because
1864                  * otherwise we would have pulled some pointers from the
1865                  * right
1866                  */
1867                 if (!left) {
1868                         ret = -EROFS;
1869                         btrfs_std_error(root->fs_info, ret);
1870                         goto enospc;
1871                 }
1872                 wret = balance_node_right(trans, root, mid, left);
1873                 if (wret < 0) {
1874                         ret = wret;
1875                         goto enospc;
1876                 }
1877                 if (wret == 1) {
1878                         wret = push_node_left(trans, root, left, mid, 1);
1879                         if (wret < 0)
1880                                 ret = wret;
1881                 }
1882                 BUG_ON(wret == 1);
1883         }
1884         if (btrfs_header_nritems(mid) == 0) {
1885                 clean_tree_block(trans, root, mid);
1886                 btrfs_tree_unlock(mid);
1887                 del_ptr(root, path, level + 1, pslot);
1888                 root_sub_used(root, mid->len);
1889                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1890                 free_extent_buffer_stale(mid);
1891                 mid = NULL;
1892         } else {
1893                 /* update the parent key to reflect our changes */
1894                 struct btrfs_disk_key mid_key;
1895                 btrfs_node_key(mid, &mid_key, 0);
1896                 tree_mod_log_set_node_key(root->fs_info, parent,
1897                                           pslot, 0);
1898                 btrfs_set_node_key(parent, &mid_key, pslot);
1899                 btrfs_mark_buffer_dirty(parent);
1900         }
1901
1902         /* update the path */
1903         if (left) {
1904                 if (btrfs_header_nritems(left) > orig_slot) {
1905                         extent_buffer_get(left);
1906                         /* left was locked after cow */
1907                         path->nodes[level] = left;
1908                         path->slots[level + 1] -= 1;
1909                         path->slots[level] = orig_slot;
1910                         if (mid) {
1911                                 btrfs_tree_unlock(mid);
1912                                 free_extent_buffer(mid);
1913                         }
1914                 } else {
1915                         orig_slot -= btrfs_header_nritems(left);
1916                         path->slots[level] = orig_slot;
1917                 }
1918         }
1919         /* double check we haven't messed things up */
1920         if (orig_ptr !=
1921             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1922                 BUG();
1923 enospc:
1924         if (right) {
1925                 btrfs_tree_unlock(right);
1926                 free_extent_buffer(right);
1927         }
1928         if (left) {
1929                 if (path->nodes[level] != left)
1930                         btrfs_tree_unlock(left);
1931                 free_extent_buffer(left);
1932         }
1933         return ret;
1934 }
1935
1936 /* Node balancing for insertion.  Here we only split or push nodes around
1937  * when they are completely full.  This is also done top down, so we
1938  * have to be pessimistic.
1939  */
1940 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1941                                           struct btrfs_root *root,
1942                                           struct btrfs_path *path, int level)
1943 {
1944         struct extent_buffer *right = NULL;
1945         struct extent_buffer *mid;
1946         struct extent_buffer *left = NULL;
1947         struct extent_buffer *parent = NULL;
1948         int ret = 0;
1949         int wret;
1950         int pslot;
1951         int orig_slot = path->slots[level];
1952
1953         if (level == 0)
1954                 return 1;
1955
1956         mid = path->nodes[level];
1957         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1958
1959         if (level < BTRFS_MAX_LEVEL - 1) {
1960                 parent = path->nodes[level + 1];
1961                 pslot = path->slots[level + 1];
1962         }
1963
1964         if (!parent)
1965                 return 1;
1966
1967         left = read_node_slot(root, parent, pslot - 1);
1968
1969         /* first, try to make some room in the middle buffer */
1970         if (left) {
1971                 u32 left_nr;
1972
1973                 btrfs_tree_lock(left);
1974                 btrfs_set_lock_blocking(left);
1975
1976                 left_nr = btrfs_header_nritems(left);
1977                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1978                         wret = 1;
1979                 } else {
1980                         ret = btrfs_cow_block(trans, root, left, parent,
1981                                               pslot - 1, &left);
1982                         if (ret)
1983                                 wret = 1;
1984                         else {
1985                                 wret = push_node_left(trans, root,
1986                                                       left, mid, 0);
1987                         }
1988                 }
1989                 if (wret < 0)
1990                         ret = wret;
1991                 if (wret == 0) {
1992                         struct btrfs_disk_key disk_key;
1993                         orig_slot += left_nr;
1994                         btrfs_node_key(mid, &disk_key, 0);
1995                         tree_mod_log_set_node_key(root->fs_info, parent,
1996                                                   pslot, 0);
1997                         btrfs_set_node_key(parent, &disk_key, pslot);
1998                         btrfs_mark_buffer_dirty(parent);
1999                         if (btrfs_header_nritems(left) > orig_slot) {
2000                                 path->nodes[level] = left;
2001                                 path->slots[level + 1] -= 1;
2002                                 path->slots[level] = orig_slot;
2003                                 btrfs_tree_unlock(mid);
2004                                 free_extent_buffer(mid);
2005                         } else {
2006                                 orig_slot -=
2007                                         btrfs_header_nritems(left);
2008                                 path->slots[level] = orig_slot;
2009                                 btrfs_tree_unlock(left);
2010                                 free_extent_buffer(left);
2011                         }
2012                         return 0;
2013                 }
2014                 btrfs_tree_unlock(left);
2015                 free_extent_buffer(left);
2016         }
2017         right = read_node_slot(root, parent, pslot + 1);
2018
2019         /*
2020          * then try to empty the right most buffer into the middle
2021          */
2022         if (right) {
2023                 u32 right_nr;
2024
2025                 btrfs_tree_lock(right);
2026                 btrfs_set_lock_blocking(right);
2027
2028                 right_nr = btrfs_header_nritems(right);
2029                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2030                         wret = 1;
2031                 } else {
2032                         ret = btrfs_cow_block(trans, root, right,
2033                                               parent, pslot + 1,
2034                                               &right);
2035                         if (ret)
2036                                 wret = 1;
2037                         else {
2038                                 wret = balance_node_right(trans, root,
2039                                                           right, mid);
2040                         }
2041                 }
2042                 if (wret < 0)
2043                         ret = wret;
2044                 if (wret == 0) {
2045                         struct btrfs_disk_key disk_key;
2046
2047                         btrfs_node_key(right, &disk_key, 0);
2048                         tree_mod_log_set_node_key(root->fs_info, parent,
2049                                                   pslot + 1, 0);
2050                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2051                         btrfs_mark_buffer_dirty(parent);
2052
2053                         if (btrfs_header_nritems(mid) <= orig_slot) {
2054                                 path->nodes[level] = right;
2055                                 path->slots[level + 1] += 1;
2056                                 path->slots[level] = orig_slot -
2057                                         btrfs_header_nritems(mid);
2058                                 btrfs_tree_unlock(mid);
2059                                 free_extent_buffer(mid);
2060                         } else {
2061                                 btrfs_tree_unlock(right);
2062                                 free_extent_buffer(right);
2063                         }
2064                         return 0;
2065                 }
2066                 btrfs_tree_unlock(right);
2067                 free_extent_buffer(right);
2068         }
2069         return 1;
2070 }
2071
2072 /*
2073  * readahead one full node of leaves, finding things that are close
2074  * to the block in 'slot', and triggering ra on them.
2075  */
2076 static void reada_for_search(struct btrfs_root *root,
2077                              struct btrfs_path *path,
2078                              int level, int slot, u64 objectid)
2079 {
2080         struct extent_buffer *node;
2081         struct btrfs_disk_key disk_key;
2082         u32 nritems;
2083         u64 search;
2084         u64 target;
2085         u64 nread = 0;
2086         u64 gen;
2087         int direction = path->reada;
2088         struct extent_buffer *eb;
2089         u32 nr;
2090         u32 blocksize;
2091         u32 nscan = 0;
2092
2093         if (level != 1)
2094                 return;
2095
2096         if (!path->nodes[level])
2097                 return;
2098
2099         node = path->nodes[level];
2100
2101         search = btrfs_node_blockptr(node, slot);
2102         blocksize = btrfs_level_size(root, level - 1);
2103         eb = btrfs_find_tree_block(root, search, blocksize);
2104         if (eb) {
2105                 free_extent_buffer(eb);
2106                 return;
2107         }
2108
2109         target = search;
2110
2111         nritems = btrfs_header_nritems(node);
2112         nr = slot;
2113
2114         while (1) {
2115                 if (direction < 0) {
2116                         if (nr == 0)
2117                                 break;
2118                         nr--;
2119                 } else if (direction > 0) {
2120                         nr++;
2121                         if (nr >= nritems)
2122                                 break;
2123                 }
2124                 if (path->reada < 0 && objectid) {
2125                         btrfs_node_key(node, &disk_key, nr);
2126                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2127                                 break;
2128                 }
2129                 search = btrfs_node_blockptr(node, nr);
2130                 if ((search <= target && target - search <= 65536) ||
2131                     (search > target && search - target <= 65536)) {
2132                         gen = btrfs_node_ptr_generation(node, nr);
2133                         readahead_tree_block(root, search, blocksize, gen);
2134                         nread += blocksize;
2135                 }
2136                 nscan++;
2137                 if ((nread > 65536 || nscan > 32))
2138                         break;
2139         }
2140 }
2141
2142 static noinline void reada_for_balance(struct btrfs_root *root,
2143                                        struct btrfs_path *path, int level)
2144 {
2145         int slot;
2146         int nritems;
2147         struct extent_buffer *parent;
2148         struct extent_buffer *eb;
2149         u64 gen;
2150         u64 block1 = 0;
2151         u64 block2 = 0;
2152         int blocksize;
2153
2154         parent = path->nodes[level + 1];
2155         if (!parent)
2156                 return;
2157
2158         nritems = btrfs_header_nritems(parent);
2159         slot = path->slots[level + 1];
2160         blocksize = btrfs_level_size(root, level);
2161
2162         if (slot > 0) {
2163                 block1 = btrfs_node_blockptr(parent, slot - 1);
2164                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2165                 eb = btrfs_find_tree_block(root, block1, blocksize);
2166                 /*
2167                  * if we get -eagain from btrfs_buffer_uptodate, we
2168                  * don't want to return eagain here.  That will loop
2169                  * forever
2170                  */
2171                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2172                         block1 = 0;
2173                 free_extent_buffer(eb);
2174         }
2175         if (slot + 1 < nritems) {
2176                 block2 = btrfs_node_blockptr(parent, slot + 1);
2177                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2178                 eb = btrfs_find_tree_block(root, block2, blocksize);
2179                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2180                         block2 = 0;
2181                 free_extent_buffer(eb);
2182         }
2183
2184         if (block1)
2185                 readahead_tree_block(root, block1, blocksize, 0);
2186         if (block2)
2187                 readahead_tree_block(root, block2, blocksize, 0);
2188 }
2189
2190
2191 /*
2192  * when we walk down the tree, it is usually safe to unlock the higher layers
2193  * in the tree.  The exceptions are when our path goes through slot 0, because
2194  * operations on the tree might require changing key pointers higher up in the
2195  * tree.
2196  *
2197  * callers might also have set path->keep_locks, which tells this code to keep
2198  * the lock if the path points to the last slot in the block.  This is part of
2199  * walking through the tree, and selecting the next slot in the higher block.
2200  *
2201  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2202  * if lowest_unlock is 1, level 0 won't be unlocked
2203  */
2204 static noinline void unlock_up(struct btrfs_path *path, int level,
2205                                int lowest_unlock, int min_write_lock_level,
2206                                int *write_lock_level)
2207 {
2208         int i;
2209         int skip_level = level;
2210         int no_skips = 0;
2211         struct extent_buffer *t;
2212
2213         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2214                 if (!path->nodes[i])
2215                         break;
2216                 if (!path->locks[i])
2217                         break;
2218                 if (!no_skips && path->slots[i] == 0) {
2219                         skip_level = i + 1;
2220                         continue;
2221                 }
2222                 if (!no_skips && path->keep_locks) {
2223                         u32 nritems;
2224                         t = path->nodes[i];
2225                         nritems = btrfs_header_nritems(t);
2226                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2227                                 skip_level = i + 1;
2228                                 continue;
2229                         }
2230                 }
2231                 if (skip_level < i && i >= lowest_unlock)
2232                         no_skips = 1;
2233
2234                 t = path->nodes[i];
2235                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2236                         btrfs_tree_unlock_rw(t, path->locks[i]);
2237                         path->locks[i] = 0;
2238                         if (write_lock_level &&
2239                             i > min_write_lock_level &&
2240                             i <= *write_lock_level) {
2241                                 *write_lock_level = i - 1;
2242                         }
2243                 }
2244         }
2245 }
2246
2247 /*
2248  * This releases any locks held in the path starting at level and
2249  * going all the way up to the root.
2250  *
2251  * btrfs_search_slot will keep the lock held on higher nodes in a few
2252  * corner cases, such as COW of the block at slot zero in the node.  This
2253  * ignores those rules, and it should only be called when there are no
2254  * more updates to be done higher up in the tree.
2255  */
2256 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2257 {
2258         int i;
2259
2260         if (path->keep_locks)
2261                 return;
2262
2263         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2264                 if (!path->nodes[i])
2265                         continue;
2266                 if (!path->locks[i])
2267                         continue;
2268                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2269                 path->locks[i] = 0;
2270         }
2271 }
2272
2273 /*
2274  * helper function for btrfs_search_slot.  The goal is to find a block
2275  * in cache without setting the path to blocking.  If we find the block
2276  * we return zero and the path is unchanged.
2277  *
2278  * If we can't find the block, we set the path blocking and do some
2279  * reada.  -EAGAIN is returned and the search must be repeated.
2280  */
2281 static int
2282 read_block_for_search(struct btrfs_trans_handle *trans,
2283                        struct btrfs_root *root, struct btrfs_path *p,
2284                        struct extent_buffer **eb_ret, int level, int slot,
2285                        struct btrfs_key *key, u64 time_seq)
2286 {
2287         u64 blocknr;
2288         u64 gen;
2289         u32 blocksize;
2290         struct extent_buffer *b = *eb_ret;
2291         struct extent_buffer *tmp;
2292         int ret;
2293
2294         blocknr = btrfs_node_blockptr(b, slot);
2295         gen = btrfs_node_ptr_generation(b, slot);
2296         blocksize = btrfs_level_size(root, level - 1);
2297
2298         tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2299         if (tmp) {
2300                 /* first we do an atomic uptodate check */
2301                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2302                         *eb_ret = tmp;
2303                         return 0;
2304                 }
2305
2306                 /* the pages were up to date, but we failed
2307                  * the generation number check.  Do a full
2308                  * read for the generation number that is correct.
2309                  * We must do this without dropping locks so
2310                  * we can trust our generation number
2311                  */
2312                 btrfs_set_path_blocking(p);
2313
2314                 /* now we're allowed to do a blocking uptodate check */
2315                 ret = btrfs_read_buffer(tmp, gen);
2316                 if (!ret) {
2317                         *eb_ret = tmp;
2318                         return 0;
2319                 }
2320                 free_extent_buffer(tmp);
2321                 btrfs_release_path(p);
2322                 return -EIO;
2323         }
2324
2325         /*
2326          * reduce lock contention at high levels
2327          * of the btree by dropping locks before
2328          * we read.  Don't release the lock on the current
2329          * level because we need to walk this node to figure
2330          * out which blocks to read.
2331          */
2332         btrfs_unlock_up_safe(p, level + 1);
2333         btrfs_set_path_blocking(p);
2334
2335         free_extent_buffer(tmp);
2336         if (p->reada)
2337                 reada_for_search(root, p, level, slot, key->objectid);
2338
2339         btrfs_release_path(p);
2340
2341         ret = -EAGAIN;
2342         tmp = read_tree_block(root, blocknr, blocksize, 0);
2343         if (tmp) {
2344                 /*
2345                  * If the read above didn't mark this buffer up to date,
2346                  * it will never end up being up to date.  Set ret to EIO now
2347                  * and give up so that our caller doesn't loop forever
2348                  * on our EAGAINs.
2349                  */
2350                 if (!btrfs_buffer_uptodate(tmp, 0, 0))
2351                         ret = -EIO;
2352                 free_extent_buffer(tmp);
2353         }
2354         return ret;
2355 }
2356
2357 /*
2358  * helper function for btrfs_search_slot.  This does all of the checks
2359  * for node-level blocks and does any balancing required based on
2360  * the ins_len.
2361  *
2362  * If no extra work was required, zero is returned.  If we had to
2363  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2364  * start over
2365  */
2366 static int
2367 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2368                        struct btrfs_root *root, struct btrfs_path *p,
2369                        struct extent_buffer *b, int level, int ins_len,
2370                        int *write_lock_level)
2371 {
2372         int ret;
2373         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2374             BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2375                 int sret;
2376
2377                 if (*write_lock_level < level + 1) {
2378                         *write_lock_level = level + 1;
2379                         btrfs_release_path(p);
2380                         goto again;
2381                 }
2382
2383                 btrfs_set_path_blocking(p);
2384                 reada_for_balance(root, p, level);
2385                 sret = split_node(trans, root, p, level);
2386                 btrfs_clear_path_blocking(p, NULL, 0);
2387
2388                 BUG_ON(sret > 0);
2389                 if (sret) {
2390                         ret = sret;
2391                         goto done;
2392                 }
2393                 b = p->nodes[level];
2394         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2395                    BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2396                 int sret;
2397
2398                 if (*write_lock_level < level + 1) {
2399                         *write_lock_level = level + 1;
2400                         btrfs_release_path(p);
2401                         goto again;
2402                 }
2403
2404                 btrfs_set_path_blocking(p);
2405                 reada_for_balance(root, p, level);
2406                 sret = balance_level(trans, root, p, level);
2407                 btrfs_clear_path_blocking(p, NULL, 0);
2408
2409                 if (sret) {
2410                         ret = sret;
2411                         goto done;
2412                 }
2413                 b = p->nodes[level];
2414                 if (!b) {
2415                         btrfs_release_path(p);
2416                         goto again;
2417                 }
2418                 BUG_ON(btrfs_header_nritems(b) == 1);
2419         }
2420         return 0;
2421
2422 again:
2423         ret = -EAGAIN;
2424 done:
2425         return ret;
2426 }
2427
2428 /*
2429  * look for key in the tree.  path is filled in with nodes along the way
2430  * if key is found, we return zero and you can find the item in the leaf
2431  * level of the path (level 0)
2432  *
2433  * If the key isn't found, the path points to the slot where it should
2434  * be inserted, and 1 is returned.  If there are other errors during the
2435  * search a negative error number is returned.
2436  *
2437  * if ins_len > 0, nodes and leaves will be split as we walk down the
2438  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2439  * possible)
2440  */
2441 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2442                       *root, struct btrfs_key *key, struct btrfs_path *p, int
2443                       ins_len, int cow)
2444 {
2445         struct extent_buffer *b;
2446         int slot;
2447         int ret;
2448         int err;
2449         int level;
2450         int lowest_unlock = 1;
2451         int root_lock;
2452         /* everything at write_lock_level or lower must be write locked */
2453         int write_lock_level = 0;
2454         u8 lowest_level = 0;
2455         int min_write_lock_level;
2456
2457         lowest_level = p->lowest_level;
2458         WARN_ON(lowest_level && ins_len > 0);
2459         WARN_ON(p->nodes[0] != NULL);
2460
2461         if (ins_len < 0) {
2462                 lowest_unlock = 2;
2463
2464                 /* when we are removing items, we might have to go up to level
2465                  * two as we update tree pointers  Make sure we keep write
2466                  * for those levels as well
2467                  */
2468                 write_lock_level = 2;
2469         } else if (ins_len > 0) {
2470                 /*
2471                  * for inserting items, make sure we have a write lock on
2472                  * level 1 so we can update keys
2473                  */
2474                 write_lock_level = 1;
2475         }
2476
2477         if (!cow)
2478                 write_lock_level = -1;
2479
2480         if (cow && (p->keep_locks || p->lowest_level))
2481                 write_lock_level = BTRFS_MAX_LEVEL;
2482
2483         min_write_lock_level = write_lock_level;
2484
2485 again:
2486         /*
2487          * we try very hard to do read locks on the root
2488          */
2489         root_lock = BTRFS_READ_LOCK;
2490         level = 0;
2491         if (p->search_commit_root) {
2492                 /*
2493                  * the commit roots are read only
2494                  * so we always do read locks
2495                  */
2496                 b = root->commit_root;
2497                 extent_buffer_get(b);
2498                 level = btrfs_header_level(b);
2499                 if (!p->skip_locking)
2500                         btrfs_tree_read_lock(b);
2501         } else {
2502                 if (p->skip_locking) {
2503                         b = btrfs_root_node(root);
2504                         level = btrfs_header_level(b);
2505                 } else {
2506                         /* we don't know the level of the root node
2507                          * until we actually have it read locked
2508                          */
2509                         b = btrfs_read_lock_root_node(root);
2510                         level = btrfs_header_level(b);
2511                         if (level <= write_lock_level) {
2512                                 /* whoops, must trade for write lock */
2513                                 btrfs_tree_read_unlock(b);
2514                                 free_extent_buffer(b);
2515                                 b = btrfs_lock_root_node(root);
2516                                 root_lock = BTRFS_WRITE_LOCK;
2517
2518                                 /* the level might have changed, check again */
2519                                 level = btrfs_header_level(b);
2520                         }
2521                 }
2522         }
2523         p->nodes[level] = b;
2524         if (!p->skip_locking)
2525                 p->locks[level] = root_lock;
2526
2527         while (b) {
2528                 level = btrfs_header_level(b);
2529
2530                 /*
2531                  * setup the path here so we can release it under lock
2532                  * contention with the cow code
2533                  */
2534                 if (cow) {
2535                         /*
2536                          * if we don't really need to cow this block
2537                          * then we don't want to set the path blocking,
2538                          * so we test it here
2539                          */
2540                         if (!should_cow_block(trans, root, b))
2541                                 goto cow_done;
2542
2543                         btrfs_set_path_blocking(p);
2544
2545                         /*
2546                          * must have write locks on this node and the
2547                          * parent
2548                          */
2549                         if (level > write_lock_level ||
2550                             (level + 1 > write_lock_level &&
2551                             level + 1 < BTRFS_MAX_LEVEL &&
2552                             p->nodes[level + 1])) {
2553                                 write_lock_level = level + 1;
2554                                 btrfs_release_path(p);
2555                                 goto again;
2556                         }
2557
2558                         err = btrfs_cow_block(trans, root, b,
2559                                               p->nodes[level + 1],
2560                                               p->slots[level + 1], &b);
2561                         if (err) {
2562                                 ret = err;
2563                                 goto done;
2564                         }
2565                 }
2566 cow_done:
2567                 BUG_ON(!cow && ins_len);
2568
2569                 p->nodes[level] = b;
2570                 btrfs_clear_path_blocking(p, NULL, 0);
2571
2572                 /*
2573                  * we have a lock on b and as long as we aren't changing
2574                  * the tree, there is no way to for the items in b to change.
2575                  * It is safe to drop the lock on our parent before we
2576                  * go through the expensive btree search on b.
2577                  *
2578                  * If cow is true, then we might be changing slot zero,
2579                  * which may require changing the parent.  So, we can't
2580                  * drop the lock until after we know which slot we're
2581                  * operating on.
2582                  */
2583                 if (!cow)
2584                         btrfs_unlock_up_safe(p, level + 1);
2585
2586                 ret = bin_search(b, key, level, &slot);
2587
2588                 if (level != 0) {
2589                         int dec = 0;
2590                         if (ret && slot > 0) {
2591                                 dec = 1;
2592                                 slot -= 1;
2593                         }
2594                         p->slots[level] = slot;
2595                         err = setup_nodes_for_search(trans, root, p, b, level,
2596                                              ins_len, &write_lock_level);
2597                         if (err == -EAGAIN)
2598                                 goto again;
2599                         if (err) {
2600                                 ret = err;
2601                                 goto done;
2602                         }
2603                         b = p->nodes[level];
2604                         slot = p->slots[level];
2605
2606                         /*
2607                          * slot 0 is special, if we change the key
2608                          * we have to update the parent pointer
2609                          * which means we must have a write lock
2610                          * on the parent
2611                          */
2612                         if (slot == 0 && cow &&
2613                             write_lock_level < level + 1) {
2614                                 write_lock_level = level + 1;
2615                                 btrfs_release_path(p);
2616                                 goto again;
2617                         }
2618
2619                         unlock_up(p, level, lowest_unlock,
2620                                   min_write_lock_level, &write_lock_level);
2621
2622                         if (level == lowest_level) {
2623                                 if (dec)
2624                                         p->slots[level]++;
2625                                 goto done;
2626                         }
2627
2628                         err = read_block_for_search(trans, root, p,
2629                                                     &b, level, slot, key, 0);
2630                         if (err == -EAGAIN)
2631                                 goto again;
2632                         if (err) {
2633                                 ret = err;
2634                                 goto done;
2635                         }
2636
2637                         if (!p->skip_locking) {
2638                                 level = btrfs_header_level(b);
2639                                 if (level <= write_lock_level) {
2640                                         err = btrfs_try_tree_write_lock(b);
2641                                         if (!err) {
2642                                                 btrfs_set_path_blocking(p);
2643                                                 btrfs_tree_lock(b);
2644                                                 btrfs_clear_path_blocking(p, b,
2645                                                                   BTRFS_WRITE_LOCK);
2646                                         }
2647                                         p->locks[level] = BTRFS_WRITE_LOCK;
2648                                 } else {
2649                                         err = btrfs_try_tree_read_lock(b);
2650                                         if (!err) {
2651                                                 btrfs_set_path_blocking(p);
2652                                                 btrfs_tree_read_lock(b);
2653                                                 btrfs_clear_path_blocking(p, b,
2654                                                                   BTRFS_READ_LOCK);
2655                                         }
2656                                         p->locks[level] = BTRFS_READ_LOCK;
2657                                 }
2658                                 p->nodes[level] = b;
2659                         }
2660                 } else {
2661                         p->slots[level] = slot;
2662                         if (ins_len > 0 &&
2663                             btrfs_leaf_free_space(root, b) < ins_len) {
2664                                 if (write_lock_level < 1) {
2665                                         write_lock_level = 1;
2666                                         btrfs_release_path(p);
2667                                         goto again;
2668                                 }
2669
2670                                 btrfs_set_path_blocking(p);
2671                                 err = split_leaf(trans, root, key,
2672                                                  p, ins_len, ret == 0);
2673                                 btrfs_clear_path_blocking(p, NULL, 0);
2674
2675                                 BUG_ON(err > 0);
2676                                 if (err) {
2677                                         ret = err;
2678                                         goto done;
2679                                 }
2680                         }
2681                         if (!p->search_for_split)
2682                                 unlock_up(p, level, lowest_unlock,
2683                                           min_write_lock_level, &write_lock_level);
2684                         goto done;
2685                 }
2686         }
2687         ret = 1;
2688 done:
2689         /*
2690          * we don't really know what they plan on doing with the path
2691          * from here on, so for now just mark it as blocking
2692          */
2693         if (!p->leave_spinning)
2694                 btrfs_set_path_blocking(p);
2695         if (ret < 0)
2696                 btrfs_release_path(p);
2697         return ret;
2698 }
2699
2700 /*
2701  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2702  * current state of the tree together with the operations recorded in the tree
2703  * modification log to search for the key in a previous version of this tree, as
2704  * denoted by the time_seq parameter.
2705  *
2706  * Naturally, there is no support for insert, delete or cow operations.
2707  *
2708  * The resulting path and return value will be set up as if we called
2709  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2710  */
2711 int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2712                           struct btrfs_path *p, u64 time_seq)
2713 {
2714         struct extent_buffer *b;
2715         int slot;
2716         int ret;
2717         int err;
2718         int level;
2719         int lowest_unlock = 1;
2720         u8 lowest_level = 0;
2721
2722         lowest_level = p->lowest_level;
2723         WARN_ON(p->nodes[0] != NULL);
2724
2725         if (p->search_commit_root) {
2726                 BUG_ON(time_seq);
2727                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2728         }
2729
2730 again:
2731         b = get_old_root(root, time_seq);
2732         level = btrfs_header_level(b);
2733         p->locks[level] = BTRFS_READ_LOCK;
2734
2735         while (b) {
2736                 level = btrfs_header_level(b);
2737                 p->nodes[level] = b;
2738                 btrfs_clear_path_blocking(p, NULL, 0);
2739
2740                 /*
2741                  * we have a lock on b and as long as we aren't changing
2742                  * the tree, there is no way to for the items in b to change.
2743                  * It is safe to drop the lock on our parent before we
2744                  * go through the expensive btree search on b.
2745                  */
2746                 btrfs_unlock_up_safe(p, level + 1);
2747
2748                 ret = bin_search(b, key, level, &slot);
2749
2750                 if (level != 0) {
2751                         int dec = 0;
2752                         if (ret && slot > 0) {
2753                                 dec = 1;
2754                                 slot -= 1;
2755                         }
2756                         p->slots[level] = slot;
2757                         unlock_up(p, level, lowest_unlock, 0, NULL);
2758
2759                         if (level == lowest_level) {
2760                                 if (dec)
2761                                         p->slots[level]++;
2762                                 goto done;
2763                         }
2764
2765                         err = read_block_for_search(NULL, root, p, &b, level,
2766                                                     slot, key, time_seq);
2767                         if (err == -EAGAIN)
2768                                 goto again;
2769                         if (err) {
2770                                 ret = err;
2771                                 goto done;
2772                         }
2773
2774                         level = btrfs_header_level(b);
2775                         err = btrfs_try_tree_read_lock(b);
2776                         if (!err) {
2777                                 btrfs_set_path_blocking(p);
2778                                 btrfs_tree_read_lock(b);
2779                                 btrfs_clear_path_blocking(p, b,
2780                                                           BTRFS_READ_LOCK);
2781                         }
2782                         b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2783                         if (!b) {
2784                                 ret = -ENOMEM;
2785                                 goto done;
2786                         }
2787                         p->locks[level] = BTRFS_READ_LOCK;
2788                         p->nodes[level] = b;
2789                 } else {
2790                         p->slots[level] = slot;
2791                         unlock_up(p, level, lowest_unlock, 0, NULL);
2792                         goto done;
2793                 }
2794         }
2795         ret = 1;
2796 done:
2797         if (!p->leave_spinning)
2798                 btrfs_set_path_blocking(p);
2799         if (ret < 0)
2800                 btrfs_release_path(p);
2801
2802         return ret;
2803 }
2804
2805 /*
2806  * helper to use instead of search slot if no exact match is needed but
2807  * instead the next or previous item should be returned.
2808  * When find_higher is true, the next higher item is returned, the next lower
2809  * otherwise.
2810  * When return_any and find_higher are both true, and no higher item is found,
2811  * return the next lower instead.
2812  * When return_any is true and find_higher is false, and no lower item is found,
2813  * return the next higher instead.
2814  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2815  * < 0 on error
2816  */
2817 int btrfs_search_slot_for_read(struct btrfs_root *root,
2818                                struct btrfs_key *key, struct btrfs_path *p,
2819                                int find_higher, int return_any)
2820 {
2821         int ret;
2822         struct extent_buffer *leaf;
2823
2824 again:
2825         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2826         if (ret <= 0)
2827                 return ret;
2828         /*
2829          * a return value of 1 means the path is at the position where the
2830          * item should be inserted. Normally this is the next bigger item,
2831          * but in case the previous item is the last in a leaf, path points
2832          * to the first free slot in the previous leaf, i.e. at an invalid
2833          * item.
2834          */
2835         leaf = p->nodes[0];
2836
2837         if (find_higher) {
2838                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2839                         ret = btrfs_next_leaf(root, p);
2840                         if (ret <= 0)
2841                                 return ret;
2842                         if (!return_any)
2843                                 return 1;
2844                         /*
2845                          * no higher item found, return the next
2846                          * lower instead
2847                          */
2848                         return_any = 0;
2849                         find_higher = 0;
2850                         btrfs_release_path(p);
2851                         goto again;
2852                 }
2853         } else {
2854                 if (p->slots[0] == 0) {
2855                         ret = btrfs_prev_leaf(root, p);
2856                         if (ret < 0)
2857                                 return ret;
2858                         if (!ret) {
2859                                 p->slots[0] = btrfs_header_nritems(leaf) - 1;
2860                                 return 0;
2861                         }
2862                         if (!return_any)
2863                                 return 1;
2864                         /*
2865                          * no lower item found, return the next
2866                          * higher instead
2867                          */
2868                         return_any = 0;
2869                         find_higher = 1;
2870                         btrfs_release_path(p);
2871                         goto again;
2872                 } else {
2873                         --p->slots[0];
2874                 }
2875         }
2876         return 0;
2877 }
2878
2879 /*
2880  * adjust the pointers going up the tree, starting at level
2881  * making sure the right key of each node is points to 'key'.
2882  * This is used after shifting pointers to the left, so it stops
2883  * fixing up pointers when a given leaf/node is not in slot 0 of the
2884  * higher levels
2885  *
2886  */
2887 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
2888                            struct btrfs_disk_key *key, int level)
2889 {
2890         int i;
2891         struct extent_buffer *t;
2892
2893         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2894                 int tslot = path->slots[i];
2895                 if (!path->nodes[i])
2896                         break;
2897                 t = path->nodes[i];
2898                 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2899                 btrfs_set_node_key(t, key, tslot);
2900                 btrfs_mark_buffer_dirty(path->nodes[i]);
2901                 if (tslot != 0)
2902                         break;
2903         }
2904 }
2905
2906 /*
2907  * update item key.
2908  *
2909  * This function isn't completely safe. It's the caller's responsibility
2910  * that the new key won't break the order
2911  */
2912 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
2913                              struct btrfs_key *new_key)
2914 {
2915         struct btrfs_disk_key disk_key;
2916         struct extent_buffer *eb;
2917         int slot;
2918
2919         eb = path->nodes[0];
2920         slot = path->slots[0];
2921         if (slot > 0) {
2922                 btrfs_item_key(eb, &disk_key, slot - 1);
2923                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2924         }
2925         if (slot < btrfs_header_nritems(eb) - 1) {
2926                 btrfs_item_key(eb, &disk_key, slot + 1);
2927                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2928         }
2929
2930         btrfs_cpu_key_to_disk(&disk_key, new_key);
2931         btrfs_set_item_key(eb, &disk_key, slot);
2932         btrfs_mark_buffer_dirty(eb);
2933         if (slot == 0)
2934                 fixup_low_keys(root, path, &disk_key, 1);
2935 }
2936
2937 /*
2938  * try to push data from one node into the next node left in the
2939  * tree.
2940  *
2941  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2942  * error, and > 0 if there was no room in the left hand block.
2943  */
2944 static int push_node_left(struct btrfs_trans_handle *trans,
2945                           struct btrfs_root *root, struct extent_buffer *dst,
2946                           struct extent_buffer *src, int empty)
2947 {
2948         int push_items = 0;
2949         int src_nritems;
2950         int dst_nritems;
2951         int ret = 0;
2952
2953         src_nritems = btrfs_header_nritems(src);
2954         dst_nritems = btrfs_header_nritems(dst);
2955         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2956         WARN_ON(btrfs_header_generation(src) != trans->transid);
2957         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2958
2959         if (!empty && src_nritems <= 8)
2960                 return 1;
2961
2962         if (push_items <= 0)
2963                 return 1;
2964
2965         if (empty) {
2966                 push_items = min(src_nritems, push_items);
2967                 if (push_items < src_nritems) {
2968                         /* leave at least 8 pointers in the node if
2969                          * we aren't going to empty it
2970                          */
2971                         if (src_nritems - push_items < 8) {
2972                                 if (push_items <= 8)
2973                                         return 1;
2974                                 push_items -= 8;
2975                         }
2976                 }
2977         } else
2978                 push_items = min(src_nritems - 8, push_items);
2979
2980         tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
2981                              push_items);
2982         copy_extent_buffer(dst, src,
2983                            btrfs_node_key_ptr_offset(dst_nritems),
2984                            btrfs_node_key_ptr_offset(0),
2985                            push_items * sizeof(struct btrfs_key_ptr));
2986
2987         if (push_items < src_nritems) {
2988                 /*
2989                  * don't call tree_mod_log_eb_move here, key removal was already
2990                  * fully logged by tree_mod_log_eb_copy above.
2991                  */
2992                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2993                                       btrfs_node_key_ptr_offset(push_items),
2994                                       (src_nritems - push_items) *
2995                                       sizeof(struct btrfs_key_ptr));
2996         }
2997         btrfs_set_header_nritems(src, src_nritems - push_items);
2998         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2999         btrfs_mark_buffer_dirty(src);
3000         btrfs_mark_buffer_dirty(dst);
3001
3002         return ret;
3003 }
3004
3005 /*
3006  * try to push data from one node into the next node right in the
3007  * tree.
3008  *
3009  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3010  * error, and > 0 if there was no room in the right hand block.
3011  *
3012  * this will  only push up to 1/2 the contents of the left node over
3013  */
3014 static int balance_node_right(struct btrfs_trans_handle *trans,
3015                               struct btrfs_root *root,
3016                               struct extent_buffer *dst,
3017                               struct extent_buffer *src)
3018 {
3019         int push_items = 0;
3020         int max_push;
3021         int src_nritems;
3022         int dst_nritems;
3023         int ret = 0;
3024
3025         WARN_ON(btrfs_header_generation(src) != trans->transid);
3026         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3027
3028         src_nritems = btrfs_header_nritems(src);
3029         dst_nritems = btrfs_header_nritems(dst);
3030         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3031         if (push_items <= 0)
3032                 return 1;
3033
3034         if (src_nritems < 4)
3035                 return 1;
3036
3037         max_push = src_nritems / 2 + 1;
3038         /* don't try to empty the node */
3039         if (max_push >= src_nritems)
3040                 return 1;
3041
3042         if (max_push < push_items)
3043                 push_items = max_push;
3044
3045         tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3046         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3047                                       btrfs_node_key_ptr_offset(0),
3048                                       (dst_nritems) *
3049                                       sizeof(struct btrfs_key_ptr));
3050
3051         tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3052                              src_nritems - push_items, push_items);
3053         copy_extent_buffer(dst, src,
3054                            btrfs_node_key_ptr_offset(0),
3055                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3056                            push_items * sizeof(struct btrfs_key_ptr));
3057
3058         btrfs_set_header_nritems(src, src_nritems - push_items);
3059         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3060
3061         btrfs_mark_buffer_dirty(src);
3062         btrfs_mark_buffer_dirty(dst);
3063
3064         return ret;
3065 }
3066
3067 /*
3068  * helper function to insert a new root level in the tree.
3069  * A new node is allocated, and a single item is inserted to
3070  * point to the existing root
3071  *
3072  * returns zero on success or < 0 on failure.
3073  */
3074 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3075                            struct btrfs_root *root,
3076                            struct btrfs_path *path, int level)
3077 {
3078         u64 lower_gen;
3079         struct extent_buffer *lower;
3080         struct extent_buffer *c;
3081         struct extent_buffer *old;
3082         struct btrfs_disk_key lower_key;
3083
3084         BUG_ON(path->nodes[level]);
3085         BUG_ON(path->nodes[level-1] != root->node);
3086
3087         lower = path->nodes[level-1];
3088         if (level == 1)
3089                 btrfs_item_key(lower, &lower_key, 0);
3090         else
3091                 btrfs_node_key(lower, &lower_key, 0);
3092
3093         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3094                                    root->root_key.objectid, &lower_key,
3095                                    level, root->node->start, 0);
3096         if (IS_ERR(c))
3097                 return PTR_ERR(c);
3098
3099         root_add_used(root, root->nodesize);
3100
3101         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3102         btrfs_set_header_nritems(c, 1);
3103         btrfs_set_header_level(c, level);
3104         btrfs_set_header_bytenr(c, c->start);
3105         btrfs_set_header_generation(c, trans->transid);
3106         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3107         btrfs_set_header_owner(c, root->root_key.objectid);
3108
3109         write_extent_buffer(c, root->fs_info->fsid,
3110                             (unsigned long)btrfs_header_fsid(c),
3111                             BTRFS_FSID_SIZE);
3112
3113         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3114                             (unsigned long)btrfs_header_chunk_tree_uuid(c),
3115                             BTRFS_UUID_SIZE);
3116
3117         btrfs_set_node_key(c, &lower_key, 0);
3118         btrfs_set_node_blockptr(c, 0, lower->start);
3119         lower_gen = btrfs_header_generation(lower);
3120         WARN_ON(lower_gen != trans->transid);
3121
3122         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3123
3124         btrfs_mark_buffer_dirty(c);
3125
3126         old = root->node;
3127         tree_mod_log_set_root_pointer(root, c, 0);
3128         rcu_assign_pointer(root->node, c);
3129
3130         /* the super has an extra ref to root->node */
3131         free_extent_buffer(old);
3132
3133         add_root_to_dirty_list(root);
3134         extent_buffer_get(c);
3135         path->nodes[level] = c;
3136         path->locks[level] = BTRFS_WRITE_LOCK;
3137         path->slots[level] = 0;
3138         return 0;
3139 }
3140
3141 /*
3142  * worker function to insert a single pointer in a node.
3143  * the node should have enough room for the pointer already
3144  *
3145  * slot and level indicate where you want the key to go, and
3146  * blocknr is the block the key points to.
3147  */
3148 static void insert_ptr(struct btrfs_trans_handle *trans,
3149                        struct btrfs_root *root, struct btrfs_path *path,
3150                        struct btrfs_disk_key *key, u64 bytenr,
3151                        int slot, int level)
3152 {
3153         struct extent_buffer *lower;
3154         int nritems;
3155         int ret;
3156
3157         BUG_ON(!path->nodes[level]);
3158         btrfs_assert_tree_locked(path->nodes[level]);
3159         lower = path->nodes[level];
3160         nritems = btrfs_header_nritems(lower);
3161         BUG_ON(slot > nritems);
3162         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3163         if (slot != nritems) {
3164                 if (level)
3165                         tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3166                                              slot, nritems - slot);
3167                 memmove_extent_buffer(lower,
3168                               btrfs_node_key_ptr_offset(slot + 1),
3169                               btrfs_node_key_ptr_offset(slot),
3170                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3171         }
3172         if (level) {
3173                 ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3174                                               MOD_LOG_KEY_ADD, GFP_NOFS);
3175                 BUG_ON(ret < 0);
3176         }
3177         btrfs_set_node_key(lower, key, slot);
3178         btrfs_set_node_blockptr(lower, slot, bytenr);
3179         WARN_ON(trans->transid == 0);
3180         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3181         btrfs_set_header_nritems(lower, nritems + 1);
3182         btrfs_mark_buffer_dirty(lower);
3183 }
3184
3185 /*
3186  * split the node at the specified level in path in two.
3187  * The path is corrected to point to the appropriate node after the split
3188  *
3189  * Before splitting this tries to make some room in the node by pushing
3190  * left and right, if either one works, it returns right away.
3191  *
3192  * returns 0 on success and < 0 on failure
3193  */
3194 static noinline int split_node(struct btrfs_trans_handle *trans,
3195                                struct btrfs_root *root,
3196                                struct btrfs_path *path, int level)
3197 {
3198         struct extent_buffer *c;
3199         struct extent_buffer *split;
3200         struct btrfs_disk_key disk_key;
3201         int mid;
3202         int ret;
3203         u32 c_nritems;
3204
3205         c = path->nodes[level];
3206         WARN_ON(btrfs_header_generation(c) != trans->transid);
3207         if (c == root->node) {
3208                 /*
3209                  * trying to split the root, lets make a new one
3210                  *
3211                  * tree mod log: We don't log_removal old root in
3212                  * insert_new_root, because that root buffer will be kept as a
3213                  * normal node. We are going to log removal of half of the
3214                  * elements below with tree_mod_log_eb_copy. We're holding a
3215                  * tree lock on the buffer, which is why we cannot race with
3216                  * other tree_mod_log users.
3217                  */
3218                 ret = insert_new_root(trans, root, path, level + 1);
3219                 if (ret)
3220                         return ret;
3221         } else {
3222                 ret = push_nodes_for_insert(trans, root, path, level);
3223                 c = path->nodes[level];
3224                 if (!ret && btrfs_header_nritems(c) <
3225                     BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3226                         return 0;
3227                 if (ret < 0)
3228                         return ret;
3229         }
3230
3231         c_nritems = btrfs_header_nritems(c);
3232         mid = (c_nritems + 1) / 2;
3233         btrfs_node_key(c, &disk_key, mid);
3234
3235         split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3236                                         root->root_key.objectid,
3237                                         &disk_key, level, c->start, 0);
3238         if (IS_ERR(split))
3239                 return PTR_ERR(split);
3240
3241         root_add_used(root, root->nodesize);
3242
3243         memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3244         btrfs_set_header_level(split, btrfs_header_level(c));
3245         btrfs_set_header_bytenr(split, split->start);
3246         btrfs_set_header_generation(split, trans->transid);
3247         btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3248         btrfs_set_header_owner(split, root->root_key.objectid);
3249         write_extent_buffer(split, root->fs_info->fsid,
3250                             (unsigned long)btrfs_header_fsid(split),
3251                             BTRFS_FSID_SIZE);
3252         write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3253                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
3254                             BTRFS_UUID_SIZE);
3255
3256         tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
3257         copy_extent_buffer(split, c,
3258                            btrfs_node_key_ptr_offset(0),
3259                            btrfs_node_key_ptr_offset(mid),
3260                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3261         btrfs_set_header_nritems(split, c_nritems - mid);
3262         btrfs_set_header_nritems(c, mid);
3263         ret = 0;
3264
3265         btrfs_mark_buffer_dirty(c);
3266         btrfs_mark_buffer_dirty(split);
3267
3268         insert_ptr(trans, root, path, &disk_key, split->start,
3269                    path->slots[level + 1] + 1, level + 1);
3270
3271         if (path->slots[level] >= mid) {
3272                 path->slots[level] -= mid;
3273                 btrfs_tree_unlock(c);
3274                 free_extent_buffer(c);
3275                 path->nodes[level] = split;
3276                 path->slots[level + 1] += 1;
3277         } else {
3278                 btrfs_tree_unlock(split);
3279                 free_extent_buffer(split);
3280         }
3281         return ret;
3282 }
3283
3284 /*
3285  * how many bytes are required to store the items in a leaf.  start
3286  * and nr indicate which items in the leaf to check.  This totals up the
3287  * space used both by the item structs and the item data
3288  */
3289 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3290 {
3291         struct btrfs_item *start_item;
3292         struct btrfs_item *end_item;
3293         struct btrfs_map_token token;
3294         int data_len;
3295         int nritems = btrfs_header_nritems(l);
3296         int end = min(nritems, start + nr) - 1;
3297
3298         if (!nr)
3299                 return 0;
3300         btrfs_init_map_token(&token);
3301         start_item = btrfs_item_nr(l, start);
3302         end_item = btrfs_item_nr(l, end);
3303         data_len = btrfs_token_item_offset(l, start_item, &token) +
3304                 btrfs_token_item_size(l, start_item, &token);
3305         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3306         data_len += sizeof(struct btrfs_item) * nr;
3307         WARN_ON(data_len < 0);
3308         return data_len;
3309 }
3310
3311 /*
3312  * The space between the end of the leaf items and
3313  * the start of the leaf data.  IOW, how much room
3314  * the leaf has left for both items and data
3315  */
3316 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3317                                    struct extent_buffer *leaf)
3318 {
3319         int nritems = btrfs_header_nritems(leaf);
3320         int ret;
3321         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3322         if (ret < 0) {
3323                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3324                        "used %d nritems %d\n",
3325                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3326                        leaf_space_used(leaf, 0, nritems), nritems);
3327         }
3328         return ret;
3329 }
3330
3331 /*
3332  * min slot controls the lowest index we're willing to push to the
3333  * right.  We'll push up to and including min_slot, but no lower
3334  */
3335 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3336                                       struct btrfs_root *root,
3337                                       struct btrfs_path *path,
3338                                       int data_size, int empty,
3339                                       struct extent_buffer *right,
3340                                       int free_space, u32 left_nritems,
3341                                       u32 min_slot)
3342 {
3343         struct extent_buffer *left = path->nodes[0];
3344         struct extent_buffer *upper = path->nodes[1];
3345         struct btrfs_map_token token;
3346         struct btrfs_disk_key disk_key;
3347         int slot;
3348         u32 i;
3349         int push_space = 0;
3350         int push_items = 0;
3351         struct btrfs_item *item;
3352         u32 nr;
3353         u32 right_nritems;
3354         u32 data_end;
3355         u32 this_item_size;
3356
3357         btrfs_init_map_token(&token);
3358
3359         if (empty)
3360                 nr = 0;
3361         else
3362                 nr = max_t(u32, 1, min_slot);
3363
3364         if (path->slots[0] >= left_nritems)
3365                 push_space += data_size;
3366
3367         slot = path->slots[1];
3368         i = left_nritems - 1;
3369         while (i >= nr) {
3370                 item = btrfs_item_nr(left, i);
3371
3372                 if (!empty && push_items > 0) {
3373                         if (path->slots[0] > i)
3374                                 break;
3375                         if (path->slots[0] == i) {
3376                                 int space = btrfs_leaf_free_space(root, left);
3377                                 if (space + push_space * 2 > free_space)
3378                                         break;
3379                         }
3380                 }
3381
3382                 if (path->slots[0] == i)
3383                         push_space += data_size;
3384
3385                 this_item_size = btrfs_item_size(left, item);
3386                 if (this_item_size + sizeof(*item) + push_space > free_space)
3387                         break;
3388
3389                 push_items++;
3390                 push_space += this_item_size + sizeof(*item);
3391                 if (i == 0)
3392                         break;
3393                 i--;
3394         }
3395
3396         if (push_items == 0)
3397                 goto out_unlock;
3398
3399         WARN_ON(!empty && push_items == left_nritems);
3400
3401         /* push left to right */
3402         right_nritems = btrfs_header_nritems(right);
3403
3404         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3405         push_space -= leaf_data_end(root, left);
3406
3407         /* make room in the right data area */
3408         data_end = leaf_data_end(root, right);
3409         memmove_extent_buffer(right,
3410                               btrfs_leaf_data(right) + data_end - push_space,
3411                               btrfs_leaf_data(right) + data_end,
3412                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
3413
3414         /* copy from the left data area */
3415         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3416                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3417                      btrfs_leaf_data(left) + leaf_data_end(root, left),
3418                      push_space);
3419
3420         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3421                               btrfs_item_nr_offset(0),
3422                               right_nritems * sizeof(struct btrfs_item));
3423
3424         /* copy the items from left to right */
3425         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3426                    btrfs_item_nr_offset(left_nritems - push_items),
3427                    push_items * sizeof(struct btrfs_item));
3428
3429         /* update the item pointers */
3430         right_nritems += push_items;
3431         btrfs_set_header_nritems(right, right_nritems);
3432         push_space = BTRFS_LEAF_DATA_SIZE(root);
3433         for (i = 0; i < right_nritems; i++) {
3434                 item = btrfs_item_nr(right, i);
3435                 push_space -= btrfs_token_item_size(right, item, &token);
3436                 btrfs_set_token_item_offset(right, item, push_space, &token);
3437         }
3438
3439         left_nritems -= push_items;
3440         btrfs_set_header_nritems(left, left_nritems);
3441
3442         if (left_nritems)
3443                 btrfs_mark_buffer_dirty(left);
3444         else
3445                 clean_tree_block(trans, root, left);
3446
3447         btrfs_mark_buffer_dirty(right);
3448
3449         btrfs_item_key(right, &disk_key, 0);
3450         btrfs_set_node_key(upper, &disk_key, slot + 1);
3451         btrfs_mark_buffer_dirty(upper);
3452
3453         /* then fixup the leaf pointer in the path */
3454         if (path->slots[0] >= left_nritems) {
3455                 path->slots[0] -= left_nritems;
3456                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3457                         clean_tree_block(trans, root, path->nodes[0]);
3458                 btrfs_tree_unlock(path->nodes[0]);
3459                 free_extent_buffer(path->nodes[0]);
3460                 path->nodes[0] = right;
3461                 path->slots[1] += 1;
3462         } else {
3463                 btrfs_tree_unlock(right);
3464                 free_extent_buffer(right);
3465         }
3466         return 0;
3467
3468 out_unlock:
3469         btrfs_tree_unlock(right);
3470         free_extent_buffer(right);
3471         return 1;
3472 }
3473
3474 /*
3475  * push some data in the path leaf to the right, trying to free up at
3476  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3477  *
3478  * returns 1 if the push failed because the other node didn't have enough
3479  * room, 0 if everything worked out and < 0 if there were major errors.
3480  *
3481  * this will push starting from min_slot to the end of the leaf.  It won't
3482  * push any slot lower than min_slot
3483  */
3484 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3485                            *root, struct btrfs_path *path,
3486                            int min_data_size, int data_size,
3487                            int empty, u32 min_slot)
3488 {
3489         struct extent_buffer *left = path->nodes[0];
3490         struct extent_buffer *right;
3491         struct extent_buffer *upper;
3492         int slot;
3493         int free_space;
3494         u32 left_nritems;
3495         int ret;
3496
3497         if (!path->nodes[1])
3498                 return 1;
3499
3500         slot = path->slots[1];
3501         upper = path->nodes[1];
3502         if (slot >= btrfs_header_nritems(upper) - 1)
3503                 return 1;
3504
3505         btrfs_assert_tree_locked(path->nodes[1]);
3506
3507         right = read_node_slot(root, upper, slot + 1);
3508         if (right == NULL)
3509                 return 1;
3510
3511         btrfs_tree_lock(right);
3512         btrfs_set_lock_blocking(right);
3513
3514         free_space = btrfs_leaf_free_space(root, right);
3515         if (free_space < data_size)
3516                 goto out_unlock;
3517
3518         /* cow and double check */
3519         ret = btrfs_cow_block(trans, root, right, upper,
3520                               slot + 1, &right);
3521         if (ret)
3522                 goto out_unlock;
3523
3524         free_space = btrfs_leaf_free_space(root, right);
3525         if (free_space < data_size)
3526                 goto out_unlock;
3527
3528         left_nritems = btrfs_header_nritems(left);
3529         if (left_nritems == 0)
3530                 goto out_unlock;
3531
3532         return __push_leaf_right(trans, root, path, min_data_size, empty,
3533                                 right, free_space, left_nritems, min_slot);
3534 out_unlock:
3535         btrfs_tree_unlock(right);
3536         free_extent_buffer(right);
3537         return 1;
3538 }
3539
3540 /*
3541  * push some data in the path leaf to the left, trying to free up at
3542  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3543  *
3544  * max_slot can put a limit on how far into the leaf we'll push items.  The
3545  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3546  * items
3547  */
3548 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3549                                      struct btrfs_root *root,
3550                                      struct btrfs_path *path, int data_size,
3551                                      int empty, struct extent_buffer *left,
3552                                      int free_space, u32 right_nritems,
3553                                      u32 max_slot)
3554 {
3555         struct btrfs_disk_key disk_key;
3556         struct extent_buffer *right = path->nodes[0];
3557         int i;
3558         int push_space = 0;
3559         int push_items = 0;
3560         struct btrfs_item *item;
3561         u32 old_left_nritems;
3562         u32 nr;
3563         int ret = 0;
3564         u32 this_item_size;
3565         u32 old_left_item_size;
3566         struct btrfs_map_token token;
3567
3568         btrfs_init_map_token(&token);
3569
3570         if (empty)
3571                 nr = min(right_nritems, max_slot);
3572         else
3573                 nr = min(right_nritems - 1, max_slot);
3574
3575         for (i = 0; i < nr; i++) {
3576                 item = btrfs_item_nr(right, i);
3577
3578                 if (!empty && push_items > 0) {
3579                         if (path->slots[0] < i)
3580                                 break;
3581                         if (path->slots[0] == i) {
3582                                 int space = btrfs_leaf_free_space(root, right);
3583                                 if (space + push_space * 2 > free_space)
3584                                         break;
3585                         }
3586                 }
3587
3588                 if (path->slots[0] == i)
3589                         push_space += data_size;
3590
3591                 this_item_size = btrfs_item_size(right, item);
3592                 if (this_item_size + sizeof(*item) + push_space > free_space)
3593                         break;
3594
3595                 push_items++;
3596                 push_space += this_item_size + sizeof(*item);
3597         }
3598
3599         if (push_items == 0) {
3600                 ret = 1;
3601                 goto out;
3602         }
3603         if (!empty && push_items == btrfs_header_nritems(right))
3604                 WARN_ON(1);
3605
3606         /* push data from right to left */
3607         copy_extent_buffer(left, right,
3608                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3609                            btrfs_item_nr_offset(0),
3610                            push_items * sizeof(struct btrfs_item));
3611
3612         push_space = BTRFS_LEAF_DATA_SIZE(root) -
3613                      btrfs_item_offset_nr(right, push_items - 1);
3614
3615         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3616                      leaf_data_end(root, left) - push_space,
3617                      btrfs_leaf_data(right) +
3618                      btrfs_item_offset_nr(right, push_items - 1),
3619                      push_space);
3620         old_left_nritems = btrfs_header_nritems(left);
3621         BUG_ON(old_left_nritems <= 0);
3622
3623         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3624         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3625                 u32 ioff;
3626
3627                 item = btrfs_item_nr(left, i);
3628
3629                 ioff = btrfs_token_item_offset(left, item, &token);
3630                 btrfs_set_token_item_offset(left, item,
3631                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3632                       &token);
3633         }
3634         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3635
3636         /* fixup right node */
3637         if (push_items > right_nritems)
3638                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3639                        right_nritems);
3640
3641         if (push_items < right_nritems) {
3642                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3643                                                   leaf_data_end(root, right);
3644                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
3645                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
3646                                       btrfs_leaf_data(right) +
3647                                       leaf_data_end(root, right), push_space);
3648
3649                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3650                               btrfs_item_nr_offset(push_items),
3651                              (btrfs_header_nritems(right) - push_items) *
3652                              sizeof(struct btrfs_item));
3653         }
3654         right_nritems -= push_items;
3655         btrfs_set_header_nritems(right, right_nritems);
3656         push_space = BTRFS_LEAF_DATA_SIZE(root);
3657         for (i = 0; i < right_nritems; i++) {
3658                 item = btrfs_item_nr(right, i);
3659
3660                 push_space = push_space - btrfs_token_item_size(right,
3661                                                                 item, &token);
3662                 btrfs_set_token_item_offset(right, item, push_space, &token);
3663         }
3664
3665         btrfs_mark_buffer_dirty(left);
3666         if (right_nritems)
3667                 btrfs_mark_buffer_dirty(right);
3668         else
3669                 clean_tree_block(trans, root, right);
3670
3671         btrfs_item_key(right, &disk_key, 0);
3672         fixup_low_keys(root, path, &disk_key, 1);
3673
3674         /* then fixup the leaf pointer in the path */
3675         if (path->slots[0] < push_items) {
3676                 path->slots[0] += old_left_nritems;
3677                 btrfs_tree_unlock(path->nodes[0]);
3678                 free_extent_buffer(path->nodes[0]);
3679                 path->nodes[0] = left;
3680                 path->slots[1] -= 1;
3681         } else {
3682                 btrfs_tree_unlock(left);
3683                 free_extent_buffer(left);
3684                 path->slots[0] -= push_items;
3685         }
3686         BUG_ON(path->slots[0] < 0);
3687         return ret;
3688 out:
3689         btrfs_tree_unlock(left);
3690         free_extent_buffer(left);
3691         return ret;
3692 }
3693
3694 /*
3695  * push some data in the path leaf to the left, trying to free up at
3696  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3697  *
3698  * max_slot can put a limit on how far into the leaf we'll push items.  The
3699  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3700  * items
3701  */
3702 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3703                           *root, struct btrfs_path *path, int min_data_size,
3704                           int data_size, int empty, u32 max_slot)
3705 {
3706         struct extent_buffer *right = path->nodes[0];
3707         struct extent_buffer *left;
3708         int slot;
3709         int free_space;
3710         u32 right_nritems;
3711         int ret = 0;
3712
3713         slot = path->slots[1];
3714         if (slot == 0)
3715                 return 1;
3716         if (!path->nodes[1])
3717                 return 1;
3718
3719         right_nritems = btrfs_header_nritems(right);
3720         if (right_nritems == 0)
3721                 return 1;
3722
3723         btrfs_assert_tree_locked(path->nodes[1]);
3724
3725         left = read_node_slot(root, path->nodes[1], slot - 1);
3726         if (left == NULL)
3727                 return 1;
3728
3729         btrfs_tree_lock(left);
3730         btrfs_set_lock_blocking(left);
3731
3732         free_space = btrfs_leaf_free_space(root, left);
3733         if (free_space < data_size) {
3734                 ret = 1;
3735                 goto out;
3736         }
3737
3738         /* cow and double check */
3739         ret = btrfs_cow_block(trans, root, left,
3740                               path->nodes[1], slot - 1, &left);
3741         if (ret) {
3742                 /* we hit -ENOSPC, but it isn't fatal here */
3743                 if (ret == -ENOSPC)
3744                         ret = 1;
3745                 goto out;
3746         }
3747
3748         free_space = btrfs_leaf_free_space(root, left);
3749         if (free_space < data_size) {
3750                 ret = 1;
3751                 goto out;
3752         }
3753
3754         return __push_leaf_left(trans, root, path, min_data_size,
3755                                empty, left, free_space, right_nritems,
3756                                max_slot);
3757 out:
3758         btrfs_tree_unlock(left);
3759         free_extent_buffer(left);
3760         return ret;
3761 }
3762
3763 /*
3764  * split the path's leaf in two, making sure there is at least data_size
3765  * available for the resulting leaf level of the path.
3766  */
3767 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3768                                     struct btrfs_root *root,
3769                                     struct btrfs_path *path,
3770                                     struct extent_buffer *l,
3771                                     struct extent_buffer *right,
3772                                     int slot, int mid, int nritems)
3773 {
3774         int data_copy_size;
3775         int rt_data_off;
3776         int i;
3777         struct btrfs_disk_key disk_key;
3778         struct btrfs_map_token token;
3779
3780         btrfs_init_map_token(&token);
3781
3782         nritems = nritems - mid;
3783         btrfs_set_header_nritems(right, nritems);
3784         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
3785
3786         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3787                            btrfs_item_nr_offset(mid),
3788                            nritems * sizeof(struct btrfs_item));
3789
3790         copy_extent_buffer(right, l,
3791                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
3792                      data_copy_size, btrfs_leaf_data(l) +
3793                      leaf_data_end(root, l), data_copy_size);
3794
3795         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
3796                       btrfs_item_end_nr(l, mid);
3797
3798         for (i = 0; i < nritems; i++) {
3799                 struct btrfs_item *item = btrfs_item_nr(right, i);
3800                 u32 ioff;
3801
3802                 ioff = btrfs_token_item_offset(right, item, &token);
3803                 btrfs_set_token_item_offset(right, item,
3804                                             ioff + rt_data_off, &token);
3805         }
3806
3807         btrfs_set_header_nritems(l, mid);
3808         btrfs_item_key(right, &disk_key, 0);
3809         insert_ptr(trans, root, path, &disk_key, right->start,
3810                    path->slots[1] + 1, 1);
3811
3812         btrfs_mark_buffer_dirty(right);
3813         btrfs_mark_buffer_dirty(l);
3814         BUG_ON(path->slots[0] != slot);
3815
3816         if (mid <= slot) {
3817                 btrfs_tree_unlock(path->nodes[0]);
3818                 free_extent_buffer(path->nodes[0]);
3819                 path->nodes[0] = right;
3820                 path->slots[0] -= mid;
3821                 path->slots[1] += 1;
3822         } else {
3823                 btrfs_tree_unlock(right);
3824                 free_extent_buffer(right);
3825         }
3826
3827         BUG_ON(path->slots[0] < 0);
3828 }
3829
3830 /*
3831  * double splits happen when we need to insert a big item in the middle
3832  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3833  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3834  *          A                 B                 C
3835  *
3836  * We avoid this by trying to push the items on either side of our target
3837  * into the adjacent leaves.  If all goes well we can avoid the double split
3838  * completely.
3839  */
3840 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3841                                           struct btrfs_root *root,
3842                                           struct btrfs_path *path,
3843                                           int data_size)
3844 {
3845         int ret;
3846         int progress = 0;
3847         int slot;
3848         u32 nritems;
3849
3850         slot = path->slots[0];
3851
3852         /*
3853          * try to push all the items after our slot into the
3854          * right leaf
3855          */
3856         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3857         if (ret < 0)
3858                 return ret;
3859
3860         if (ret == 0)
3861                 progress++;
3862
3863         nritems = btrfs_header_nritems(path->nodes[0]);
3864         /*
3865          * our goal is to get our slot at the start or end of a leaf.  If
3866          * we've done so we're done
3867          */
3868         if (path->slots[0] == 0 || path->slots[0] == nritems)
3869                 return 0;
3870
3871         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3872                 return 0;
3873
3874         /* try to push all the items before our slot into the next leaf */
3875         slot = path->slots[0];
3876         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3877         if (ret < 0)
3878                 return ret;
3879
3880         if (ret == 0)
3881                 progress++;
3882
3883         if (progress)
3884                 return 0;
3885         return 1;
3886 }
3887
3888 /*
3889  * split the path's leaf in two, making sure there is at least data_size
3890  * available for the resulting leaf level of the path.
3891  *
3892  * returns 0 if all went well and < 0 on failure.
3893  */
3894 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3895                                struct btrfs_root *root,
3896                                struct btrfs_key *ins_key,
3897                                struct btrfs_path *path, int data_size,
3898                                int extend)
3899 {
3900         struct btrfs_disk_key disk_key;
3901         struct extent_buffer *l;
3902         u32 nritems;
3903         int mid;
3904         int slot;
3905         struct extent_buffer *right;
3906         int ret = 0;
3907         int wret;
3908         int split;
3909         int num_doubles = 0;
3910         int tried_avoid_double = 0;
3911
3912         l = path->nodes[0];
3913         slot = path->slots[0];
3914         if (extend && data_size + btrfs_item_size_nr(l, slot) +
3915             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
3916                 return -EOVERFLOW;
3917
3918         /* first try to make some room by pushing left and right */
3919         if (data_size && path->nodes[1]) {
3920                 wret = push_leaf_right(trans, root, path, data_size,
3921                                        data_size, 0, 0);
3922                 if (wret < 0)
3923                         return wret;
3924                 if (wret) {
3925                         wret = push_leaf_left(trans, root, path, data_size,
3926                                               data_size, 0, (u32)-1);
3927                         if (wret < 0)
3928                                 return wret;
3929                 }
3930                 l = path->nodes[0];
3931
3932                 /* did the pushes work? */
3933                 if (btrfs_leaf_free_space(root, l) >= data_size)
3934                         return 0;
3935         }
3936
3937         if (!path->nodes[1]) {
3938                 ret = insert_new_root(trans, root, path, 1);
3939                 if (ret)
3940                         return ret;
3941         }
3942 again:
3943         split = 1;
3944         l = path->nodes[0];
3945         slot = path->slots[0];
3946         nritems = btrfs_header_nritems(l);
3947         mid = (nritems + 1) / 2;
3948
3949         if (mid <= slot) {
3950                 if (nritems == 1 ||
3951                     leaf_space_used(l, mid, nritems - mid) + data_size >
3952                         BTRFS_LEAF_DATA_SIZE(root)) {
3953                         if (slot >= nritems) {
3954                                 split = 0;
3955                         } else {
3956                                 mid = slot;
3957                                 if (mid != nritems &&
3958                                     leaf_space_used(l, mid, nritems - mid) +
3959                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3960                                         if (data_size && !tried_avoid_double)
3961                                                 goto push_for_double;
3962                                         split = 2;
3963                                 }
3964                         }
3965                 }
3966         } else {
3967                 if (leaf_space_used(l, 0, mid) + data_size >
3968                         BTRFS_LEAF_DATA_SIZE(root)) {
3969                         if (!extend && data_size && slot == 0) {
3970                                 split = 0;
3971                         } else if ((extend || !data_size) && slot == 0) {
3972                                 mid = 1;
3973                         } else {
3974                                 mid = slot;
3975                                 if (mid != nritems &&
3976                                     leaf_space_used(l, mid, nritems - mid) +
3977                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3978                                         if (data_size && !tried_avoid_double)
3979                                                 goto push_for_double;
3980                                         split = 2 ;
3981                                 }
3982                         }
3983                 }
3984         }
3985
3986         if (split == 0)
3987                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3988         else
3989                 btrfs_item_key(l, &disk_key, mid);
3990
3991         right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
3992                                         root->root_key.objectid,
3993                                         &disk_key, 0, l->start, 0);
3994         if (IS_ERR(right))
3995                 return PTR_ERR(right);
3996
3997         root_add_used(root, root->leafsize);
3998
3999         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4000         btrfs_set_header_bytenr(right, right->start);
4001         btrfs_set_header_generation(right, trans->transid);
4002         btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4003         btrfs_set_header_owner(right, root->root_key.objectid);
4004         btrfs_set_header_level(right, 0);
4005         write_extent_buffer(right, root->fs_info->fsid,
4006                             (unsigned long)btrfs_header_fsid(right),
4007                             BTRFS_FSID_SIZE);
4008
4009         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4010                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
4011                             BTRFS_UUID_SIZE);
4012
4013         if (split == 0) {
4014                 if (mid <= slot) {
4015                         btrfs_set_header_nritems(right, 0);
4016                         insert_ptr(trans, root, path, &disk_key, right->start,
4017                                    path->slots[1] + 1, 1);
4018                         btrfs_tree_unlock(path->nodes[0]);
4019                         free_extent_buffer(path->nodes[0]);
4020                         path->nodes[0] = right;
4021                         path->slots[0] = 0;
4022                         path->slots[1] += 1;
4023                 } else {
4024                         btrfs_set_header_nritems(right, 0);
4025                         insert_ptr(trans, root, path, &disk_key, right->start,
4026                                           path->slots[1], 1);
4027                         btrfs_tree_unlock(path->nodes[0]);
4028                         free_extent_buffer(path->nodes[0]);
4029                         path->nodes[0] = right;
4030                         path->slots[0] = 0;
4031                         if (path->slots[1] == 0)
4032                                 fixup_low_keys(root, path, &disk_key, 1);
4033                 }
4034                 btrfs_mark_buffer_dirty(right);
4035                 return ret;
4036         }
4037
4038         copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4039
4040         if (split == 2) {
4041                 BUG_ON(num_doubles != 0);
4042                 num_doubles++;
4043                 goto again;
4044         }
4045
4046         return 0;
4047
4048 push_for_double:
4049         push_for_double_split(trans, root, path, data_size);
4050         tried_avoid_double = 1;
4051         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4052                 return 0;
4053         goto again;
4054 }
4055
4056 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4057                                          struct btrfs_root *root,
4058                                          struct btrfs_path *path, int ins_len)
4059 {
4060         struct btrfs_key key;
4061         struct extent_buffer *leaf;
4062         struct btrfs_file_extent_item *fi;
4063         u64 extent_len = 0;
4064         u32 item_size;
4065         int ret;
4066
4067         leaf = path->nodes[0];
4068         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4069
4070         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4071                key.type != BTRFS_EXTENT_CSUM_KEY);
4072
4073         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4074                 return 0;
4075
4076         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4077         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4078                 fi = btrfs_item_ptr(leaf, path->slots[0],
4079                                     struct btrfs_file_extent_item);
4080                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4081         }
4082         btrfs_release_path(path);
4083
4084         path->keep_locks = 1;
4085         path->search_for_split = 1;
4086         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4087         path->search_for_split = 0;
4088         if (ret < 0)
4089                 goto err;
4090
4091         ret = -EAGAIN;
4092         leaf = path->nodes[0];
4093         /* if our item isn't there or got smaller, return now */
4094         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4095                 goto err;
4096
4097         /* the leaf has  changed, it now has room.  return now */
4098         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4099                 goto err;
4100
4101         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4102                 fi = btrfs_item_ptr(leaf, path->slots[0],
4103                                     struct btrfs_file_extent_item);
4104                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4105                         goto err;
4106         }
4107
4108         btrfs_set_path_blocking(path);
4109         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4110         if (ret)
4111                 goto err;
4112
4113         path->keep_locks = 0;
4114         btrfs_unlock_up_safe(path, 1);
4115         return 0;
4116 err:
4117         path->keep_locks = 0;
4118         return ret;
4119 }
4120
4121 static noinline int split_item(struct btrfs_trans_handle *trans,
4122                                struct btrfs_root *root,
4123                                struct btrfs_path *path,
4124                                struct btrfs_key *new_key,
4125                                unsigned long split_offset)
4126 {
4127         struct extent_buffer *leaf;
4128         struct btrfs_item *item;
4129         struct btrfs_item *new_item;
4130         int slot;
4131         char *buf;
4132         u32 nritems;
4133         u32 item_size;
4134         u32 orig_offset;
4135         struct btrfs_disk_key disk_key;
4136
4137         leaf = path->nodes[0];
4138         BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4139
4140         btrfs_set_path_blocking(path);
4141
4142         item = btrfs_item_nr(leaf, path->slots[0]);
4143         orig_offset = btrfs_item_offset(leaf, item);
4144         item_size = btrfs_item_size(leaf, item);
4145
4146         buf = kmalloc(item_size, GFP_NOFS);
4147         if (!buf)
4148                 return -ENOMEM;
4149
4150         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4151                             path->slots[0]), item_size);
4152
4153         slot = path->slots[0] + 1;
4154         nritems = btrfs_header_nritems(leaf);
4155         if (slot != nritems) {
4156                 /* shift the items */
4157                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4158                                 btrfs_item_nr_offset(slot),
4159                                 (nritems - slot) * sizeof(struct btrfs_item));
4160         }
4161
4162         btrfs_cpu_key_to_disk(&disk_key, new_key);
4163         btrfs_set_item_key(leaf, &disk_key, slot);
4164
4165         new_item = btrfs_item_nr(leaf, slot);
4166
4167         btrfs_set_item_offset(leaf, new_item, orig_offset);
4168         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4169
4170         btrfs_set_item_offset(leaf, item,
4171                               orig_offset + item_size - split_offset);
4172         btrfs_set_item_size(leaf, item, split_offset);
4173
4174         btrfs_set_header_nritems(leaf, nritems + 1);
4175
4176         /* write the data for the start of the original item */
4177         write_extent_buffer(leaf, buf,
4178                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4179                             split_offset);
4180
4181         /* write the data for the new item */
4182         write_extent_buffer(leaf, buf + split_offset,
4183                             btrfs_item_ptr_offset(leaf, slot),
4184                             item_size - split_offset);
4185         btrfs_mark_buffer_dirty(leaf);
4186
4187         BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4188         kfree(buf);
4189         return 0;
4190 }
4191
4192 /*
4193  * This function splits a single item into two items,
4194  * giving 'new_key' to the new item and splitting the
4195  * old one at split_offset (from the start of the item).
4196  *
4197  * The path may be released by this operation.  After
4198  * the split, the path is pointing to the old item.  The
4199  * new item is going to be in the same node as the old one.
4200  *
4201  * Note, the item being split must be smaller enough to live alone on
4202  * a tree block with room for one extra struct btrfs_item
4203  *
4204  * This allows us to split the item in place, keeping a lock on the
4205  * leaf the entire time.
4206  */
4207 int btrfs_split_item(struct btrfs_trans_handle *trans,
4208                      struct btrfs_root *root,
4209                      struct btrfs_path *path,
4210                      struct btrfs_key *new_key,
4211                      unsigned long split_offset)
4212 {
4213         int ret;
4214         ret = setup_leaf_for_split(trans, root, path,
4215                                    sizeof(struct btrfs_item));
4216         if (ret)
4217                 return ret;
4218
4219         ret = split_item(trans, root, path, new_key, split_offset);
4220         return ret;
4221 }
4222
4223 /*
4224  * This function duplicate a item, giving 'new_key' to the new item.
4225  * It guarantees both items live in the same tree leaf and the new item
4226  * is contiguous with the original item.
4227  *
4228  * This allows us to split file extent in place, keeping a lock on the
4229  * leaf the entire time.
4230  */
4231 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4232                          struct btrfs_root *root,
4233                          struct btrfs_path *path,
4234                          struct btrfs_key *new_key)
4235 {
4236         struct extent_buffer *leaf;
4237         int ret;
4238         u32 item_size;
4239
4240         leaf = path->nodes[0];
4241         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4242         ret = setup_leaf_for_split(trans, root, path,
4243                                    item_size + sizeof(struct btrfs_item));
4244         if (ret)
4245                 return ret;
4246
4247         path->slots[0]++;
4248         setup_items_for_insert(root, path, new_key, &item_size,
4249                                item_size, item_size +
4250                                sizeof(struct btrfs_item), 1);
4251         leaf = path->nodes[0];
4252         memcpy_extent_buffer(leaf,
4253                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4254                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4255                              item_size);
4256         return 0;
4257 }
4258
4259 /*
4260  * make the item pointed to by the path smaller.  new_size indicates
4261  * how small to make it, and from_end tells us if we just chop bytes
4262  * off the end of the item or if we shift the item to chop bytes off
4263  * the front.
4264  */
4265 void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4266                          u32 new_size, int from_end)
4267 {
4268         int slot;
4269         struct extent_buffer *leaf;
4270         struct btrfs_item *item;
4271         u32 nritems;
4272         unsigned int data_end;
4273         unsigned int old_data_start;
4274         unsigned int old_size;
4275         unsigned int size_diff;
4276         int i;
4277         struct btrfs_map_token token;
4278
4279         btrfs_init_map_token(&token);
4280
4281         leaf = path->nodes[0];
4282         slot = path->slots[0];
4283
4284         old_size = btrfs_item_size_nr(leaf, slot);
4285         if (old_size == new_size)
4286                 return;
4287
4288         nritems = btrfs_header_nritems(leaf);
4289         data_end = leaf_data_end(root, leaf);
4290
4291         old_data_start = btrfs_item_offset_nr(leaf, slot);
4292
4293         size_diff = old_size - new_size;
4294
4295         BUG_ON(slot < 0);
4296         BUG_ON(slot >= nritems);
4297
4298         /*
4299          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4300          */
4301         /* first correct the data pointers */
4302         for (i = slot; i < nritems; i++) {
4303                 u32 ioff;
4304                 item = btrfs_item_nr(leaf, i);
4305
4306                 ioff = btrfs_token_item_offset(leaf, item, &token);
4307                 btrfs_set_token_item_offset(leaf, item,
4308                                             ioff + size_diff, &token);
4309         }
4310
4311         /* shift the data */
4312         if (from_end) {
4313                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4314                               data_end + size_diff, btrfs_leaf_data(leaf) +
4315                               data_end, old_data_start + new_size - data_end);
4316         } else {
4317                 struct btrfs_disk_key disk_key;
4318                 u64 offset;
4319
4320                 btrfs_item_key(leaf, &disk_key, slot);
4321
4322                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4323                         unsigned long ptr;
4324                         struct btrfs_file_extent_item *fi;
4325
4326                         fi = btrfs_item_ptr(leaf, slot,
4327                                             struct btrfs_file_extent_item);
4328                         fi = (struct btrfs_file_extent_item *)(
4329                              (unsigned long)fi - size_diff);
4330
4331                         if (btrfs_file_extent_type(leaf, fi) ==
4332                             BTRFS_FILE_EXTENT_INLINE) {
4333                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4334                                 memmove_extent_buffer(leaf, ptr,
4335                                       (unsigned long)fi,
4336                                       offsetof(struct btrfs_file_extent_item,
4337                                                  disk_bytenr));
4338                         }
4339                 }
4340
4341                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4342                               data_end + size_diff, btrfs_leaf_data(leaf) +
4343                               data_end, old_data_start - data_end);
4344
4345                 offset = btrfs_disk_key_offset(&disk_key);
4346                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4347                 btrfs_set_item_key(leaf, &disk_key, slot);
4348                 if (slot == 0)
4349                         fixup_low_keys(root, path, &disk_key, 1);
4350         }
4351
4352         item = btrfs_item_nr(leaf, slot);
4353         btrfs_set_item_size(leaf, item, new_size);
4354         btrfs_mark_buffer_dirty(leaf);
4355
4356         if (btrfs_leaf_free_space(root, leaf) < 0) {
4357                 btrfs_print_leaf(root, leaf);
4358                 BUG();
4359         }
4360 }
4361
4362 /*
4363  * make the item pointed to by the path bigger, data_size is the added size.
4364  */
4365 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4366                        u32 data_size)
4367 {
4368         int slot;
4369         struct extent_buffer *leaf;
4370         struct btrfs_item *item;
4371         u32 nritems;
4372         unsigned int data_end;
4373         unsigned int old_data;
4374         unsigned int old_size;
4375         int i;
4376         struct btrfs_map_token token;
4377
4378         btrfs_init_map_token(&token);
4379
4380         leaf = path->nodes[0];
4381
4382         nritems = btrfs_header_nritems(leaf);
4383         data_end = leaf_data_end(root, leaf);
4384
4385         if (btrfs_leaf_free_space(root, leaf) < data_size) {
4386                 btrfs_print_leaf(root, leaf);
4387                 BUG();
4388         }
4389         slot = path->slots[0];
4390         old_data = btrfs_item_end_nr(leaf, slot);
4391
4392         BUG_ON(slot < 0);
4393         if (slot >= nritems) {
4394                 btrfs_print_leaf(root, leaf);
4395                 printk(KERN_CRIT "slot %d too large, nritems %d\n",
4396                        slot, nritems);
4397                 BUG_ON(1);
4398         }
4399
4400         /*
4401          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4402          */
4403         /* first correct the data pointers */
4404         for (i = slot; i < nritems; i++) {
4405                 u32 ioff;
4406                 item = btrfs_item_nr(leaf, i);
4407
4408                 ioff = btrfs_token_item_offset(leaf, item, &token);
4409                 btrfs_set_token_item_offset(leaf, item,
4410                                             ioff - data_size, &token);
4411         }
4412
4413         /* shift the data */
4414         memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4415                       data_end - data_size, btrfs_leaf_data(leaf) +
4416                       data_end, old_data - data_end);
4417
4418         data_end = old_data;
4419         old_size = btrfs_item_size_nr(leaf, slot);
4420         item = btrfs_item_nr(leaf, slot);
4421         btrfs_set_item_size(leaf, item, old_size + data_size);
4422         btrfs_mark_buffer_dirty(leaf);
4423
4424         if (btrfs_leaf_free_space(root, leaf) < 0) {
4425                 btrfs_print_leaf(root, leaf);
4426                 BUG();
4427         }
4428 }
4429
4430 /*
4431  * this is a helper for btrfs_insert_empty_items, the main goal here is
4432  * to save stack depth by doing the bulk of the work in a function
4433  * that doesn't call btrfs_search_slot
4434  */
4435 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4436                             struct btrfs_key *cpu_key, u32 *data_size,
4437                             u32 total_data, u32 total_size, int nr)
4438 {
4439         struct btrfs_item *item;
4440         int i;
4441         u32 nritems;
4442         unsigned int data_end;
4443         struct btrfs_disk_key disk_key;
4444         struct extent_buffer *leaf;
4445         int slot;
4446         struct btrfs_map_token token;
4447
4448         btrfs_init_map_token(&token);
4449
4450         leaf = path->nodes[0];
4451         slot = path->slots[0];
4452
4453         nritems = btrfs_header_nritems(leaf);
4454         data_end = leaf_data_end(root, leaf);
4455
4456         if (btrfs_leaf_free_space(root, leaf) < total_size) {
4457                 btrfs_print_leaf(root, leaf);
4458                 printk(KERN_CRIT "not enough freespace need %u have %d\n",
4459                        total_size, btrfs_leaf_free_space(root, leaf));
4460                 BUG();
4461         }
4462
4463         if (slot != nritems) {
4464                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4465
4466                 if (old_data < data_end) {
4467                         btrfs_print_leaf(root, leaf);
4468                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4469                                slot, old_data, data_end);
4470                         BUG_ON(1);
4471                 }
4472                 /*
4473                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4474                  */
4475                 /* first correct the data pointers */
4476                 for (i = slot; i < nritems; i++) {
4477                         u32 ioff;
4478
4479                         item = btrfs_item_nr(leaf, i);
4480                         ioff = btrfs_token_item_offset(leaf, item, &token);
4481                         btrfs_set_token_item_offset(leaf, item,
4482                                                     ioff - total_data, &token);
4483                 }
4484                 /* shift the items */
4485                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4486                               btrfs_item_nr_offset(slot),
4487                               (nritems - slot) * sizeof(struct btrfs_item));
4488
4489                 /* shift the data */
4490                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4491                               data_end - total_data, btrfs_leaf_data(leaf) +
4492                               data_end, old_data - data_end);
4493                 data_end = old_data;
4494         }
4495
4496         /* setup the item for the new data */
4497         for (i = 0; i < nr; i++) {
4498                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4499                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4500                 item = btrfs_item_nr(leaf, slot + i);
4501                 btrfs_set_token_item_offset(leaf, item,
4502                                             data_end - data_size[i], &token);
4503                 data_end -= data_size[i];
4504                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4505         }
4506
4507         btrfs_set_header_nritems(leaf, nritems + nr);
4508
4509         if (slot == 0) {
4510                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4511                 fixup_low_keys(root, path, &disk_key, 1);
4512         }
4513         btrfs_unlock_up_safe(path, 1);
4514         btrfs_mark_buffer_dirty(leaf);
4515
4516         if (btrfs_leaf_free_space(root, leaf) < 0) {
4517                 btrfs_print_leaf(root, leaf);
4518                 BUG();
4519         }
4520 }
4521
4522 /*
4523  * Given a key and some data, insert items into the tree.
4524  * This does all the path init required, making room in the tree if needed.
4525  */
4526 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4527                             struct btrfs_root *root,
4528                             struct btrfs_path *path,
4529                             struct btrfs_key *cpu_key, u32 *data_size,
4530                             int nr)
4531 {
4532         int ret = 0;
4533         int slot;
4534         int i;
4535         u32 total_size = 0;
4536         u32 total_data = 0;
4537
4538         for (i = 0; i < nr; i++)
4539                 total_data += data_size[i];
4540
4541         total_size = total_data + (nr * sizeof(struct btrfs_item));
4542         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4543         if (ret == 0)
4544                 return -EEXIST;
4545         if (ret < 0)
4546                 return ret;
4547
4548         slot = path->slots[0];
4549         BUG_ON(slot < 0);
4550
4551         setup_items_for_insert(root, path, cpu_key, data_size,
4552                                total_data, total_size, nr);
4553         return 0;
4554 }
4555
4556 /*
4557  * Given a key and some data, insert an item into the tree.
4558  * This does all the path init required, making room in the tree if needed.
4559  */
4560 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4561                       *root, struct btrfs_key *cpu_key, void *data, u32
4562                       data_size)
4563 {
4564         int ret = 0;
4565         struct btrfs_path *path;
4566         struct extent_buffer *leaf;
4567         unsigned long ptr;
4568
4569         path = btrfs_alloc_path();
4570         if (!path)
4571                 return -ENOMEM;
4572         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4573         if (!ret) {
4574                 leaf = path->nodes[0];
4575                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4576                 write_extent_buffer(leaf, data, ptr, data_size);
4577                 btrfs_mark_buffer_dirty(leaf);
4578         }
4579         btrfs_free_path(path);
4580         return ret;
4581 }
4582
4583 /*
4584  * delete the pointer from a given node.
4585  *
4586  * the tree should have been previously balanced so the deletion does not
4587  * empty a node.
4588  */
4589 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4590                     int level, int slot)
4591 {
4592         struct extent_buffer *parent = path->nodes[level];
4593         u32 nritems;
4594         int ret;
4595
4596         nritems = btrfs_header_nritems(parent);
4597         if (slot != nritems - 1) {
4598                 if (level)
4599                         tree_mod_log_eb_move(root->fs_info, parent, slot,
4600                                              slot + 1, nritems - slot - 1);
4601                 memmove_extent_buffer(parent,
4602                               btrfs_node_key_ptr_offset(slot),
4603                               btrfs_node_key_ptr_offset(slot + 1),
4604                               sizeof(struct btrfs_key_ptr) *
4605                               (nritems - slot - 1));
4606         } else if (level) {
4607                 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4608                                               MOD_LOG_KEY_REMOVE, GFP_NOFS);
4609                 BUG_ON(ret < 0);
4610         }
4611
4612         nritems--;
4613         btrfs_set_header_nritems(parent, nritems);
4614         if (nritems == 0 && parent == root->node) {
4615                 BUG_ON(btrfs_header_level(root->node) != 1);
4616                 /* just turn the root into a leaf and break */
4617                 btrfs_set_header_level(root->node, 0);
4618         } else if (slot == 0) {
4619                 struct btrfs_disk_key disk_key;
4620
4621                 btrfs_node_key(parent, &disk_key, 0);
4622                 fixup_low_keys(root, path, &disk_key, level + 1);
4623         }
4624         btrfs_mark_buffer_dirty(parent);
4625 }
4626
4627 /*
4628  * a helper function to delete the leaf pointed to by path->slots[1] and
4629  * path->nodes[1].
4630  *
4631  * This deletes the pointer in path->nodes[1] and frees the leaf
4632  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4633  *
4634  * The path must have already been setup for deleting the leaf, including
4635  * all the proper balancing.  path->nodes[1] must be locked.
4636  */
4637 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4638                                     struct btrfs_root *root,
4639                                     struct btrfs_path *path,
4640                                     struct extent_buffer *leaf)
4641 {
4642         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4643         del_ptr(root, path, 1, path->slots[1]);
4644
4645         /*
4646          * btrfs_free_extent is expensive, we want to make sure we
4647          * aren't holding any locks when we call it
4648          */
4649         btrfs_unlock_up_safe(path, 0);
4650
4651         root_sub_used(root, leaf->len);
4652
4653         extent_buffer_get(leaf);
4654         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4655         free_extent_buffer_stale(leaf);
4656 }
4657 /*
4658  * delete the item at the leaf level in path.  If that empties
4659  * the leaf, remove it from the tree
4660  */
4661 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4662                     struct btrfs_path *path, int slot, int nr)
4663 {
4664         struct extent_buffer *leaf;
4665         struct btrfs_item *item;
4666         int last_off;
4667         int dsize = 0;
4668         int ret = 0;
4669         int wret;
4670         int i;
4671         u32 nritems;
4672         struct btrfs_map_token token;
4673
4674         btrfs_init_map_token(&token);
4675
4676         leaf = path->nodes[0];
4677         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4678
4679         for (i = 0; i < nr; i++)
4680                 dsize += btrfs_item_size_nr(leaf, slot + i);
4681
4682         nritems = btrfs_header_nritems(leaf);
4683
4684         if (slot + nr != nritems) {
4685                 int data_end = leaf_data_end(root, leaf);
4686
4687                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4688                               data_end + dsize,
4689                               btrfs_leaf_data(leaf) + data_end,
4690                               last_off - data_end);
4691
4692                 for (i = slot + nr; i < nritems; i++) {
4693                         u32 ioff;
4694
4695                         item = btrfs_item_nr(leaf, i);
4696                         ioff = btrfs_token_item_offset(leaf, item, &token);
4697                         btrfs_set_token_item_offset(leaf, item,
4698                                                     ioff + dsize, &token);
4699                 }
4700
4701                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4702                               btrfs_item_nr_offset(slot + nr),
4703                               sizeof(struct btrfs_item) *
4704                               (nritems - slot - nr));
4705         }
4706         btrfs_set_header_nritems(leaf, nritems - nr);
4707         nritems -= nr;
4708
4709         /* delete the leaf if we've emptied it */
4710         if (nritems == 0) {
4711                 if (leaf == root->node) {
4712                         btrfs_set_header_level(leaf, 0);
4713                 } else {
4714                         btrfs_set_path_blocking(path);
4715                         clean_tree_block(trans, root, leaf);
4716                         btrfs_del_leaf(trans, root, path, leaf);
4717                 }
4718         } else {
4719                 int used = leaf_space_used(leaf, 0, nritems);
4720                 if (slot == 0) {
4721                         struct btrfs_disk_key disk_key;
4722
4723                         btrfs_item_key(leaf, &disk_key, 0);
4724                         fixup_low_keys(root, path, &disk_key, 1);
4725                 }
4726
4727                 /* delete the leaf if it is mostly empty */
4728                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4729                         /* push_leaf_left fixes the path.
4730                          * make sure the path still points to our leaf
4731                          * for possible call to del_ptr below
4732                          */
4733                         slot = path->slots[1];
4734                         extent_buffer_get(leaf);
4735
4736                         btrfs_set_path_blocking(path);
4737                         wret = push_leaf_left(trans, root, path, 1, 1,
4738                                               1, (u32)-1);
4739                         if (wret < 0 && wret != -ENOSPC)
4740                                 ret = wret;
4741
4742                         if (path->nodes[0] == leaf &&
4743                             btrfs_header_nritems(leaf)) {
4744                                 wret = push_leaf_right(trans, root, path, 1,
4745                                                        1, 1, 0);
4746                                 if (wret < 0 && wret != -ENOSPC)
4747                                         ret = wret;
4748                         }
4749
4750                         if (btrfs_header_nritems(leaf) == 0) {
4751                                 path->slots[1] = slot;
4752                                 btrfs_del_leaf(trans, root, path, leaf);
4753                                 free_extent_buffer(leaf);
4754                                 ret = 0;
4755                         } else {
4756                                 /* if we're still in the path, make sure
4757                                  * we're dirty.  Otherwise, one of the
4758                                  * push_leaf functions must have already
4759                                  * dirtied this buffer
4760                                  */
4761                                 if (path->nodes[0] == leaf)
4762                                         btrfs_mark_buffer_dirty(leaf);
4763                                 free_extent_buffer(leaf);
4764                         }
4765                 } else {
4766                         btrfs_mark_buffer_dirty(leaf);
4767                 }
4768         }
4769         return ret;
4770 }
4771
4772 /*
4773  * search the tree again to find a leaf with lesser keys
4774  * returns 0 if it found something or 1 if there are no lesser leaves.
4775  * returns < 0 on io errors.
4776  *
4777  * This may release the path, and so you may lose any locks held at the
4778  * time you call it.
4779  */
4780 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4781 {
4782         struct btrfs_key key;
4783         struct btrfs_disk_key found_key;
4784         int ret;
4785
4786         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4787
4788         if (key.offset > 0)
4789                 key.offset--;
4790         else if (key.type > 0)
4791                 key.type--;
4792         else if (key.objectid > 0)
4793                 key.objectid--;
4794         else
4795                 return 1;
4796
4797         btrfs_release_path(path);
4798         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4799         if (ret < 0)
4800                 return ret;
4801         btrfs_item_key(path->nodes[0], &found_key, 0);
4802         ret = comp_keys(&found_key, &key);
4803         if (ret < 0)
4804                 return 0;
4805         return 1;
4806 }
4807
4808 /*
4809  * A helper function to walk down the tree starting at min_key, and looking
4810  * for nodes or leaves that are have a minimum transaction id.
4811  * This is used by the btree defrag code, and tree logging
4812  *
4813  * This does not cow, but it does stuff the starting key it finds back
4814  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4815  * key and get a writable path.
4816  *
4817  * This does lock as it descends, and path->keep_locks should be set
4818  * to 1 by the caller.
4819  *
4820  * This honors path->lowest_level to prevent descent past a given level
4821  * of the tree.
4822  *
4823  * min_trans indicates the oldest transaction that you are interested
4824  * in walking through.  Any nodes or leaves older than min_trans are
4825  * skipped over (without reading them).
4826  *
4827  * returns zero if something useful was found, < 0 on error and 1 if there
4828  * was nothing in the tree that matched the search criteria.
4829  */
4830 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4831                          struct btrfs_key *max_key,
4832                          struct btrfs_path *path,
4833                          u64 min_trans)
4834 {
4835         struct extent_buffer *cur;
4836         struct btrfs_key found_key;
4837         int slot;
4838         int sret;
4839         u32 nritems;
4840         int level;
4841         int ret = 1;
4842
4843         WARN_ON(!path->keep_locks);
4844 again:
4845         cur = btrfs_read_lock_root_node(root);
4846         level = btrfs_header_level(cur);
4847         WARN_ON(path->nodes[level]);
4848         path->nodes[level] = cur;
4849         path->locks[level] = BTRFS_READ_LOCK;
4850
4851         if (btrfs_header_generation(cur) < min_trans) {
4852                 ret = 1;
4853                 goto out;
4854         }
4855         while (1) {
4856                 nritems = btrfs_header_nritems(cur);
4857                 level = btrfs_header_level(cur);
4858                 sret = bin_search(cur, min_key, level, &slot);
4859
4860                 /* at the lowest level, we're done, setup the path and exit */
4861                 if (level == path->lowest_level) {
4862                         if (slot >= nritems)
4863                                 goto find_next_key;
4864                         ret = 0;
4865                         path->slots[level] = slot;
4866                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4867                         goto out;
4868                 }
4869                 if (sret && slot > 0)
4870                         slot--;
4871                 /*
4872                  * check this node pointer against the min_trans parameters.
4873                  * If it is too old, old, skip to the next one.
4874                  */
4875                 while (slot < nritems) {
4876                         u64 blockptr;
4877                         u64 gen;
4878
4879                         blockptr = btrfs_node_blockptr(cur, slot);
4880                         gen = btrfs_node_ptr_generation(cur, slot);
4881                         if (gen < min_trans) {
4882                                 slot++;
4883                                 continue;
4884                         }
4885                         break;
4886                 }
4887 find_next_key:
4888                 /*
4889                  * we didn't find a candidate key in this node, walk forward
4890                  * and find another one
4891                  */
4892                 if (slot >= nritems) {
4893                         path->slots[level] = slot;
4894                         btrfs_set_path_blocking(path);
4895                         sret = btrfs_find_next_key(root, path, min_key, level,
4896                                                   min_trans);
4897                         if (sret == 0) {
4898                                 btrfs_release_path(path);
4899                                 goto again;
4900                         } else {
4901                                 goto out;
4902                         }
4903                 }
4904                 /* save our key for returning back */
4905                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4906                 path->slots[level] = slot;
4907                 if (level == path->lowest_level) {
4908                         ret = 0;
4909                         unlock_up(path, level, 1, 0, NULL);
4910                         goto out;
4911                 }
4912                 btrfs_set_path_blocking(path);
4913                 cur = read_node_slot(root, cur, slot);
4914                 BUG_ON(!cur); /* -ENOMEM */
4915
4916                 btrfs_tree_read_lock(cur);
4917
4918                 path->locks[level - 1] = BTRFS_READ_LOCK;
4919                 path->nodes[level - 1] = cur;
4920                 unlock_up(path, level, 1, 0, NULL);
4921                 btrfs_clear_path_blocking(path, NULL, 0);
4922         }
4923 out:
4924         if (ret == 0)
4925                 memcpy(min_key, &found_key, sizeof(found_key));
4926         btrfs_set_path_blocking(path);
4927         return ret;
4928 }
4929
4930 static void tree_move_down(struct btrfs_root *root,
4931                            struct btrfs_path *path,
4932                            int *level, int root_level)
4933 {
4934         BUG_ON(*level == 0);
4935         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
4936                                         path->slots[*level]);
4937         path->slots[*level - 1] = 0;
4938         (*level)--;
4939 }
4940
4941 static int tree_move_next_or_upnext(struct btrfs_root *root,
4942                                     struct btrfs_path *path,
4943                                     int *level, int root_level)
4944 {
4945         int ret = 0;
4946         int nritems;
4947         nritems = btrfs_header_nritems(path->nodes[*level]);
4948
4949         path->slots[*level]++;
4950
4951         while (path->slots[*level] >= nritems) {
4952                 if (*level == root_level)
4953                         return -1;
4954
4955                 /* move upnext */
4956                 path->slots[*level] = 0;
4957                 free_extent_buffer(path->nodes[*level]);
4958                 path->nodes[*level] = NULL;
4959                 (*level)++;
4960                 path->slots[*level]++;
4961
4962                 nritems = btrfs_header_nritems(path->nodes[*level]);
4963                 ret = 1;
4964         }
4965         return ret;
4966 }
4967
4968 /*
4969  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
4970  * or down.
4971  */
4972 static int tree_advance(struct btrfs_root *root,
4973                         struct btrfs_path *path,
4974                         int *level, int root_level,
4975                         int allow_down,
4976                         struct btrfs_key *key)
4977 {
4978         int ret;
4979
4980         if (*level == 0 || !allow_down) {
4981                 ret = tree_move_next_or_upnext(root, path, level, root_level);
4982         } else {
4983                 tree_move_down(root, path, level, root_level);
4984                 ret = 0;
4985         }
4986         if (ret >= 0) {
4987                 if (*level == 0)
4988                         btrfs_item_key_to_cpu(path->nodes[*level], key,
4989                                         path->slots[*level]);
4990                 else
4991                         btrfs_node_key_to_cpu(path->nodes[*level], key,
4992                                         path->slots[*level]);
4993         }
4994         return ret;
4995 }
4996
4997 static int tree_compare_item(struct btrfs_root *left_root,
4998                              struct btrfs_path *left_path,
4999                              struct btrfs_path *right_path,
5000                              char *tmp_buf)
5001 {
5002         int cmp;
5003         int len1, len2;
5004         unsigned long off1, off2;
5005
5006         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5007         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5008         if (len1 != len2)
5009                 return 1;
5010
5011         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5012         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5013                                 right_path->slots[0]);
5014
5015         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5016
5017         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5018         if (cmp)
5019                 return 1;
5020         return 0;
5021 }
5022
5023 #define ADVANCE 1
5024 #define ADVANCE_ONLY_NEXT -1
5025
5026 /*
5027  * This function compares two trees and calls the provided callback for
5028  * every changed/new/deleted item it finds.
5029  * If shared tree blocks are encountered, whole subtrees are skipped, making
5030  * the compare pretty fast on snapshotted subvolumes.
5031  *
5032  * This currently works on commit roots only. As commit roots are read only,
5033  * we don't do any locking. The commit roots are protected with transactions.
5034  * Transactions are ended and rejoined when a commit is tried in between.
5035  *
5036  * This function checks for modifications done to the trees while comparing.
5037  * If it detects a change, it aborts immediately.
5038  */
5039 int btrfs_compare_trees(struct btrfs_root *left_root,
5040                         struct btrfs_root *right_root,
5041                         btrfs_changed_cb_t changed_cb, void *ctx)
5042 {
5043         int ret;
5044         int cmp;
5045         struct btrfs_trans_handle *trans = NULL;
5046         struct btrfs_path *left_path = NULL;
5047         struct btrfs_path *right_path = NULL;
5048         struct btrfs_key left_key;
5049         struct btrfs_key right_key;
5050         char *tmp_buf = NULL;
5051         int left_root_level;
5052         int right_root_level;
5053         int left_level;
5054         int right_level;
5055         int left_end_reached;
5056         int right_end_reached;
5057         int advance_left;
5058         int advance_right;
5059         u64 left_blockptr;
5060         u64 right_blockptr;
5061         u64 left_start_ctransid;
5062         u64 right_start_ctransid;
5063         u64 ctransid;
5064
5065         left_path = btrfs_alloc_path();
5066         if (!left_path) {
5067                 ret = -ENOMEM;
5068                 goto out;
5069         }
5070         right_path = btrfs_alloc_path();
5071         if (!right_path) {
5072                 ret = -ENOMEM;
5073                 goto out;
5074         }
5075
5076         tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5077         if (!tmp_buf) {
5078                 ret = -ENOMEM;
5079                 goto out;
5080         }
5081
5082         left_path->search_commit_root = 1;
5083         left_path->skip_locking = 1;
5084         right_path->search_commit_root = 1;
5085         right_path->skip_locking = 1;
5086
5087         spin_lock(&left_root->root_item_lock);
5088         left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5089         spin_unlock(&left_root->root_item_lock);
5090
5091         spin_lock(&right_root->root_item_lock);
5092         right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5093         spin_unlock(&right_root->root_item_lock);
5094
5095         trans = btrfs_join_transaction(left_root);
5096         if (IS_ERR(trans)) {
5097                 ret = PTR_ERR(trans);
5098                 trans = NULL;
5099                 goto out;
5100         }
5101
5102         /*
5103          * Strategy: Go to the first items of both trees. Then do
5104          *
5105          * If both trees are at level 0
5106          *   Compare keys of current items
5107          *     If left < right treat left item as new, advance left tree
5108          *       and repeat
5109          *     If left > right treat right item as deleted, advance right tree
5110          *       and repeat
5111          *     If left == right do deep compare of items, treat as changed if
5112          *       needed, advance both trees and repeat
5113          * If both trees are at the same level but not at level 0
5114          *   Compare keys of current nodes/leafs
5115          *     If left < right advance left tree and repeat
5116          *     If left > right advance right tree and repeat
5117          *     If left == right compare blockptrs of the next nodes/leafs
5118          *       If they match advance both trees but stay at the same level
5119          *         and repeat
5120          *       If they don't match advance both trees while allowing to go
5121          *         deeper and repeat
5122          * If tree levels are different
5123          *   Advance the tree that needs it and repeat
5124          *
5125          * Advancing a tree means:
5126          *   If we are at level 0, try to go to the next slot. If that's not
5127          *   possible, go one level up and repeat. Stop when we found a level
5128          *   where we could go to the next slot. We may at this point be on a
5129          *   node or a leaf.
5130          *
5131          *   If we are not at level 0 and not on shared tree blocks, go one
5132          *   level deeper.
5133          *
5134          *   If we are not at level 0 and on shared tree blocks, go one slot to
5135          *   the right if possible or go up and right.
5136          */
5137
5138         left_level = btrfs_header_level(left_root->commit_root);
5139         left_root_level = left_level;
5140         left_path->nodes[left_level] = left_root->commit_root;
5141         extent_buffer_get(left_path->nodes[left_level]);
5142
5143         right_level = btrfs_header_level(right_root->commit_root);
5144         right_root_level = right_level;
5145         right_path->nodes[right_level] = right_root->commit_root;
5146         extent_buffer_get(right_path->nodes[right_level]);
5147
5148         if (left_level == 0)
5149                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5150                                 &left_key, left_path->slots[left_level]);
5151         else
5152                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5153                                 &left_key, left_path->slots[left_level]);
5154         if (right_level == 0)
5155                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5156                                 &right_key, right_path->slots[right_level]);
5157         else
5158                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5159                                 &right_key, right_path->slots[right_level]);
5160
5161         left_end_reached = right_end_reached = 0;
5162         advance_left = advance_right = 0;
5163
5164         while (1) {
5165                 /*
5166                  * We need to make sure the transaction does not get committed
5167                  * while we do anything on commit roots. This means, we need to
5168                  * join and leave transactions for every item that we process.
5169                  */
5170                 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5171                         btrfs_release_path(left_path);
5172                         btrfs_release_path(right_path);
5173
5174                         ret = btrfs_end_transaction(trans, left_root);
5175                         trans = NULL;
5176                         if (ret < 0)
5177                                 goto out;
5178                 }
5179                 /* now rejoin the transaction */
5180                 if (!trans) {
5181                         trans = btrfs_join_transaction(left_root);
5182                         if (IS_ERR(trans)) {
5183                                 ret = PTR_ERR(trans);
5184                                 trans = NULL;
5185                                 goto out;
5186                         }
5187
5188                         spin_lock(&left_root->root_item_lock);
5189                         ctransid = btrfs_root_ctransid(&left_root->root_item);
5190                         spin_unlock(&left_root->root_item_lock);
5191                         if (ctransid != left_start_ctransid)
5192                                 left_start_ctransid = 0;
5193
5194                         spin_lock(&right_root->root_item_lock);
5195                         ctransid = btrfs_root_ctransid(&right_root->root_item);
5196                         spin_unlock(&right_root->root_item_lock);
5197                         if (ctransid != right_start_ctransid)
5198                                 right_start_ctransid = 0;
5199
5200                         if (!left_start_ctransid || !right_start_ctransid) {
5201                                 WARN(1, KERN_WARNING
5202                                         "btrfs: btrfs_compare_tree detected "
5203                                         "a change in one of the trees while "
5204                                         "iterating. This is probably a "
5205                                         "bug.\n");
5206                                 ret = -EIO;
5207                                 goto out;
5208                         }
5209
5210                         /*
5211                          * the commit root may have changed, so start again
5212                          * where we stopped
5213                          */
5214                         left_path->lowest_level = left_level;
5215                         right_path->lowest_level = right_level;
5216                         ret = btrfs_search_slot(NULL, left_root,
5217                                         &left_key, left_path, 0, 0);
5218                         if (ret < 0)
5219                                 goto out;
5220                         ret = btrfs_search_slot(NULL, right_root,
5221                                         &right_key, right_path, 0, 0);
5222                         if (ret < 0)
5223                                 goto out;
5224                 }
5225
5226                 if (advance_left && !left_end_reached) {
5227                         ret = tree_advance(left_root, left_path, &left_level,
5228                                         left_root_level,
5229                                         advance_left != ADVANCE_ONLY_NEXT,
5230                                         &left_key);
5231                         if (ret < 0)
5232                                 left_end_reached = ADVANCE;
5233                         advance_left = 0;
5234                 }
5235                 if (advance_right && !right_end_reached) {
5236                         ret = tree_advance(right_root, right_path, &right_level,
5237                                         right_root_level,
5238                                         advance_right != ADVANCE_ONLY_NEXT,
5239                                         &right_key);
5240                         if (ret < 0)
5241                                 right_end_reached = ADVANCE;
5242                         advance_right = 0;
5243                 }
5244
5245                 if (left_end_reached && right_end_reached) {
5246                         ret = 0;
5247                         goto out;
5248                 } else if (left_end_reached) {
5249                         if (right_level == 0) {
5250                                 ret = changed_cb(left_root, right_root,
5251                                                 left_path, right_path,
5252                                                 &right_key,
5253                                                 BTRFS_COMPARE_TREE_DELETED,
5254                                                 ctx);
5255                                 if (ret < 0)
5256                                         goto out;
5257                         }
5258                         advance_right = ADVANCE;
5259                         continue;
5260                 } else if (right_end_reached) {
5261                         if (left_level == 0) {
5262                                 ret = changed_cb(left_root, right_root,
5263                                                 left_path, right_path,
5264                                                 &left_key,
5265                                                 BTRFS_COMPARE_TREE_NEW,
5266                                                 ctx);
5267                                 if (ret < 0)
5268                                         goto out;
5269                         }
5270                         advance_left = ADVANCE;
5271                         continue;
5272                 }
5273
5274                 if (left_level == 0 && right_level == 0) {
5275                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5276                         if (cmp < 0) {
5277                                 ret = changed_cb(left_root, right_root,
5278                                                 left_path, right_path,
5279                                                 &left_key,
5280                                                 BTRFS_COMPARE_TREE_NEW,
5281                                                 ctx);
5282                                 if (ret < 0)
5283                                         goto out;
5284                                 advance_left = ADVANCE;
5285                         } else if (cmp > 0) {
5286                                 ret = changed_cb(left_root, right_root,
5287                                                 left_path, right_path,
5288                                                 &right_key,
5289                                                 BTRFS_COMPARE_TREE_DELETED,
5290                                                 ctx);
5291                                 if (ret < 0)
5292                                         goto out;
5293                                 advance_right = ADVANCE;
5294                         } else {
5295                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5296                                 ret = tree_compare_item(left_root, left_path,
5297                                                 right_path, tmp_buf);
5298                                 if (ret) {
5299                                         WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5300                                         ret = changed_cb(left_root, right_root,
5301                                                 left_path, right_path,
5302                                                 &left_key,
5303                                                 BTRFS_COMPARE_TREE_CHANGED,
5304                                                 ctx);
5305                                         if (ret < 0)
5306                                                 goto out;
5307                                 }
5308                                 advance_left = ADVANCE;
5309                                 advance_right = ADVANCE;
5310                         }
5311                 } else if (left_level == right_level) {
5312                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5313                         if (cmp < 0) {
5314                                 advance_left = ADVANCE;
5315                         } else if (cmp > 0) {
5316                                 advance_right = ADVANCE;
5317                         } else {
5318                                 left_blockptr = btrfs_node_blockptr(
5319                                                 left_path->nodes[left_level],
5320                                                 left_path->slots[left_level]);
5321                                 right_blockptr = btrfs_node_blockptr(
5322                                                 right_path->nodes[right_level],
5323                                                 right_path->slots[right_level]);
5324                                 if (left_blockptr == right_blockptr) {
5325                                         /*
5326                                          * As we're on a shared block, don't
5327                                          * allow to go deeper.
5328                                          */
5329                                         advance_left = ADVANCE_ONLY_NEXT;
5330                                         advance_right = ADVANCE_ONLY_NEXT;
5331                                 } else {
5332                                         advance_left = ADVANCE;
5333                                         advance_right = ADVANCE;
5334                                 }
5335                         }
5336                 } else if (left_level < right_level) {
5337                         advance_right = ADVANCE;
5338                 } else {
5339                         advance_left = ADVANCE;
5340                 }
5341         }
5342
5343 out:
5344         btrfs_free_path(left_path);
5345         btrfs_free_path(right_path);
5346         kfree(tmp_buf);
5347
5348         if (trans) {
5349                 if (!ret)
5350                         ret = btrfs_end_transaction(trans, left_root);
5351                 else
5352                         btrfs_end_transaction(trans, left_root);
5353         }
5354
5355         return ret;
5356 }
5357
5358 /*
5359  * this is similar to btrfs_next_leaf, but does not try to preserve
5360  * and fixup the path.  It looks for and returns the next key in the
5361  * tree based on the current path and the min_trans parameters.
5362  *
5363  * 0 is returned if another key is found, < 0 if there are any errors
5364  * and 1 is returned if there are no higher keys in the tree
5365  *
5366  * path->keep_locks should be set to 1 on the search made before
5367  * calling this function.
5368  */
5369 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5370                         struct btrfs_key *key, int level, u64 min_trans)
5371 {
5372         int slot;
5373         struct extent_buffer *c;
5374
5375         WARN_ON(!path->keep_locks);
5376         while (level < BTRFS_MAX_LEVEL) {
5377                 if (!path->nodes[level])
5378                         return 1;
5379
5380                 slot = path->slots[level] + 1;
5381                 c = path->nodes[level];
5382 next:
5383                 if (slot >= btrfs_header_nritems(c)) {
5384                         int ret;
5385                         int orig_lowest;
5386                         struct btrfs_key cur_key;
5387                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5388                             !path->nodes[level + 1])
5389                                 return 1;
5390
5391                         if (path->locks[level + 1]) {
5392                                 level++;
5393                                 continue;
5394                         }
5395
5396                         slot = btrfs_header_nritems(c) - 1;
5397                         if (level == 0)
5398                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5399                         else
5400                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5401
5402                         orig_lowest = path->lowest_level;
5403                         btrfs_release_path(path);
5404                         path->lowest_level = level;
5405                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5406                                                 0, 0);
5407                         path->lowest_level = orig_lowest;
5408                         if (ret < 0)
5409                                 return ret;
5410
5411                         c = path->nodes[level];
5412                         slot = path->slots[level];
5413                         if (ret == 0)
5414                                 slot++;
5415                         goto next;
5416                 }
5417
5418                 if (level == 0)
5419                         btrfs_item_key_to_cpu(c, key, slot);
5420                 else {
5421                         u64 gen = btrfs_node_ptr_generation(c, slot);
5422
5423                         if (gen < min_trans) {
5424                                 slot++;
5425                                 goto next;
5426                         }
5427                         btrfs_node_key_to_cpu(c, key, slot);
5428                 }
5429                 return 0;
5430         }
5431         return 1;
5432 }
5433
5434 /*
5435  * search the tree again to find a leaf with greater keys
5436  * returns 0 if it found something or 1 if there are no greater leaves.
5437  * returns < 0 on io errors.
5438  */
5439 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5440 {
5441         return btrfs_next_old_leaf(root, path, 0);
5442 }
5443
5444 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5445                         u64 time_seq)
5446 {
5447         int slot;
5448         int level;
5449         struct extent_buffer *c;
5450         struct extent_buffer *next;
5451         struct btrfs_key key;
5452         u32 nritems;
5453         int ret;
5454         int old_spinning = path->leave_spinning;
5455         int next_rw_lock = 0;
5456
5457         nritems = btrfs_header_nritems(path->nodes[0]);
5458         if (nritems == 0)
5459                 return 1;
5460
5461         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5462 again:
5463         level = 1;
5464         next = NULL;
5465         next_rw_lock = 0;
5466         btrfs_release_path(path);
5467
5468         path->keep_locks = 1;
5469         path->leave_spinning = 1;
5470
5471         if (time_seq)
5472                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5473         else
5474                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5475         path->keep_locks = 0;
5476
5477         if (ret < 0)
5478                 return ret;
5479
5480         nritems = btrfs_header_nritems(path->nodes[0]);
5481         /*
5482          * by releasing the path above we dropped all our locks.  A balance
5483          * could have added more items next to the key that used to be
5484          * at the very end of the block.  So, check again here and
5485          * advance the path if there are now more items available.
5486          */
5487         if (nritems > 0 && path->slots[0] < nritems - 1) {
5488                 if (ret == 0)
5489                         path->slots[0]++;
5490                 ret = 0;
5491                 goto done;
5492         }
5493
5494         while (level < BTRFS_MAX_LEVEL) {
5495                 if (!path->nodes[level]) {
5496                         ret = 1;
5497                         goto done;
5498                 }
5499
5500                 slot = path->slots[level] + 1;
5501                 c = path->nodes[level];
5502                 if (slot >= btrfs_header_nritems(c)) {
5503                         level++;
5504                         if (level == BTRFS_MAX_LEVEL) {
5505                                 ret = 1;
5506                                 goto done;
5507                         }
5508                         continue;
5509                 }
5510
5511                 if (next) {
5512                         btrfs_tree_unlock_rw(next, next_rw_lock);
5513                         free_extent_buffer(next);
5514                 }
5515
5516                 next = c;
5517                 next_rw_lock = path->locks[level];
5518                 ret = read_block_for_search(NULL, root, path, &next, level,
5519                                             slot, &key, 0);
5520                 if (ret == -EAGAIN)
5521                         goto again;
5522
5523                 if (ret < 0) {
5524                         btrfs_release_path(path);
5525                         goto done;
5526                 }
5527
5528                 if (!path->skip_locking) {
5529                         ret = btrfs_try_tree_read_lock(next);
5530                         if (!ret && time_seq) {
5531                                 /*
5532                                  * If we don't get the lock, we may be racing
5533                                  * with push_leaf_left, holding that lock while
5534                                  * itself waiting for the leaf we've currently
5535                                  * locked. To solve this situation, we give up
5536                                  * on our lock and cycle.
5537                                  */
5538                                 free_extent_buffer(next);
5539                                 btrfs_release_path(path);
5540                                 cond_resched();
5541                                 goto again;
5542                         }
5543                         if (!ret) {
5544                                 btrfs_set_path_blocking(path);
5545                                 btrfs_tree_read_lock(next);
5546                                 btrfs_clear_path_blocking(path, next,
5547                                                           BTRFS_READ_LOCK);
5548                         }
5549                         next_rw_lock = BTRFS_READ_LOCK;
5550                 }
5551                 break;
5552         }
5553         path->slots[level] = slot;
5554         while (1) {
5555                 level--;
5556                 c = path->nodes[level];
5557                 if (path->locks[level])
5558                         btrfs_tree_unlock_rw(c, path->locks[level]);
5559
5560                 free_extent_buffer(c);
5561                 path->nodes[level] = next;
5562                 path->slots[level] = 0;
5563                 if (!path->skip_locking)
5564                         path->locks[level] = next_rw_lock;
5565                 if (!level)
5566                         break;
5567
5568                 ret = read_block_for_search(NULL, root, path, &next, level,
5569                                             0, &key, 0);
5570                 if (ret == -EAGAIN)
5571                         goto again;
5572
5573                 if (ret < 0) {
5574                         btrfs_release_path(path);
5575                         goto done;
5576                 }
5577
5578                 if (!path->skip_locking) {
5579                         ret = btrfs_try_tree_read_lock(next);
5580                         if (!ret) {
5581                                 btrfs_set_path_blocking(path);
5582                                 btrfs_tree_read_lock(next);
5583                                 btrfs_clear_path_blocking(path, next,
5584                                                           BTRFS_READ_LOCK);
5585                         }
5586                         next_rw_lock = BTRFS_READ_LOCK;
5587                 }
5588         }
5589         ret = 0;
5590 done:
5591         unlock_up(path, 0, 1, 0, NULL);
5592         path->leave_spinning = old_spinning;
5593         if (!old_spinning)
5594                 btrfs_set_path_blocking(path);
5595
5596         return ret;
5597 }
5598
5599 /*
5600  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5601  * searching until it gets past min_objectid or finds an item of 'type'
5602  *
5603  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5604  */
5605 int btrfs_previous_item(struct btrfs_root *root,
5606                         struct btrfs_path *path, u64 min_objectid,
5607                         int type)
5608 {
5609         struct btrfs_key found_key;
5610         struct extent_buffer *leaf;
5611         u32 nritems;
5612         int ret;
5613
5614         while (1) {
5615                 if (path->slots[0] == 0) {
5616                         btrfs_set_path_blocking(path);
5617                         ret = btrfs_prev_leaf(root, path);
5618                         if (ret != 0)
5619                                 return ret;
5620                 } else {
5621                         path->slots[0]--;
5622                 }
5623                 leaf = path->nodes[0];
5624                 nritems = btrfs_header_nritems(leaf);
5625                 if (nritems == 0)
5626                         return 1;
5627                 if (path->slots[0] == nritems)
5628                         path->slots[0]--;
5629
5630                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5631                 if (found_key.objectid < min_objectid)
5632                         break;
5633                 if (found_key.type == type)
5634                         return 0;
5635                 if (found_key.objectid == min_objectid &&
5636                     found_key.type < type)
5637                         break;
5638         }
5639         return 1;
5640 }