btrfs: replace many BUG_ONs with proper error handling
[platform/adaptation/renesas_rcar/renesas_kernel.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include "ctree.h"
22 #include "disk-io.h"
23 #include "transaction.h"
24 #include "print-tree.h"
25 #include "locking.h"
26
27 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
28                       *root, struct btrfs_path *path, int level);
29 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
30                       *root, struct btrfs_key *ins_key,
31                       struct btrfs_path *path, int data_size, int extend);
32 static int push_node_left(struct btrfs_trans_handle *trans,
33                           struct btrfs_root *root, struct extent_buffer *dst,
34                           struct extent_buffer *src, int empty);
35 static int balance_node_right(struct btrfs_trans_handle *trans,
36                               struct btrfs_root *root,
37                               struct extent_buffer *dst_buf,
38                               struct extent_buffer *src_buf);
39 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
40                    struct btrfs_path *path, int level, int slot);
41
42 struct btrfs_path *btrfs_alloc_path(void)
43 {
44         struct btrfs_path *path;
45         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
46         return path;
47 }
48
49 /*
50  * set all locked nodes in the path to blocking locks.  This should
51  * be done before scheduling
52  */
53 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
54 {
55         int i;
56         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
57                 if (!p->nodes[i] || !p->locks[i])
58                         continue;
59                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
60                 if (p->locks[i] == BTRFS_READ_LOCK)
61                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
62                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
63                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
64         }
65 }
66
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* lockdep really cares that we take all of these spinlocks
	 * in the right order.  If any of the locks in the path are not
	 * currently blocking, it is going to complain.  So, make really
	 * really sure by forcing the path to blocking before we clear
	 * the path blocking.
	 */
	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		/* track the new state so we release the matching lock type */
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);
#endif

	/*
	 * Walk from the leaf level upward so spinning locks are retaken in
	 * the reverse of the order btrfs_set_path_blocking() used.
	 */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
#endif
}
112
113 /* this also releases the path */
114 void btrfs_free_path(struct btrfs_path *p)
115 {
116         if (!p)
117                 return;
118         btrfs_release_path(p);
119         kmem_cache_free(btrfs_path_cachep, p);
120 }
121
122 /*
123  * path release drops references on the extent buffers in the path
124  * and it drops any locks held by this path
125  *
126  * It is safe to call this on paths that no locks or extent buffers held.
127  */
128 noinline void btrfs_release_path(struct btrfs_path *p)
129 {
130         int i;
131
132         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
133                 p->slots[i] = 0;
134                 if (!p->nodes[i])
135                         continue;
136                 if (p->locks[i]) {
137                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
138                         p->locks[i] = 0;
139                 }
140                 free_extent_buffer(p->nodes[i]);
141                 p->nodes[i] = NULL;
142         }
143 }
144
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	/*
	 * root->node is published with rcu_assign_pointer() (see
	 * __btrfs_cow_block), so the dereference and the reference grab
	 * must both happen inside the RCU read-side section to keep the
	 * buffer from being freed underneath us.
	 */
	rcu_read_lock();
	eb = rcu_dereference(root->node);
	extent_buffer_get(eb);
	rcu_read_unlock();
	return eb;
}
165
166 /* loop around taking references on and locking the root node of the
167  * tree until you end up with a lock on the root.  A locked buffer
168  * is returned, with a reference held.
169  */
170 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
171 {
172         struct extent_buffer *eb;
173
174         while (1) {
175                 eb = btrfs_root_node(root);
176                 btrfs_tree_lock(eb);
177                 if (eb == root->node)
178                         break;
179                 btrfs_tree_unlock(eb);
180                 free_extent_buffer(eb);
181         }
182         return eb;
183 }
184
185 /* loop around taking references on and locking the root node of the
186  * tree until you end up with a lock on the root.  A locked buffer
187  * is returned, with a reference held.
188  */
189 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
190 {
191         struct extent_buffer *eb;
192
193         while (1) {
194                 eb = btrfs_root_node(root);
195                 btrfs_tree_read_lock(eb);
196                 if (eb == root->node)
197                         break;
198                 btrfs_tree_read_unlock(eb);
199                 free_extent_buffer(eb);
200         }
201         return eb;
202 }
203
204 /* cowonly root (everything not a reference counted cow subvolume), just get
205  * put onto a simple dirty list.  transaction.c walks this to make sure they
206  * get properly updated on disk.
207  */
208 static void add_root_to_dirty_list(struct btrfs_root *root)
209 {
210         if (root->track_dirty && list_empty(&root->dirty_list)) {
211                 list_add(&root->dirty_list,
212                          &root->fs_info->dirty_cowonly_roots);
213         }
214 }
215
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* copying a ref-counted root outside its transaction is a bug */
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	/* first key of the block is used as the allocation hint key */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
				     new_root_objectid, &disk_key, level,
				     buf->start, 0, 1);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the contents, then fix up the header for the new block */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* reloc trees use full backrefs (last arg 1 vs 0) */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0, 1);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
275
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (root->ref_cows &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* pre mixed-backref blocks are conservatively treated as shared */
	if (root->ref_cows &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
301
302 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
303                                        struct btrfs_root *root,
304                                        struct extent_buffer *buf,
305                                        struct extent_buffer *cow,
306                                        int *last_ref)
307 {
308         u64 refs;
309         u64 owner;
310         u64 flags;
311         u64 new_flags = 0;
312         int ret;
313
314         /*
315          * Backrefs update rules:
316          *
317          * Always use full backrefs for extent pointers in tree block
318          * allocated by tree relocation.
319          *
320          * If a shared tree block is no longer referenced by its owner
321          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
322          * use full backrefs for extent pointers in tree block.
323          *
324          * If a tree block is been relocating
325          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
326          * use full backrefs for extent pointers in tree block.
327          * The reason for this is some operations (such as drop tree)
328          * are only allowed for blocks use full backrefs.
329          */
330
331         if (btrfs_block_can_be_shared(root, buf)) {
332                 ret = btrfs_lookup_extent_info(trans, root, buf->start,
333                                                buf->len, &refs, &flags);
334                 if (ret)
335                         return ret;
336                 if (refs == 0) {
337                         ret = -EROFS;
338                         btrfs_std_error(root->fs_info, ret);
339                         return ret;
340                 }
341         } else {
342                 refs = 1;
343                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
344                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
345                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
346                 else
347                         flags = 0;
348         }
349
350         owner = btrfs_header_owner(buf);
351         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
352                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
353
354         if (refs > 1) {
355                 if ((owner == root->root_key.objectid ||
356                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
357                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
358                         ret = btrfs_inc_ref(trans, root, buf, 1, 1);
359                         BUG_ON(ret); /* -ENOMEM */
360
361                         if (root->root_key.objectid ==
362                             BTRFS_TREE_RELOC_OBJECTID) {
363                                 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
364                                 BUG_ON(ret); /* -ENOMEM */
365                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
366                                 BUG_ON(ret); /* -ENOMEM */
367                         }
368                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
369                 } else {
370
371                         if (root->root_key.objectid ==
372                             BTRFS_TREE_RELOC_OBJECTID)
373                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
374                         else
375                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
376                         BUG_ON(ret); /* -ENOMEM */
377                 }
378                 if (new_flags != 0) {
379                         ret = btrfs_set_disk_extent_flags(trans, root,
380                                                           buf->start,
381                                                           buf->len,
382                                                           new_flags, 0);
383                         if (ret)
384                                 return ret;
385                 }
386         } else {
387                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
388                         if (root->root_key.objectid ==
389                             BTRFS_TREE_RELOC_OBJECTID)
390                                 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
391                         else
392                                 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
393                         BUG_ON(ret); /* -ENOMEM */
394                         ret = btrfs_dec_ref(trans, root, buf, 1, 1);
395                         BUG_ON(ret); /* -ENOMEM */
396                 }
397                 clean_tree_block(trans, root, buf);
398                 *last_ref = 1;
399         }
400         return 0;
401 }
402
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* if the caller passed buf as *cow_ret, we own its lock and must
	 * drop it once the COW copy replaces it */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* first key of the block becomes the allocation hint key */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
				     root->root_key.objectid, &disk_key,
				     level, search_start, empty_size, 1);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

	/* fix up extent backrefs; on failure the transaction is aborted */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	/* NOTE(review): btrfs_reloc_cow_block's return value is ignored
	 * here — confirm it cannot fail in a way that matters */
	if (root->ref_cows)
		btrfs_reloc_cow_block(trans, root, buf, cow);

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		/* publish the new root; readers use RCU (btrfs_root_node) */
		extent_buffer_get(cow);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref, 1);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		/* repoint the parent slot at the COW copy */
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref, 1);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
522
523 static inline int should_cow_block(struct btrfs_trans_handle *trans,
524                                    struct btrfs_root *root,
525                                    struct extent_buffer *buf)
526 {
527         /* ensure we can see the force_cow */
528         smp_rmb();
529
530         /*
531          * We do not need to cow a block if
532          * 1) this block is not created or changed in this transaction;
533          * 2) this block does not belong to TREE_RELOC tree;
534          * 3) the root is not forced COW.
535          *
536          * What is forced COW:
537          *    when we create snapshot during commiting the transaction,
538          *    after we've finished coping src root, we must COW the shared
539          *    block to ensure the metadata consistency.
540          */
541         if (btrfs_header_generation(buf) == trans->transid &&
542             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
543             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
544               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
545             !root->force_cow)
546                 return 0;
547         return 1;
548 }
549
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	/* COWing outside the running transaction indicates a caller bug;
	 * warn loudly but continue */
	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %llu running %llu\n",
		       (unsigned long long)trans->transid,
		       (unsigned long long)root->fs_info->generation);
		WARN_ON(1);
	}

	/* already COWed in this transaction and still unwritten: reuse it */
	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	/* hint the allocator toward the same 1GB-aligned region as buf */
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
595
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by: within 32k of each other on disk,
 * measured gap-to-gap (blocksize excluded).
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other)
		return (other - (blocknr + blocksize)) < 32768 ? 1 : 0;
	if (blocknr > other)
		return (blocknr - (other + blocksize)) < 32768 ? 1 : 0;
	return 0;
}
608
609 /*
610  * compare two keys in a memcmp fashion
611  */
612 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
613 {
614         struct btrfs_key k1;
615
616         btrfs_disk_key_to_cpu(&k1, disk);
617
618         return btrfs_comp_cpu_keys(&k1, k2);
619 }
620
/*
 * same as comp_keys only with two btrfs_key's: lexicographic compare on
 * (objectid, type, offset); returns -1, 0 or 1.
 */
int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid != k2->objectid)
		return k1->objectid > k2->objectid ? 1 : -1;
	if (k1->type != k2->type)
		return k1->type > k2->type ? 1 : -1;
	if (k1->offset != k2->offset)
		return k1->offset > k2->offset ? 1 : -1;
	return 0;
}
640
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	/* in cache_only mode only nodes directly above leaves are walked */
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

	if (trans->transaction != root->fs_info->running_transaction)
		WARN_ON(1);
	if (trans->transid != root->fs_info->generation)
		WARN_ON(1);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
	end_slot = parent_nritems;

	/* nothing to reorder with a single child */
	if (parent_nritems == 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;

		/* skip keys before the caller's progress cursor */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* a block already near its neighbors needs no relocation */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		/* NOTE(review): `i < end_slot - 2` skips this check for the
		 * last two slots even though i + 1 is valid up to
		 * end_slot - 1 — looks like an off-by-one; confirm */
		if (!close && i < end_slot - 2) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			/* cache_only: never hit the disk, just move on */
			if (cache_only) {
				free_extent_buffer(cur);
				continue;
			}
			if (!cur) {
				cur = read_tree_block(root, blocknr,
							 blocksize, gen);
				if (!cur)
					return -EIO;
			} else if (!uptodate) {
				/* NOTE(review): btrfs_read_buffer's return
				 * value is ignored here — confirm a read
				 * failure is caught downstream */
				btrfs_read_buffer(cur, gen);
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/* COW the child near search_start to improve disk order */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
752
753 /*
754  * The leaf data grows from end-to-front in the node.
755  * this returns the address of the start of the last item,
756  * which is the stop of the leaf data stack
757  */
758 static inline unsigned int leaf_data_end(struct btrfs_root *root,
759                                          struct extent_buffer *leaf)
760 {
761         u32 nr = btrfs_header_nritems(leaf);
762         if (nr == 0)
763                 return BTRFS_LEAF_DATA_SIZE(root);
764         return btrfs_item_offset_nr(leaf, nr - 1);
765 }
766
767
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * returns 0 if the exact key was found, 1 otherwise.
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	/* kaddr/map_start/map_len cache the last mapped region of eb so
	 * successive probes into the same area avoid remapping */
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* remap only when the key at offset falls outside the
		 * currently mapped window */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/* key straddles a page: copy it out instead */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	/* not found: low is the insertion point */
	*slot = low;
	return 1;
}
834
835 /*
836  * simple bin_search frontend that does the right thing for
837  * leaves vs nodes
838  */
839 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
840                       int level, int *slot)
841 {
842         if (level == 0) {
843                 return generic_bin_search(eb,
844                                           offsetof(struct btrfs_leaf, items),
845                                           sizeof(struct btrfs_item),
846                                           key, btrfs_header_nritems(eb),
847                                           slot);
848         } else {
849                 return generic_bin_search(eb,
850                                           offsetof(struct btrfs_node, ptrs),
851                                           sizeof(struct btrfs_key_ptr),
852                                           key, btrfs_header_nritems(eb),
853                                           slot);
854         }
855         return -1;
856 }
857
/*
 * Public entry point for bin_search(), so code outside this file can
 * locate a key's slot inside a single node or leaf.
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
863
864 static void root_add_used(struct btrfs_root *root, u32 size)
865 {
866         spin_lock(&root->accounting_lock);
867         btrfs_set_root_used(&root->root_item,
868                             btrfs_root_used(&root->root_item) + size);
869         spin_unlock(&root->accounting_lock);
870 }
871
872 static void root_sub_used(struct btrfs_root *root, u32 size)
873 {
874         spin_lock(&root->accounting_lock);
875         btrfs_set_root_used(&root->root_item,
876                             btrfs_root_used(&root->root_item) - size);
877         spin_unlock(&root->accounting_lock);
878 }
879
880 /* given a node and slot number, this reads the blocks it points to.  The
881  * extent buffer is returned with a reference taken (but unlocked).
882  * NULL is returned on error.
883  */
884 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
885                                    struct extent_buffer *parent, int slot)
886 {
887         int level = btrfs_header_level(parent);
888         if (slot < 0)
889                 return NULL;
890         if (slot >= btrfs_header_nritems(parent))
891                 return NULL;
892
893         BUG_ON(level == 0);
894
895         return read_tree_block(root, btrfs_node_blockptr(parent, slot),
896                        btrfs_level_size(root, level - 1),
897                        btrfs_node_ptr_generation(parent, slot));
898 }
899
900 /*
901  * node level balancing, used to make sure nodes are in proper order for
902  * item deletion.  We balance from the top down, so we have to make sure
903  * that a deletion won't leave an node completely empty later on.
904  */
905 static noinline int balance_level(struct btrfs_trans_handle *trans,
906                          struct btrfs_root *root,
907                          struct btrfs_path *path, int level)
908 {
909         struct extent_buffer *right = NULL;
910         struct extent_buffer *mid;
911         struct extent_buffer *left = NULL;
912         struct extent_buffer *parent = NULL;
913         int ret = 0;
914         int wret;
915         int pslot;
916         int orig_slot = path->slots[level];
917         u64 orig_ptr;
918
919         if (level == 0)
920                 return 0;
921
922         mid = path->nodes[level];
923
924         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
925                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
926         WARN_ON(btrfs_header_generation(mid) != trans->transid);
927
928         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
929
930         if (level < BTRFS_MAX_LEVEL - 1) {
931                 parent = path->nodes[level + 1];
932                 pslot = path->slots[level + 1];
933         }
934
935         /*
936          * deal with the case where there is only one pointer in the root
937          * by promoting the node below to a root
938          */
939         if (!parent) {
940                 struct extent_buffer *child;
941
942                 if (btrfs_header_nritems(mid) != 1)
943                         return 0;
944
945                 /* promote the child to a root */
946                 child = read_node_slot(root, mid, 0);
947                 if (!child) {
948                         ret = -EROFS;
949                         btrfs_std_error(root->fs_info, ret);
950                         goto enospc;
951                 }
952
953                 btrfs_tree_lock(child);
954                 btrfs_set_lock_blocking(child);
955                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
956                 if (ret) {
957                         btrfs_tree_unlock(child);
958                         free_extent_buffer(child);
959                         goto enospc;
960                 }
961
962                 rcu_assign_pointer(root->node, child);
963
964                 add_root_to_dirty_list(root);
965                 btrfs_tree_unlock(child);
966
967                 path->locks[level] = 0;
968                 path->nodes[level] = NULL;
969                 clean_tree_block(trans, root, mid);
970                 btrfs_tree_unlock(mid);
971                 /* once for the path */
972                 free_extent_buffer(mid);
973
974                 root_sub_used(root, mid->len);
975                 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
976                 /* once for the root ptr */
977                 free_extent_buffer(mid);
978                 return 0;
979         }
980         if (btrfs_header_nritems(mid) >
981             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
982                 return 0;
983
984         btrfs_header_nritems(mid);
985
986         left = read_node_slot(root, parent, pslot - 1);
987         if (left) {
988                 btrfs_tree_lock(left);
989                 btrfs_set_lock_blocking(left);
990                 wret = btrfs_cow_block(trans, root, left,
991                                        parent, pslot - 1, &left);
992                 if (wret) {
993                         ret = wret;
994                         goto enospc;
995                 }
996         }
997         right = read_node_slot(root, parent, pslot + 1);
998         if (right) {
999                 btrfs_tree_lock(right);
1000                 btrfs_set_lock_blocking(right);
1001                 wret = btrfs_cow_block(trans, root, right,
1002                                        parent, pslot + 1, &right);
1003                 if (wret) {
1004                         ret = wret;
1005                         goto enospc;
1006                 }
1007         }
1008
1009         /* first, try to make some room in the middle buffer */
1010         if (left) {
1011                 orig_slot += btrfs_header_nritems(left);
1012                 wret = push_node_left(trans, root, left, mid, 1);
1013                 if (wret < 0)
1014                         ret = wret;
1015                 btrfs_header_nritems(mid);
1016         }
1017
1018         /*
1019          * then try to empty the right most buffer into the middle
1020          */
1021         if (right) {
1022                 wret = push_node_left(trans, root, mid, right, 1);
1023                 if (wret < 0 && wret != -ENOSPC)
1024                         ret = wret;
1025                 if (btrfs_header_nritems(right) == 0) {
1026                         clean_tree_block(trans, root, right);
1027                         btrfs_tree_unlock(right);
1028                         del_ptr(trans, root, path, level + 1, pslot + 1);
1029                         root_sub_used(root, right->len);
1030                         btrfs_free_tree_block(trans, root, right, 0, 1, 0);
1031                         free_extent_buffer(right);
1032                         right = NULL;
1033                 } else {
1034                         struct btrfs_disk_key right_key;
1035                         btrfs_node_key(right, &right_key, 0);
1036                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1037                         btrfs_mark_buffer_dirty(parent);
1038                 }
1039         }
1040         if (btrfs_header_nritems(mid) == 1) {
1041                 /*
1042                  * we're not allowed to leave a node with one item in the
1043                  * tree during a delete.  A deletion from lower in the tree
1044                  * could try to delete the only pointer in this node.
1045                  * So, pull some keys from the left.
1046                  * There has to be a left pointer at this point because
1047                  * otherwise we would have pulled some pointers from the
1048                  * right
1049                  */
1050                 if (!left) {
1051                         ret = -EROFS;
1052                         btrfs_std_error(root->fs_info, ret);
1053                         goto enospc;
1054                 }
1055                 wret = balance_node_right(trans, root, mid, left);
1056                 if (wret < 0) {
1057                         ret = wret;
1058                         goto enospc;
1059                 }
1060                 if (wret == 1) {
1061                         wret = push_node_left(trans, root, left, mid, 1);
1062                         if (wret < 0)
1063                                 ret = wret;
1064                 }
1065                 BUG_ON(wret == 1);
1066         }
1067         if (btrfs_header_nritems(mid) == 0) {
1068                 clean_tree_block(trans, root, mid);
1069                 btrfs_tree_unlock(mid);
1070                 del_ptr(trans, root, path, level + 1, pslot);
1071                 root_sub_used(root, mid->len);
1072                 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
1073                 free_extent_buffer(mid);
1074                 mid = NULL;
1075         } else {
1076                 /* update the parent key to reflect our changes */
1077                 struct btrfs_disk_key mid_key;
1078                 btrfs_node_key(mid, &mid_key, 0);
1079                 btrfs_set_node_key(parent, &mid_key, pslot);
1080                 btrfs_mark_buffer_dirty(parent);
1081         }
1082
1083         /* update the path */
1084         if (left) {
1085                 if (btrfs_header_nritems(left) > orig_slot) {
1086                         extent_buffer_get(left);
1087                         /* left was locked after cow */
1088                         path->nodes[level] = left;
1089                         path->slots[level + 1] -= 1;
1090                         path->slots[level] = orig_slot;
1091                         if (mid) {
1092                                 btrfs_tree_unlock(mid);
1093                                 free_extent_buffer(mid);
1094                         }
1095                 } else {
1096                         orig_slot -= btrfs_header_nritems(left);
1097                         path->slots[level] = orig_slot;
1098                 }
1099         }
1100         /* double check we haven't messed things up */
1101         if (orig_ptr !=
1102             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1103                 BUG();
1104 enospc:
1105         if (right) {
1106                 btrfs_tree_unlock(right);
1107                 free_extent_buffer(right);
1108         }
1109         if (left) {
1110                 if (path->nodes[level] != left)
1111                         btrfs_tree_unlock(left);
1112                 free_extent_buffer(left);
1113         }
1114         return ret;
1115 }
1116
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if a push succeeded (path updated to the new position),
 * 1 if nothing could be pushed, or a negative errno from COW.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* no parent means mid is the root: nothing to push into */
	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left sibling is itself nearly full, give up on it */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* push succeeded; fix the parent key and the path */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot migrated into the left sibling */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right sibling is itself nearly full, give up */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* push succeeded; fix the parent key and the path */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot migrated into the right sibling */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
1248
1249 /*
1250  * readahead one full node of leaves, finding things that are close
1251  * to the block in 'slot', and triggering ra on them.
1252  */
1253 static void reada_for_search(struct btrfs_root *root,
1254                              struct btrfs_path *path,
1255                              int level, int slot, u64 objectid)
1256 {
1257         struct extent_buffer *node;
1258         struct btrfs_disk_key disk_key;
1259         u32 nritems;
1260         u64 search;
1261         u64 target;
1262         u64 nread = 0;
1263         u64 gen;
1264         int direction = path->reada;
1265         struct extent_buffer *eb;
1266         u32 nr;
1267         u32 blocksize;
1268         u32 nscan = 0;
1269
1270         if (level != 1)
1271                 return;
1272
1273         if (!path->nodes[level])
1274                 return;
1275
1276         node = path->nodes[level];
1277
1278         search = btrfs_node_blockptr(node, slot);
1279         blocksize = btrfs_level_size(root, level - 1);
1280         eb = btrfs_find_tree_block(root, search, blocksize);
1281         if (eb) {
1282                 free_extent_buffer(eb);
1283                 return;
1284         }
1285
1286         target = search;
1287
1288         nritems = btrfs_header_nritems(node);
1289         nr = slot;
1290
1291         while (1) {
1292                 if (direction < 0) {
1293                         if (nr == 0)
1294                                 break;
1295                         nr--;
1296                 } else if (direction > 0) {
1297                         nr++;
1298                         if (nr >= nritems)
1299                                 break;
1300                 }
1301                 if (path->reada < 0 && objectid) {
1302                         btrfs_node_key(node, &disk_key, nr);
1303                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1304                                 break;
1305                 }
1306                 search = btrfs_node_blockptr(node, nr);
1307                 if ((search <= target && target - search <= 65536) ||
1308                     (search > target && search - target <= 65536)) {
1309                         gen = btrfs_node_ptr_generation(node, nr);
1310                         readahead_tree_block(root, search, blocksize, gen);
1311                         nread += blocksize;
1312                 }
1313                 nscan++;
1314                 if ((nread > 65536 || nscan > 32))
1315                         break;
1316         }
1317 }
1318
1319 /*
1320  * returns -EAGAIN if it had to drop the path, or zero if everything was in
1321  * cache
1322  */
1323 static noinline int reada_for_balance(struct btrfs_root *root,
1324                                       struct btrfs_path *path, int level)
1325 {
1326         int slot;
1327         int nritems;
1328         struct extent_buffer *parent;
1329         struct extent_buffer *eb;
1330         u64 gen;
1331         u64 block1 = 0;
1332         u64 block2 = 0;
1333         int ret = 0;
1334         int blocksize;
1335
1336         parent = path->nodes[level + 1];
1337         if (!parent)
1338                 return 0;
1339
1340         nritems = btrfs_header_nritems(parent);
1341         slot = path->slots[level + 1];
1342         blocksize = btrfs_level_size(root, level);
1343
1344         if (slot > 0) {
1345                 block1 = btrfs_node_blockptr(parent, slot - 1);
1346                 gen = btrfs_node_ptr_generation(parent, slot - 1);
1347                 eb = btrfs_find_tree_block(root, block1, blocksize);
1348                 if (eb && btrfs_buffer_uptodate(eb, gen))
1349                         block1 = 0;
1350                 free_extent_buffer(eb);
1351         }
1352         if (slot + 1 < nritems) {
1353                 block2 = btrfs_node_blockptr(parent, slot + 1);
1354                 gen = btrfs_node_ptr_generation(parent, slot + 1);
1355                 eb = btrfs_find_tree_block(root, block2, blocksize);
1356                 if (eb && btrfs_buffer_uptodate(eb, gen))
1357                         block2 = 0;
1358                 free_extent_buffer(eb);
1359         }
1360         if (block1 || block2) {
1361                 ret = -EAGAIN;
1362
1363                 /* release the whole path */
1364                 btrfs_release_path(path);
1365
1366                 /* read the blocks */
1367                 if (block1)
1368                         readahead_tree_block(root, block1, blocksize, 0);
1369                 if (block2)
1370                         readahead_tree_block(root, block2, blocksize, 0);
1371
1372                 if (block1) {
1373                         eb = read_tree_block(root, block1, blocksize, 0);
1374                         free_extent_buffer(eb);
1375                 }
1376                 if (block2) {
1377                         eb = read_tree_block(root, block2, blocksize, 0);
1378                         free_extent_buffer(eb);
1379                 }
1380         }
1381         return ret;
1382 }
1383
1384
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
{
	int i;
	/* levels at or below skip_level must stay locked */
	int skip_level = level;
	/* once set, stop raising skip_level for higher levels */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change here may ripple into the parent */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* last slot: keep the lock so we can step to the next leaf */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
		}
	}
}
1434
1435 /*
1436  * This releases any locks held in the path starting at level and
1437  * going all the way up to the root.
1438  *
1439  * btrfs_search_slot will keep the lock held on higher nodes in a few
1440  * corner cases, such as COW of the block at slot zero in the node.  This
1441  * ignores those rules, and it should only be called when there are no
1442  * more updates to be done higher up in the tree.
1443  */
1444 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
1445 {
1446         int i;
1447
1448         if (path->keep_locks)
1449                 return;
1450
1451         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1452                 if (!path->nodes[i])
1453                         continue;
1454                 if (!path->locks[i])
1455                         continue;
1456                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1457                 path->locks[i] = 0;
1458         }
1459 }
1460
/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key)
{
	u64 blocknr;
	u64 gen;
	u32 blocksize;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	blocksize = btrfs_level_size(root, level - 1);

	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
	if (tmp) {
		if (btrfs_buffer_uptodate(tmp, 0)) {
			if (btrfs_buffer_uptodate(tmp, gen)) {
				/*
				 * we found an up to date block without
				 * sleeping, return
				 * right away
				 */
				*eb_ret = tmp;
				return 0;
			}
			/* the pages were up to date, but we failed
			 * the generation number check.  Do a full
			 * read for the generation number that is correct.
			 * We must do this without dropping locks so
			 * we can trust our generation number
			 */
			free_extent_buffer(tmp);
			btrfs_set_path_blocking(p);

			tmp = read_tree_block(root, blocknr, blocksize, gen);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				*eb_ret = tmp;
				return 0;
			}
			/* re-read still doesn't match the expected gen */
			free_extent_buffer(tmp);
			btrfs_release_path(p);
			return -EIO;
		}
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	/* drop the not-uptodate buffer found above, if any */
	free_extent_buffer(tmp);
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, blocksize, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
1549
/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	/*
	 * inserting (or splitting for search_for_split): if the node is
	 * nearly full, split it now so the insert below cannot fail
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* splitting needs write locks on this level and the parent */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		/* split_node returns 0 or a negative error, never > 0 */
		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* deleting: node is under half full, merge/rebalance it */
		int sret;

		/* balancing needs write locks on this level and the parent */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		sret = reada_for_balance(root, p, level);
		if (sret)
			goto again;

		btrfs_set_path_blocking(p);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have emptied this level entirely */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
1626
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* read-only search: no block will be modified, never write lock */
	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

again:
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	/* walk down the tree one level per iteration */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			btrfs_set_path_blocking(p);

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level + 1 > write_lock_level) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		BUG_ON(!cow && ins_len);

		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If cow is true, then we might be changing slot zero,
		 * which may require changing the parent.  So, we can't
		 * drop the lock until after we know which slot we're
		 * operating on.
		 */
		if (!cow)
			btrfs_unlock_up_safe(p, level + 1);

		/* ret != 0 means the key was not found in this node */
		ret = bin_search(b, key, level, &slot);

		if (level != 0) {
			int dec = 0;
			if (ret && slot > 0) {
				/* follow the pointer whose key precedes ours */
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && cow &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try spinning first, fall back to blocking */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_try_tree_read_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* level 0: we've reached the leaf */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);
	return ret;
}
1890
1891 /*
1892  * adjust the pointers going up the tree, starting at level
1893  * making sure the right key of each node is points to 'key'.
1894  * This is used after shifting pointers to the left, so it stops
1895  * fixing up pointers when a given leaf/node is not in slot 0 of the
1896  * higher levels
1897  *
1898  */
1899 static void fixup_low_keys(struct btrfs_trans_handle *trans,
1900                            struct btrfs_root *root, struct btrfs_path *path,
1901                            struct btrfs_disk_key *key, int level)
1902 {
1903         int i;
1904         struct extent_buffer *t;
1905
1906         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1907                 int tslot = path->slots[i];
1908                 if (!path->nodes[i])
1909                         break;
1910                 t = path->nodes[i];
1911                 btrfs_set_node_key(t, key, tslot);
1912                 btrfs_mark_buffer_dirty(path->nodes[i]);
1913                 if (tslot != 0)
1914                         break;
1915         }
1916 }
1917
1918 /*
1919  * update item key.
1920  *
1921  * This function isn't completely safe. It's the caller's responsibility
1922  * that the new key won't break the order
1923  */
1924 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1925                              struct btrfs_root *root, struct btrfs_path *path,
1926                              struct btrfs_key *new_key)
1927 {
1928         struct btrfs_disk_key disk_key;
1929         struct extent_buffer *eb;
1930         int slot;
1931
1932         eb = path->nodes[0];
1933         slot = path->slots[0];
1934         if (slot > 0) {
1935                 btrfs_item_key(eb, &disk_key, slot - 1);
1936                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
1937         }
1938         if (slot < btrfs_header_nritems(eb) - 1) {
1939                 btrfs_item_key(eb, &disk_key, slot + 1);
1940                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
1941         }
1942
1943         btrfs_cpu_key_to_disk(&disk_key, new_key);
1944         btrfs_set_item_key(eb, &disk_key, slot);
1945         btrfs_mark_buffer_dirty(eb);
1946         if (slot == 0)
1947                 fixup_low_keys(trans, root, path, &disk_key, 1);
1948 }
1949
1950 /*
1951  * try to push data from one node into the next node left in the
1952  * tree.
1953  *
1954  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1955  * error, and > 0 if there was no room in the left hand block.
1956  */
1957 static int push_node_left(struct btrfs_trans_handle *trans,
1958                           struct btrfs_root *root, struct extent_buffer *dst,
1959                           struct extent_buffer *src, int empty)
1960 {
1961         int push_items = 0;
1962         int src_nritems;
1963         int dst_nritems;
1964         int ret = 0;
1965
1966         src_nritems = btrfs_header_nritems(src);
1967         dst_nritems = btrfs_header_nritems(dst);
1968         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1969         WARN_ON(btrfs_header_generation(src) != trans->transid);
1970         WARN_ON(btrfs_header_generation(dst) != trans->transid);
1971
1972         if (!empty && src_nritems <= 8)
1973                 return 1;
1974
1975         if (push_items <= 0)
1976                 return 1;
1977
1978         if (empty) {
1979                 push_items = min(src_nritems, push_items);
1980                 if (push_items < src_nritems) {
1981                         /* leave at least 8 pointers in the node if
1982                          * we aren't going to empty it
1983                          */
1984                         if (src_nritems - push_items < 8) {
1985                                 if (push_items <= 8)
1986                                         return 1;
1987                                 push_items -= 8;
1988                         }
1989                 }
1990         } else
1991                 push_items = min(src_nritems - 8, push_items);
1992
1993         copy_extent_buffer(dst, src,
1994                            btrfs_node_key_ptr_offset(dst_nritems),
1995                            btrfs_node_key_ptr_offset(0),
1996                            push_items * sizeof(struct btrfs_key_ptr));
1997
1998         if (push_items < src_nritems) {
1999                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2000                                       btrfs_node_key_ptr_offset(push_items),
2001                                       (src_nritems - push_items) *
2002                                       sizeof(struct btrfs_key_ptr));
2003         }
2004         btrfs_set_header_nritems(src, src_nritems - push_items);
2005         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2006         btrfs_mark_buffer_dirty(src);
2007         btrfs_mark_buffer_dirty(dst);
2008
2009         return ret;
2010 }
2011
2012 /*
2013  * try to push data from one node into the next node right in the
2014  * tree.
2015  *
2016  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2017  * error, and > 0 if there was no room in the right hand block.
2018  *
2019  * this will  only push up to 1/2 the contents of the left node over
2020  */
2021 static int balance_node_right(struct btrfs_trans_handle *trans,
2022                               struct btrfs_root *root,
2023                               struct extent_buffer *dst,
2024                               struct extent_buffer *src)
2025 {
2026         int push_items = 0;
2027         int max_push;
2028         int src_nritems;
2029         int dst_nritems;
2030         int ret = 0;
2031
2032         WARN_ON(btrfs_header_generation(src) != trans->transid);
2033         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2034
2035         src_nritems = btrfs_header_nritems(src);
2036         dst_nritems = btrfs_header_nritems(dst);
2037         push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
2038         if (push_items <= 0)
2039                 return 1;
2040
2041         if (src_nritems < 4)
2042                 return 1;
2043
2044         max_push = src_nritems / 2 + 1;
2045         /* don't try to empty the node */
2046         if (max_push >= src_nritems)
2047                 return 1;
2048
2049         if (max_push < push_items)
2050                 push_items = max_push;
2051
2052         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2053                                       btrfs_node_key_ptr_offset(0),
2054                                       (dst_nritems) *
2055                                       sizeof(struct btrfs_key_ptr));
2056
2057         copy_extent_buffer(dst, src,
2058                            btrfs_node_key_ptr_offset(0),
2059                            btrfs_node_key_ptr_offset(src_nritems - push_items),
2060                            push_items * sizeof(struct btrfs_key_ptr));
2061
2062         btrfs_set_header_nritems(src, src_nritems - push_items);
2063         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2064
2065         btrfs_mark_buffer_dirty(src);
2066         btrfs_mark_buffer_dirty(dst);
2067
2068         return ret;
2069 }
2070
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	/* the new level must be empty and sit directly above the root */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, &lower_key,
				   level, root->node->start, 0, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new root block */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

	/* slot 0 of the new root points at the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* publish the new root; readers may see old or new, both valid */
	old = root->node;
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
2143
2144 /*
2145  * worker function to insert a single pointer in a node.
2146  * the node should have enough room for the pointer already
2147  *
2148  * slot and level indicate where you want the key to go, and
2149  * blocknr is the block the key points to.
2150  */
2151 static void insert_ptr(struct btrfs_trans_handle *trans,
2152                        struct btrfs_root *root, struct btrfs_path *path,
2153                        struct btrfs_disk_key *key, u64 bytenr,
2154                        int slot, int level)
2155 {
2156         struct extent_buffer *lower;
2157         int nritems;
2158
2159         BUG_ON(!path->nodes[level]);
2160         btrfs_assert_tree_locked(path->nodes[level]);
2161         lower = path->nodes[level];
2162         nritems = btrfs_header_nritems(lower);
2163         BUG_ON(slot > nritems);
2164         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
2165         if (slot != nritems) {
2166                 memmove_extent_buffer(lower,
2167                               btrfs_node_key_ptr_offset(slot + 1),
2168                               btrfs_node_key_ptr_offset(slot),
2169                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2170         }
2171         btrfs_set_node_key(lower, key, slot);
2172         btrfs_set_node_blockptr(lower, slot, bytenr);
2173         WARN_ON(trans->transid == 0);
2174         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2175         btrfs_set_header_nritems(lower, nritems + 1);
2176         btrfs_mark_buffer_dirty(lower);
2177 }
2178
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/* trying to split the root, lets make a new one */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try to avoid the split by pushing into siblings first */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* the upper half of c, starting at mid, moves to the new block */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
					root->root_key.objectid,
					&disk_key, level, c->start, 0, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new sibling block */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);


	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new block into the parent, right after c */
	insert_ptr(trans, root, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* point the path at whichever half now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
2268
2269 /*
2270  * how many bytes are required to store the items in a leaf.  start
2271  * and nr indicate which items in the leaf to check.  This totals up the
2272  * space used both by the item structs and the item data
2273  */
2274 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2275 {
2276         int data_len;
2277         int nritems = btrfs_header_nritems(l);
2278         int end = min(nritems, start + nr) - 1;
2279
2280         if (!nr)
2281                 return 0;
2282         data_len = btrfs_item_end_nr(l, start);
2283         data_len = data_len - btrfs_item_offset_nr(l, end);
2284         data_len += sizeof(struct btrfs_item) * nr;
2285         WARN_ON(data_len < 0);
2286         return data_len;
2287 }
2288
2289 /*
2290  * The space between the end of the leaf items and
2291  * the start of the leaf data.  IOW, how much room
2292  * the leaf has left for both items and data
2293  */
2294 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2295                                    struct extent_buffer *leaf)
2296 {
2297         int nritems = btrfs_header_nritems(leaf);
2298         int ret;
2299         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2300         if (ret < 0) {
2301                 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2302                        "used %d nritems %d\n",
2303                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2304                        leaf_space_used(leaf, 0, nritems), nritems);
2305         }
2306         return ret;
2307 }
2308
2309 /*
2310  * min slot controls the lowest index we're willing to push to the
2311  * right.  We'll push up to and including min_slot, but no lower
2312  */
2313 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2314                                       struct btrfs_root *root,
2315                                       struct btrfs_path *path,
2316                                       int data_size, int empty,
2317                                       struct extent_buffer *right,
2318                                       int free_space, u32 left_nritems,
2319                                       u32 min_slot)
2320 {
        /*
         * Worker for push_leaf_right().  The caller holds tree locks on
         * 'right' and on the parent (path->nodes[1]).  Returns 0 when
         * items were moved (the path is fixed up to stay on the correct
         * leaf, and the leaf the path no longer points to is released),
         * or 1 when nothing could be pushed ('right' is unlocked and
         * released before returning).
         */
2321         struct extent_buffer *left = path->nodes[0];
2322         struct extent_buffer *upper = path->nodes[1];
2323         struct btrfs_disk_key disk_key;
2324         int slot;
2325         u32 i;
2326         int push_space = 0;
2327         int push_items = 0;
2328         struct btrfs_item *item;
2329         u32 nr;
2330         u32 right_nritems;
2331         u32 data_end;
2332         u32 this_item_size;
2333
2334         if (empty)
2335                 nr = 0;
2336         else
2337                 nr = max_t(u32, 1, min_slot);
2338
        /* if our slot would end up past the left leaf, reserve room in
         * 'right' for the pending insert of data_size bytes */
2339         if (path->slots[0] >= left_nritems)
2340                 push_space += data_size;
2341
2342         slot = path->slots[1];
        /* walk from the rightmost item of 'left' down to 'nr', counting
         * how many items (and how many bytes) will fit in 'right' */
2343         i = left_nritems - 1;
2344         while (i >= nr) {
2345                 item = btrfs_item_nr(left, i);
2346
2347                 if (!empty && push_items > 0) {
2348                         if (path->slots[0] > i)
2349                                 break;
2350                         if (path->slots[0] == i) {
2351                                 int space = btrfs_leaf_free_space(root, left);
2352                                 if (space + push_space * 2 > free_space)
2353                                         break;
2354                         }
2355                 }
2356
2357                 if (path->slots[0] == i)
2358                         push_space += data_size;
2359
2360                 this_item_size = btrfs_item_size(left, item);
2361                 if (this_item_size + sizeof(*item) + push_space > free_space)
2362                         break;
2363
2364                 push_items++;
2365                 push_space += this_item_size + sizeof(*item);
2366                 if (i == 0)
2367                         break;
2368                 i--;
2369         }
2370
2371         if (push_items == 0)
2372                 goto out_unlock;
2373
2374         if (!empty && push_items == left_nritems)
2375                 WARN_ON(1);
2376
2377         /* push left to right */
2378         right_nritems = btrfs_header_nritems(right);
2379
        /* number of data bytes being moved out of the left leaf */
2380         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2381         push_space -= leaf_data_end(root, left);
2382
2383         /* make room in the right data area */
2384         data_end = leaf_data_end(root, right);
2385         memmove_extent_buffer(right,
2386                               btrfs_leaf_data(right) + data_end - push_space,
2387                               btrfs_leaf_data(right) + data_end,
2388                               BTRFS_LEAF_DATA_SIZE(root) - data_end);
2389
2390         /* copy from the left data area */
2391         copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2392                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
2393                      btrfs_leaf_data(left) + leaf_data_end(root, left),
2394                      push_space);
2395
        /* shift the existing item headers of 'right' to make room at slot 0 */
2396         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2397                               btrfs_item_nr_offset(0),
2398                               right_nritems * sizeof(struct btrfs_item));
2399
2400         /* copy the items from left to right */
2401         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2402                    btrfs_item_nr_offset(left_nritems - push_items),
2403                    push_items * sizeof(struct btrfs_item));
2404
2405         /* update the item pointers */
2406         right_nritems += push_items;
2407         btrfs_set_header_nritems(right, right_nritems);
2408         push_space = BTRFS_LEAF_DATA_SIZE(root);
2409         for (i = 0; i < right_nritems; i++) {
2410                 item = btrfs_item_nr(right, i);
2411                 push_space -= btrfs_item_size(right, item);
2412                 btrfs_set_item_offset(right, item, push_space);
2413         }
2414
2415         left_nritems -= push_items;
2416         btrfs_set_header_nritems(left, left_nritems);
2417
2418         if (left_nritems)
2419                 btrfs_mark_buffer_dirty(left);
2420         else
2421                 clean_tree_block(trans, root, left);
2422
2423         btrfs_mark_buffer_dirty(right);
2424
        /* the first key of 'right' changed; update it in the parent */
2425         btrfs_item_key(right, &disk_key, 0);
2426         btrfs_set_node_key(upper, &disk_key, slot + 1);
2427         btrfs_mark_buffer_dirty(upper);
2428
2429         /* then fixup the leaf pointer in the path */
2430         if (path->slots[0] >= left_nritems) {
2431                 path->slots[0] -= left_nritems;
2432                 if (btrfs_header_nritems(path->nodes[0]) == 0)
2433                         clean_tree_block(trans, root, path->nodes[0]);
2434                 btrfs_tree_unlock(path->nodes[0]);
2435                 free_extent_buffer(path->nodes[0]);
2436                 path->nodes[0] = right;
2437                 path->slots[1] += 1;
2438         } else {
2439                 btrfs_tree_unlock(right);
2440                 free_extent_buffer(right);
2441         }
2442         return 0;
2443
2444 out_unlock:
2445         btrfs_tree_unlock(right);
2446         free_extent_buffer(right);
2447         return 1;
2448 }
2449
2450 /*
2451  * push some data in the path leaf to the right, trying to free up at
2452  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2453  *
2454  * returns 1 if the push failed because the other node didn't have enough
2455  * room, 0 if everything worked out and < 0 if there were major errors.
2456  *
2457  * this will push starting from min_slot to the end of the leaf.  It won't
2458  * push any slot lower than min_slot
2459  */
2460 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2461                            *root, struct btrfs_path *path,
2462                            int min_data_size, int data_size,
2463                            int empty, u32 min_slot)
2464 {
2465         struct extent_buffer *left = path->nodes[0];
2466         struct extent_buffer *right;
2467         struct extent_buffer *upper;
2468         int slot;
2469         int free_space;
2470         u32 left_nritems;
2471         int ret;
2472
2473         if (!path->nodes[1])
2474                 return 1;
2475
2476         slot = path->slots[1];
2477         upper = path->nodes[1];
2478         if (slot >= btrfs_header_nritems(upper) - 1)
2479                 return 1;
2480
2481         btrfs_assert_tree_locked(path->nodes[1]);
2482
2483         right = read_node_slot(root, upper, slot + 1);
2484         if (right == NULL)
2485                 return 1;
2486
2487         btrfs_tree_lock(right);
2488         btrfs_set_lock_blocking(right);
2489
2490         free_space = btrfs_leaf_free_space(root, right);
2491         if (free_space < data_size)
2492                 goto out_unlock;
2493
2494         /* cow and double check */
2495         ret = btrfs_cow_block(trans, root, right, upper,
2496                               slot + 1, &right);
2497         if (ret)
2498                 goto out_unlock;
2499
2500         free_space = btrfs_leaf_free_space(root, right);
2501         if (free_space < data_size)
2502                 goto out_unlock;
2503
2504         left_nritems = btrfs_header_nritems(left);
2505         if (left_nritems == 0)
2506                 goto out_unlock;
2507
2508         return __push_leaf_right(trans, root, path, min_data_size, empty,
2509                                 right, free_space, left_nritems, min_slot);
2510 out_unlock:
2511         btrfs_tree_unlock(right);
2512         free_extent_buffer(right);
2513         return 1;
2514 }
2515
2516 /*
2517  * push some data in the path leaf to the left, trying to free up at
2518  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2519  *
2520  * max_slot can put a limit on how far into the leaf we'll push items.  The
2521  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2522  * items
2523  */
2524 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2525                                      struct btrfs_root *root,
2526                                      struct btrfs_path *path, int data_size,
2527                                      int empty, struct extent_buffer *left,
2528                                      int free_space, u32 right_nritems,
2529                                      u32 max_slot)
2530 {
        /*
         * Worker for push_leaf_left().  The caller holds a tree lock on
         * 'left'.  Returns 0 when items were moved (the path is fixed up
         * to stay on the correct leaf and the leaf it no longer points
         * to is released), or 1 when nothing could be pushed ('left' is
         * unlocked and released via the 'out' label).
         */
2531         struct btrfs_disk_key disk_key;
2532         struct extent_buffer *right = path->nodes[0];
2533         int i;
2534         int push_space = 0;
2535         int push_items = 0;
2536         struct btrfs_item *item;
2537         u32 old_left_nritems;
2538         u32 nr;
2539         int ret = 0;
2540         u32 this_item_size;
2541         u32 old_left_item_size;
2542
2543         if (empty)
2544                 nr = min(right_nritems, max_slot);
2545         else
2546                 nr = min(right_nritems - 1, max_slot);
2547
        /* count how many of the first 'nr' items of 'right' will fit
         * into 'left', stopping at our slot unless there's still room
         * for the pending insert */
2548         for (i = 0; i < nr; i++) {
2549                 item = btrfs_item_nr(right, i);
2550
2551                 if (!empty && push_items > 0) {
2552                         if (path->slots[0] < i)
2553                                 break;
2554                         if (path->slots[0] == i) {
2555                                 int space = btrfs_leaf_free_space(root, right);
2556                                 if (space + push_space * 2 > free_space)
2557                                         break;
2558                         }
2559                 }
2560
2561                 if (path->slots[0] == i)
2562                         push_space += data_size;
2563
2564                 this_item_size = btrfs_item_size(right, item);
2565                 if (this_item_size + sizeof(*item) + push_space > free_space)
2566                         break;
2567
2568                 push_items++;
2569                 push_space += this_item_size + sizeof(*item);
2570         }
2571
2572         if (push_items == 0) {
2573                 ret = 1;
2574                 goto out;
2575         }
2576         if (!empty && push_items == btrfs_header_nritems(right))
2577                 WARN_ON(1);
2578
2579         /* push data from right to left */
2580         copy_extent_buffer(left, right,
2581                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
2582                            btrfs_item_nr_offset(0),
2583                            push_items * sizeof(struct btrfs_item));
2584
2585         push_space = BTRFS_LEAF_DATA_SIZE(root) -
2586                      btrfs_item_offset_nr(right, push_items - 1);
2587
2588         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2589                      leaf_data_end(root, left) - push_space,
2590                      btrfs_leaf_data(right) +
2591                      btrfs_item_offset_nr(right, push_items - 1),
2592                      push_space);
2593         old_left_nritems = btrfs_header_nritems(left);
2594         BUG_ON(old_left_nritems <= 0);
2595
        /* translate the copied items' data offsets from right-leaf
         * coordinates into left-leaf coordinates */
2596         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2597         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2598                 u32 ioff;
2599
2600                 item = btrfs_item_nr(left, i);
2601
2602                 ioff = btrfs_item_offset(left, item);
2603                 btrfs_set_item_offset(left, item,
2604                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2605         }
2606         btrfs_set_header_nritems(left, old_left_nritems + push_items);
2607
2608         /* fixup right node */
2609         if (push_items > right_nritems) {
2610                 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2611                        right_nritems);
2612                 WARN_ON(1);
2613         }
2614
        /* close the gap left by the moved items in 'right' */
2615         if (push_items < right_nritems) {
2616                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2617                                                   leaf_data_end(root, right);
2618                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2619                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
2620                                       btrfs_leaf_data(right) +
2621                                       leaf_data_end(root, right), push_space);
2622
2623                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2624                               btrfs_item_nr_offset(push_items),
2625                              (btrfs_header_nritems(right) - push_items) *
2626                              sizeof(struct btrfs_item));
2627         }
2628         right_nritems -= push_items;
2629         btrfs_set_header_nritems(right, right_nritems);
2630         push_space = BTRFS_LEAF_DATA_SIZE(root);
2631         for (i = 0; i < right_nritems; i++) {
2632                 item = btrfs_item_nr(right, i);
2633
2634                 push_space = push_space - btrfs_item_size(right, item);
2635                 btrfs_set_item_offset(right, item, push_space);
2636         }
2637
2638         btrfs_mark_buffer_dirty(left);
2639         if (right_nritems)
2640                 btrfs_mark_buffer_dirty(right);
2641         else
2642                 clean_tree_block(trans, root, right);
2643
        /* the first key of 'right' changed; propagate it up the path */
2644         btrfs_item_key(right, &disk_key, 0);
2645         fixup_low_keys(trans, root, path, &disk_key, 1);
2646
2647         /* then fixup the leaf pointer in the path */
2648         if (path->slots[0] < push_items) {
2649                 path->slots[0] += old_left_nritems;
2650                 btrfs_tree_unlock(path->nodes[0]);
2651                 free_extent_buffer(path->nodes[0]);
2652                 path->nodes[0] = left;
2653                 path->slots[1] -= 1;
2654         } else {
2655                 btrfs_tree_unlock(left);
2656                 free_extent_buffer(left);
2657                 path->slots[0] -= push_items;
2658         }
2659         BUG_ON(path->slots[0] < 0);
2660         return ret;
2661 out:
2662         btrfs_tree_unlock(left);
2663         free_extent_buffer(left);
2664         return ret;
2665 }
2666
2667 /*
2668  * push some data in the path leaf to the left, trying to free up at
2669  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2670  *
2671  * max_slot can put a limit on how far into the leaf we'll push items.  The
2672  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
2673  * items
2674  */
2675 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2676                           *root, struct btrfs_path *path, int min_data_size,
2677                           int data_size, int empty, u32 max_slot)
2678 {
        /*
         * Locate and lock the left sibling leaf, COW it, verify it has
         * room for data_size bytes and hand off to __push_leaf_left().
         * Returns 0 on a successful push, 1 when no push is possible,
         * or a negative errno on real failure.
         */
2679         struct extent_buffer *right = path->nodes[0];
2680         struct extent_buffer *left;
2681         int slot;
2682         int free_space;
2683         u32 right_nritems;
2684         int ret = 0;
2685
        /* slot 0 in the parent means there is no left sibling */
2686         slot = path->slots[1];
2687         if (slot == 0)
2688                 return 1;
2689         if (!path->nodes[1])
2690                 return 1;
2691
2692         right_nritems = btrfs_header_nritems(right);
2693         if (right_nritems == 0)
2694                 return 1;
2695
2696         btrfs_assert_tree_locked(path->nodes[1]);
2697
2698         left = read_node_slot(root, path->nodes[1], slot - 1);
2699         if (left == NULL)
2700                 return 1;
2701
2702         btrfs_tree_lock(left);
2703         btrfs_set_lock_blocking(left);
2704
2705         free_space = btrfs_leaf_free_space(root, left);
2706         if (free_space < data_size) {
2707                 ret = 1;
2708                 goto out;
2709         }
2710
2711         /* cow and double check */
2712         ret = btrfs_cow_block(trans, root, left,
2713                               path->nodes[1], slot - 1, &left);
2714         if (ret) {
2715                 /* we hit -ENOSPC, but it isn't fatal here */
2716                 if (ret == -ENOSPC)
2717                         ret = 1;
2718                 goto out;
2719         }
2720
        /* the COW may have been merged/rebalanced; re-check the space */
2721         free_space = btrfs_leaf_free_space(root, left);
2722         if (free_space < data_size) {
2723                 ret = 1;
2724                 goto out;
2725         }
2726
2727         return __push_leaf_left(trans, root, path, min_data_size,
2728                                empty, left, free_space, right_nritems,
2729                                max_slot);
2730 out:
2731         btrfs_tree_unlock(left);
2732         free_extent_buffer(left);
2733         return ret;
2734 }
2735
2736 /*
2737  * split the path's leaf in two, making sure there is at least data_size
2738  * available for the resulting leaf level of the path.
2739  */
2740 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
2741                                     struct btrfs_root *root,
2742                                     struct btrfs_path *path,
2743                                     struct extent_buffer *l,
2744                                     struct extent_buffer *right,
2745                                     int slot, int mid, int nritems)
2746 {
        /*
         * Move items [mid, nritems) of leaf 'l' into the freshly
         * allocated leaf 'right', insert a pointer to 'right' in the
         * parent, and fix up the path so it keeps pointing at the same
         * item (following it into 'right' if necessary).  Both leaves
         * are locked by the caller; the one the path leaves behind is
         * unlocked and released here.
         */
2747         int data_copy_size;
2748         int rt_data_off;
2749         int i;
2750         struct btrfs_disk_key disk_key;
2751
2752         nritems = nritems - mid;
2753         btrfs_set_header_nritems(right, nritems);
2754         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2755
        /* copy the item headers ... */
2756         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2757                            btrfs_item_nr_offset(mid),
2758                            nritems * sizeof(struct btrfs_item));
2759
        /* ... and the item data, packed against the end of 'right' */
2760         copy_extent_buffer(right, l,
2761                      btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2762                      data_copy_size, btrfs_leaf_data(l) +
2763                      leaf_data_end(root, l), data_copy_size);
2764
2765         rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2766                       btrfs_item_end_nr(l, mid);
2767
        /* translate the copied data offsets into right-leaf coordinates */
2768         for (i = 0; i < nritems; i++) {
2769                 struct btrfs_item *item = btrfs_item_nr(right, i);
2770                 u32 ioff;
2771
2772                 ioff = btrfs_item_offset(right, item);
2773                 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2774         }
2775
2776         btrfs_set_header_nritems(l, mid);
2777         btrfs_item_key(right, &disk_key, 0);
2778         insert_ptr(trans, root, path, &disk_key, right->start,
2779                    path->slots[1] + 1, 1);
2780
2781         btrfs_mark_buffer_dirty(right);
2782         btrfs_mark_buffer_dirty(l);
2783         BUG_ON(path->slots[0] != slot);
2784
        /* if our slot moved into the new leaf, follow it in the path */
2785         if (mid <= slot) {
2786                 btrfs_tree_unlock(path->nodes[0]);
2787                 free_extent_buffer(path->nodes[0]);
2788                 path->nodes[0] = right;
2789                 path->slots[0] -= mid;
2790                 path->slots[1] += 1;
2791         } else {
2792                 btrfs_tree_unlock(right);
2793                 free_extent_buffer(right);
2794         }
2795
2796         BUG_ON(path->slots[0] < 0);
2797 }
2798
2799 /*
2800  * double splits happen when we need to insert a big item in the middle
2801  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
2802  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2803  *          A                 B                 C
2804  *
2805  * We avoid this by trying to push the items on either side of our target
2806  * into the adjacent leaves.  If all goes well we can avoid the double split
2807  * completely.
2808  */
2809 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2810                                           struct btrfs_root *root,
2811                                           struct btrfs_path *path,
2812                                           int data_size)
2813 {
2814         int ret;
2815         int progress = 0;
2816         int slot;
2817         u32 nritems;
2818
2819         slot = path->slots[0];
2820
2821         /*
2822          * try to push all the items after our slot into the
2823          * right leaf
2824          */
2825         ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2826         if (ret < 0)
2827                 return ret;
2828
2829         if (ret == 0)
2830                 progress++;
2831
2832         nritems = btrfs_header_nritems(path->nodes[0]);
2833         /*
2834          * our goal is to get our slot at the start or end of a leaf.  If
2835          * we've done so we're done
2836          */
2837         if (path->slots[0] == 0 || path->slots[0] == nritems)
2838                 return 0;
2839
2840         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2841                 return 0;
2842
2843         /* try to push all the items before our slot into the next leaf */
2844         slot = path->slots[0];
2845         ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2846         if (ret < 0)
2847                 return ret;
2848
2849         if (ret == 0)
2850                 progress++;
2851
2852         if (progress)
2853                 return 0;
2854         return 1;
2855 }
2856
2857 /*
2858  * split the path's leaf in two, making sure there is at least data_size
2859  * available for the resulting leaf level of the path.
2860  *
2861  * returns 0 if all went well and < 0 on failure.
2862  */
2863 static noinline int split_leaf(struct btrfs_trans_handle *trans,
2864                                struct btrfs_root *root,
2865                                struct btrfs_key *ins_key,
2866                                struct btrfs_path *path, int data_size,
2867                                int extend)
2868 {
        /*
         * 'split' selects the strategy below: 1 = regular split at
         * 'mid', 0 = give the new item an empty leaf of its own,
         * 2 = a second split will be needed after this one (bounded
         * to one retry by num_doubles).
         */
2869         struct btrfs_disk_key disk_key;
2870         struct extent_buffer *l;
2871         u32 nritems;
2872         int mid;
2873         int slot;
2874         struct extent_buffer *right;
2875         int ret = 0;
2876         int wret;
2877         int split;
2878         int num_doubles = 0;
2879         int tried_avoid_double = 0;
2880
2881         l = path->nodes[0];
2882         slot = path->slots[0];
        /* an extended item can never exceed one leaf's data area */
2883         if (extend && data_size + btrfs_item_size_nr(l, slot) +
2884             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
2885                 return -EOVERFLOW;
2886
2887         /* first try to make some room by pushing left and right */
2888         if (data_size) {
2889                 wret = push_leaf_right(trans, root, path, data_size,
2890                                        data_size, 0, 0);
2891                 if (wret < 0)
2892                         return wret;
2893                 if (wret) {
2894                         wret = push_leaf_left(trans, root, path, data_size,
2895                                               data_size, 0, (u32)-1);
2896                         if (wret < 0)
2897                                 return wret;
2898                 }
2899                 l = path->nodes[0];
2900
2901                 /* did the pushes work? */
2902                 if (btrfs_leaf_free_space(root, l) >= data_size)
2903                         return 0;
2904         }
2905
        /* splitting the root leaf: grow the tree by one level first */
2906         if (!path->nodes[1]) {
2907                 ret = insert_new_root(trans, root, path, 1);
2908                 if (ret)
2909                         return ret;
2910         }
2911 again:
2912         split = 1;
2913         l = path->nodes[0];
2914         slot = path->slots[0];
2915         nritems = btrfs_header_nritems(l);
2916         mid = (nritems + 1) / 2;
2917
        /* pick the split point so the half our insert lands in has room */
2918         if (mid <= slot) {
2919                 if (nritems == 1 ||
2920                     leaf_space_used(l, mid, nritems - mid) + data_size >
2921                         BTRFS_LEAF_DATA_SIZE(root)) {
2922                         if (slot >= nritems) {
2923                                 split = 0;
2924                         } else {
2925                                 mid = slot;
2926                                 if (mid != nritems &&
2927                                     leaf_space_used(l, mid, nritems - mid) +
2928                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2929                                         if (data_size && !tried_avoid_double)
2930                                                 goto push_for_double;
2931                                         split = 2;
2932                                 }
2933                         }
2934                 }
2935         } else {
2936                 if (leaf_space_used(l, 0, mid) + data_size >
2937                         BTRFS_LEAF_DATA_SIZE(root)) {
2938                         if (!extend && data_size && slot == 0) {
2939                                 split = 0;
2940                         } else if ((extend || !data_size) && slot == 0) {
2941                                 mid = 1;
2942                         } else {
2943                                 mid = slot;
2944                                 if (mid != nritems &&
2945                                     leaf_space_used(l, mid, nritems - mid) +
2946                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2947                                         if (data_size && !tried_avoid_double)
2948                                                 goto push_for_double;
2949                                         split = 2 ;
2950                                 }
2951                         }
2952                 }
2953         }
2954
2955         if (split == 0)
2956                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2957         else
2958                 btrfs_item_key(l, &disk_key, mid);
2959
2960         right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2961                                         root->root_key.objectid,
2962                                         &disk_key, 0, l->start, 0, 0);
2963         if (IS_ERR(right))
2964                 return PTR_ERR(right);
2965
2966         root_add_used(root, root->leafsize);
2967
        /* initialize the new (empty) right leaf's header */
2968         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2969         btrfs_set_header_bytenr(right, right->start);
2970         btrfs_set_header_generation(right, trans->transid);
2971         btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
2972         btrfs_set_header_owner(right, root->root_key.objectid);
2973         btrfs_set_header_level(right, 0);
2974         write_extent_buffer(right, root->fs_info->fsid,
2975                             (unsigned long)btrfs_header_fsid(right),
2976                             BTRFS_FSID_SIZE);
2977
2978         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2979                             (unsigned long)btrfs_header_chunk_tree_uuid(right),
2980                             BTRFS_UUID_SIZE);
2981
        /* split == 0: link the empty leaf in and point the path at it;
         * no items move at all */
2982         if (split == 0) {
2983                 if (mid <= slot) {
2984                         btrfs_set_header_nritems(right, 0);
2985                         insert_ptr(trans, root, path, &disk_key, right->start,
2986                                    path->slots[1] + 1, 1);
2987                         btrfs_tree_unlock(path->nodes[0]);
2988                         free_extent_buffer(path->nodes[0]);
2989                         path->nodes[0] = right;
2990                         path->slots[0] = 0;
2991                         path->slots[1] += 1;
2992                 } else {
2993                         btrfs_set_header_nritems(right, 0);
2994                         insert_ptr(trans, root, path, &disk_key, right->start,
2995                                           path->slots[1], 1);
2996                         btrfs_tree_unlock(path->nodes[0]);
2997                         free_extent_buffer(path->nodes[0]);
2998                         path->nodes[0] = right;
2999                         path->slots[0] = 0;
3000                         if (path->slots[1] == 0)
3001                                 fixup_low_keys(trans, root, path,
3002                                                &disk_key, 1);
3003                 }
3004                 btrfs_mark_buffer_dirty(right);
3005                 return ret;
3006         }
3007
3008         copy_for_split(trans, root, path, l, right, slot, mid, nritems);
3009
3010         if (split == 2) {
3011                 BUG_ON(num_doubles != 0);
3012                 num_doubles++;
3013                 goto again;
3014         }
3015
3016         return 0;
3017
3018 push_for_double:
        /*
         * NOTE(review): the return value of push_for_double_split() is
         * ignored here — presumably a failed push is tolerable because
         * we fall back to the double split via 'again', but a negative
         * errno from it is also lost; verify this is intentional.
         */
3019         push_for_double_split(trans, root, path, data_size);
3020         tried_avoid_double = 1;
3021         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3022                 return 0;
3023         goto again;
3024 }
3025
3026 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3027                                          struct btrfs_root *root,
3028                                          struct btrfs_path *path, int ins_len)
3029 {
3030         struct btrfs_key key;
3031         struct extent_buffer *leaf;
3032         struct btrfs_file_extent_item *fi;
3033         u64 extent_len = 0;
3034         u32 item_size;
3035         int ret;
3036
3037         leaf = path->nodes[0];
3038         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3039
3040         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3041                key.type != BTRFS_EXTENT_CSUM_KEY);
3042
3043         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
3044                 return 0;
3045
3046         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3047         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3048                 fi = btrfs_item_ptr(leaf, path->slots[0],
3049                                     struct btrfs_file_extent_item);
3050                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3051         }
3052         btrfs_release_path(path);
3053
3054         path->keep_locks = 1;
3055         path->search_for_split = 1;
3056         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3057         path->search_for_split = 0;
3058         if (ret < 0)
3059                 goto err;
3060
3061         ret = -EAGAIN;
3062         leaf = path->nodes[0];
3063         /* if our item isn't there or got smaller, return now */
3064         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3065                 goto err;
3066
3067         /* the leaf has  changed, it now has room.  return now */
3068         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3069                 goto err;
3070
3071         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3072                 fi = btrfs_item_ptr(leaf, path->slots[0],
3073                                     struct btrfs_file_extent_item);
3074                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3075                         goto err;
3076         }
3077
3078         btrfs_set_path_blocking(path);
3079         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3080         if (ret)
3081                 goto err;
3082
3083         path->keep_locks = 0;
3084         btrfs_unlock_up_safe(path, 1);
3085         return 0;
3086 err:
3087         path->keep_locks = 0;
3088         return ret;
3089 }
3090
/*
 * split_item splits the item at path->slots[0] in two at byte offset
 * split_offset, giving 'new_key' to the second half.  The caller must
 * already have guaranteed room in the leaf for one extra struct
 * btrfs_item (see setup_leaf_for_split); the BUG_ON below asserts that.
 *
 * Returns 0 on success or -ENOMEM if the temporary copy of the item
 * data cannot be allocated.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* caller (setup_leaf_for_split) guarantees room for one more item */
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(leaf, path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data so both halves can be rewritten */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new (second-half) item goes directly after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(leaf, slot);

	/* second half keeps the original data offset and takes the tail */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* first half shrinks to split_offset bytes and moves up so the
	 * two halves stay contiguous in the leaf data area */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
3161
3162 /*
3163  * This function splits a single item into two items,
3164  * giving 'new_key' to the new item and splitting the
3165  * old one at split_offset (from the start of the item).
3166  *
3167  * The path may be released by this operation.  After
3168  * the split, the path is pointing to the old item.  The
3169  * new item is going to be in the same node as the old one.
3170  *
3171  * Note, the item being split must be smaller enough to live alone on
3172  * a tree block with room for one extra struct btrfs_item
3173  *
3174  * This allows us to split the item in place, keeping a lock on the
3175  * leaf the entire time.
3176  */
3177 int btrfs_split_item(struct btrfs_trans_handle *trans,
3178                      struct btrfs_root *root,
3179                      struct btrfs_path *path,
3180                      struct btrfs_key *new_key,
3181                      unsigned long split_offset)
3182 {
3183         int ret;
3184         ret = setup_leaf_for_split(trans, root, path,
3185                                    sizeof(struct btrfs_item));
3186         if (ret)
3187                 return ret;
3188
3189         ret = split_item(trans, root, path, new_key, split_offset);
3190         return ret;
3191 }
3192
3193 /*
3194  * This function duplicate a item, giving 'new_key' to the new item.
3195  * It guarantees both items live in the same tree leaf and the new item
3196  * is contiguous with the original item.
3197  *
3198  * This allows us to split file extent in place, keeping a lock on the
3199  * leaf the entire time.
3200  */
3201 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3202                          struct btrfs_root *root,
3203                          struct btrfs_path *path,
3204                          struct btrfs_key *new_key)
3205 {
3206         struct extent_buffer *leaf;
3207         int ret;
3208         u32 item_size;
3209
3210         leaf = path->nodes[0];
3211         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3212         ret = setup_leaf_for_split(trans, root, path,
3213                                    item_size + sizeof(struct btrfs_item));
3214         if (ret)
3215                 return ret;
3216
3217         path->slots[0]++;
3218         setup_items_for_insert(trans, root, path, new_key, &item_size,
3219                                item_size, item_size +
3220                                sizeof(struct btrfs_item), 1);
3221         leaf = path->nodes[0];
3222         memcpy_extent_buffer(leaf,
3223                              btrfs_item_ptr_offset(leaf, path->slots[0]),
3224                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3225                              item_size);
3226         return 0;
3227 }
3228
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* already the requested size; nothing to do */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	/* data grows toward the front of the leaf, so shrinking an item
	 * moves this item's data and everything after it up (offsets
	 * increase by size_diff) */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(leaf, i);

		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		/* chopping the tail: move everything below the kept
		 * new_size bytes up by size_diff */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		/* chopping the front of an inline file extent: the fixed
		 * header fields (up to disk_bytenr) must be preserved, so
		 * copy them to where the item will start */
		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      offsetof(struct btrfs_file_extent_item,
						 disk_bytenr));
			}
		}

		/* move the whole item body up; the first size_diff bytes
		 * of the old data are discarded */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		/* chopping the front changes the logical start, so bump
		 * the key offset to match and propagate if it was slot 0 */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity check: the leaf must still be internally consistent */
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
3329
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 */
void btrfs_extend_item(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* the caller must have reserved enough free space in this leaf */
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		printk(KERN_CRIT "slot %d too large, nritems %d\n",
		       slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	/* item data grows toward the front of the leaf, so growing this
	 * item moves it and every later item's data down by data_size */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(leaf, i);

		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff - data_size);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);

	/* the new bytes land at the (unchanged) end of this item */
	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity check: the leaf must still be internally consistent */
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
3394
3395 /*
3396  * Given a key and some data, insert items into the tree.
3397  * This does all the path init required, making room in the tree if needed.
3398  * Returns the number of keys that were inserted.
3399  */
3400 int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3401                             struct btrfs_root *root,
3402                             struct btrfs_path *path,
3403                             struct btrfs_key *cpu_key, u32 *data_size,
3404                             int nr)
3405 {
3406         struct extent_buffer *leaf;
3407         struct btrfs_item *item;
3408         int ret = 0;
3409         int slot;
3410         int i;
3411         u32 nritems;
3412         u32 total_data = 0;
3413         u32 total_size = 0;
3414         unsigned int data_end;
3415         struct btrfs_disk_key disk_key;
3416         struct btrfs_key found_key;
3417
3418         for (i = 0; i < nr; i++) {
3419                 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3420                     BTRFS_LEAF_DATA_SIZE(root)) {
3421                         break;
3422                         nr = i;
3423                 }
3424                 total_data += data_size[i];
3425                 total_size += data_size[i] + sizeof(struct btrfs_item);
3426         }
3427         BUG_ON(nr == 0);
3428
3429         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3430         if (ret == 0)
3431                 return -EEXIST;
3432         if (ret < 0)
3433                 goto out;
3434
3435         leaf = path->nodes[0];
3436
3437         nritems = btrfs_header_nritems(leaf);
3438         data_end = leaf_data_end(root, leaf);
3439
3440         if (btrfs_leaf_free_space(root, leaf) < total_size) {
3441                 for (i = nr; i >= 0; i--) {
3442                         total_data -= data_size[i];
3443                         total_size -= data_size[i] + sizeof(struct btrfs_item);
3444                         if (total_size < btrfs_leaf_free_space(root, leaf))
3445                                 break;
3446                 }
3447                 nr = i;
3448         }
3449
3450         slot = path->slots[0];
3451         BUG_ON(slot < 0);
3452
3453         if (slot != nritems) {
3454                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3455
3456                 item = btrfs_item_nr(leaf, slot);
3457                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3458
3459                 /* figure out how many keys we can insert in here */
3460                 total_data = data_size[0];
3461                 for (i = 1; i < nr; i++) {
3462                         if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3463                                 break;
3464                         total_data += data_size[i];
3465                 }
3466                 nr = i;
3467
3468                 if (old_data < data_end) {
3469                         btrfs_print_leaf(root, leaf);
3470                         printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3471                                slot, old_data, data_end);
3472                         BUG_ON(1);
3473                 }
3474                 /*
3475                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
3476                  */
3477                 /* first correct the data pointers */
3478                 for (i = slot; i < nritems; i++) {
3479                         u32 ioff;
3480
3481                         item = btrfs_item_nr(leaf, i);
3482                         ioff = btrfs_item_offset(leaf, item);
3483                         btrfs_set_item_offset(leaf, item, ioff - total_data);
3484                 }
3485                 /* shift the items */
3486                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3487                               btrfs_item_nr_offset(slot),
3488                               (nritems - slot) * sizeof(struct btrfs_item));
3489
3490                 /* shift the data */
3491                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3492                               data_end - total_data, btrfs_leaf_data(leaf) +
3493                               data_end, old_data - data_end);
3494                 data_end = old_data;
3495         } else {
3496                 /*
3497                  * this sucks but it has to be done, if we are inserting at
3498                  * the end of the leaf only insert 1 of the items, since we
3499                  * have no way of knowing whats on the next leaf and we'd have
3500                  * to drop our current locks to figure it out
3501                  */
3502                 nr = 1;
3503         }
3504
3505         /* setup the item for the new data */
3506         for (i = 0; i < nr; i++) {
3507                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3508                 btrfs_set_item_key(leaf, &disk_key, slot + i);
3509                 item = btrfs_item_nr(leaf, slot + i);
3510                 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3511                 data_end -= data_size[i];
3512                 btrfs_set_item_size(leaf, item, data_size[i]);
3513         }
3514         btrfs_set_header_nritems(leaf, nritems + nr);
3515         btrfs_mark_buffer_dirty(leaf);
3516
3517         ret = 0;
3518         if (slot == 0) {
3519                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3520                 fixup_low_keys(trans, root, path, &disk_key, 1);
3521         }
3522
3523         if (btrfs_leaf_free_space(root, leaf) < 0) {
3524                 btrfs_print_leaf(root, leaf);
3525                 BUG();
3526         }
3527 out:
3528         if (!ret)
3529                 ret = nr;
3530         return ret;
3531 }
3532
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * The caller must have already verified (via btrfs_search_slot with
 * ins_len) that the leaf at path->nodes[0] has total_size bytes free;
 * total_data is the sum of data_size[0..nr-1] and total_size adds the
 * per-item struct btrfs_item headers.
 */
void setup_items_for_insert(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* caller's space reservation must hold, otherwise the leaf is
	 * about to be corrupted */
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		btrfs_print_leaf(root, leaf);
		printk(KERN_CRIT "not enough freespace need %u have %d\n",
		       total_size, btrfs_leaf_free_space(root, leaf));
		BUG();
	}

	/* inserting in the middle: make a gap for nr items */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff - total_data);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);

	/* first key in the leaf changed; propagate it up the path */
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(trans, root, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity check: the leaf must still be internally consistent */
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
3620
3621 /*
3622  * Given a key and some data, insert items into the tree.
3623  * This does all the path init required, making room in the tree if needed.
3624  */
3625 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3626                             struct btrfs_root *root,
3627                             struct btrfs_path *path,
3628                             struct btrfs_key *cpu_key, u32 *data_size,
3629                             int nr)
3630 {
3631         int ret = 0;
3632         int slot;
3633         int i;
3634         u32 total_size = 0;
3635         u32 total_data = 0;
3636
3637         for (i = 0; i < nr; i++)
3638                 total_data += data_size[i];
3639
3640         total_size = total_data + (nr * sizeof(struct btrfs_item));
3641         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3642         if (ret == 0)
3643                 return -EEXIST;
3644         if (ret < 0)
3645                 return ret;
3646
3647         slot = path->slots[0];
3648         BUG_ON(slot < 0);
3649
3650         setup_items_for_insert(trans, root, path, cpu_key, data_size,
3651                                total_data, total_size, nr);
3652         return 0;
3653 }
3654
3655 /*
3656  * Given a key and some data, insert an item into the tree.
3657  * This does all the path init required, making room in the tree if needed.
3658  */
3659 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3660                       *root, struct btrfs_key *cpu_key, void *data, u32
3661                       data_size)
3662 {
3663         int ret = 0;
3664         struct btrfs_path *path;
3665         struct extent_buffer *leaf;
3666         unsigned long ptr;
3667
3668         path = btrfs_alloc_path();
3669         if (!path)
3670                 return -ENOMEM;
3671         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3672         if (!ret) {
3673                 leaf = path->nodes[0];
3674                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3675                 write_extent_buffer(leaf, data, ptr, data_size);
3676                 btrfs_mark_buffer_dirty(leaf);
3677         }
3678         btrfs_free_path(path);
3679         return ret;
3680 }
3681
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/* close the gap by sliding the later key pointers down */
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	}
	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* deleting slot 0 changed this node's lowest key;
		 * propagate the new one to the parents */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(trans, root, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
3716
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* the leaf should have been COWed in this transaction */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(trans, root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	/* account the freed block against the root's space usage */
	root_sub_used(root, leaf->len);

	btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 *
 * Deletes nr consecutive items starting at 'slot'.  Returns 0 on
 * success or a negative error from the push helpers.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int last_off;
	int dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes occupied by the items being deleted */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	/* unless we're deleting the tail, compact data and item arrays */
	if (slot + nr != nritems) {
		int data_end = leaf_data_end(root, leaf);

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
			      last_off - data_end);

		/* surviving items' data moved up by dsize; fix offsets */
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff + dsize);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(trans, root, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			/* first key changed; propagate it up the path */
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(trans, root, path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			/* anything left? try pushing the rest right */
			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* fully drained: drop the now-empty leaf */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
3855
3856 /*
3857  * search the tree again to find a leaf with lesser keys
3858  * returns 0 if it found something or 1 if there are no lesser leaves.
3859  * returns < 0 on io errors.
3860  *
3861  * This may release the path, and so you may lose any locks held at the
3862  * time you call it.
3863  */
3864 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3865 {
3866         struct btrfs_key key;
3867         struct btrfs_disk_key found_key;
3868         int ret;
3869
3870         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3871
3872         if (key.offset > 0)
3873                 key.offset--;
3874         else if (key.type > 0)
3875                 key.type--;
3876         else if (key.objectid > 0)
3877                 key.objectid--;
3878         else
3879                 return 1;
3880
3881         btrfs_release_path(path);
3882         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3883         if (ret < 0)
3884                 return ret;
3885         btrfs_item_key(path->nodes[0], &found_key, 0);
3886         ret = comp_keys(&found_key, &key);
3887         if (ret < 0)
3888                 return 0;
3889         return 1;
3890 }
3891
3892 /*
3893  * A helper function to walk down the tree starting at min_key, and looking
3894  * for nodes or leaves that are either in cache or have a minimum
3895  * transaction id.  This is used by the btree defrag code, and tree logging
3896  *
3897  * This does not cow, but it does stuff the starting key it finds back
3898  * into min_key, so you can call btrfs_search_slot with cow=1 on the
3899  * key and get a writable path.
3900  *
3901  * This does lock as it descends, and path->keep_locks should be set
3902  * to 1 by the caller.
3903  *
3904  * This honors path->lowest_level to prevent descent past a given level
3905  * of the tree.
3906  *
3907  * min_trans indicates the oldest transaction that you are interested
3908  * in walking through.  Any nodes or leaves older than min_trans are
3909  * skipped over (without reading them).
3910  *
3911  * returns zero if something useful was found, < 0 on error and 1 if there
3912  * was nothing in the tree that matched the search criteria.
3913  */
3914 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3915                          struct btrfs_key *max_key,
3916                          struct btrfs_path *path, int cache_only,
3917                          u64 min_trans)
3918 {
3919         struct extent_buffer *cur;
3920         struct btrfs_key found_key;
3921         int slot;
3922         int sret;
3923         u32 nritems;
3924         int level;
3925         int ret = 1;
3926
3927         WARN_ON(!path->keep_locks);
3928 again:
3929         cur = btrfs_read_lock_root_node(root);
3930         level = btrfs_header_level(cur);
3931         WARN_ON(path->nodes[level]);
3932         path->nodes[level] = cur;
3933         path->locks[level] = BTRFS_READ_LOCK;
3934
3935         if (btrfs_header_generation(cur) < min_trans) {
3936                 ret = 1;
3937                 goto out;
3938         }
3939         while (1) {
3940                 nritems = btrfs_header_nritems(cur);
3941                 level = btrfs_header_level(cur);
3942                 sret = bin_search(cur, min_key, level, &slot);
3943
3944                 /* at the lowest level, we're done, setup the path and exit */
3945                 if (level == path->lowest_level) {
3946                         if (slot >= nritems)
3947                                 goto find_next_key;
3948                         ret = 0;
3949                         path->slots[level] = slot;
3950                         btrfs_item_key_to_cpu(cur, &found_key, slot);
3951                         goto out;
3952                 }
3953                 if (sret && slot > 0)
3954                         slot--;
3955                 /*
3956                  * check this node pointer against the cache_only and
3957                  * min_trans parameters.  If it isn't in cache or is too
3958                  * old, skip to the next one.
3959                  */
3960                 while (slot < nritems) {
3961                         u64 blockptr;
3962                         u64 gen;
3963                         struct extent_buffer *tmp;
3964                         struct btrfs_disk_key disk_key;
3965
3966                         blockptr = btrfs_node_blockptr(cur, slot);
3967                         gen = btrfs_node_ptr_generation(cur, slot);
3968                         if (gen < min_trans) {
3969                                 slot++;
3970                                 continue;
3971                         }
3972                         if (!cache_only)
3973                                 break;
3974
3975                         if (max_key) {
3976                                 btrfs_node_key(cur, &disk_key, slot);
3977                                 if (comp_keys(&disk_key, max_key) >= 0) {
3978                                         ret = 1;
3979                                         goto out;
3980                                 }
3981                         }
3982
3983                         tmp = btrfs_find_tree_block(root, blockptr,
3984                                             btrfs_level_size(root, level - 1));
3985
3986                         if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3987                                 free_extent_buffer(tmp);
3988                                 break;
3989                         }
3990                         if (tmp)
3991                                 free_extent_buffer(tmp);
3992                         slot++;
3993                 }
3994 find_next_key:
3995                 /*
3996                  * we didn't find a candidate key in this node, walk forward
3997                  * and find another one
3998                  */
3999                 if (slot >= nritems) {
4000                         path->slots[level] = slot;
4001                         btrfs_set_path_blocking(path);
4002                         sret = btrfs_find_next_key(root, path, min_key, level,
4003                                                   cache_only, min_trans);
4004                         if (sret == 0) {
4005                                 btrfs_release_path(path);
4006                                 goto again;
4007                         } else {
4008                                 goto out;
4009                         }
4010                 }
4011                 /* save our key for returning back */
4012                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4013                 path->slots[level] = slot;
4014                 if (level == path->lowest_level) {
4015                         ret = 0;
4016                         unlock_up(path, level, 1);
4017                         goto out;
4018                 }
4019                 btrfs_set_path_blocking(path);
4020                 cur = read_node_slot(root, cur, slot);
4021                 BUG_ON(!cur); /* -ENOMEM */
4022
4023                 btrfs_tree_read_lock(cur);
4024
4025                 path->locks[level - 1] = BTRFS_READ_LOCK;
4026                 path->nodes[level - 1] = cur;
4027                 unlock_up(path, level, 1);
4028                 btrfs_clear_path_blocking(path, NULL, 0);
4029         }
4030 out:
4031         if (ret == 0)
4032                 memcpy(min_key, &found_key, sizeof(found_key));
4033         btrfs_set_path_blocking(path);
4034         return ret;
4035 }
4036
4037 /*
4038  * this is similar to btrfs_next_leaf, but does not try to preserve
4039  * and fixup the path.  It looks for and returns the next key in the
4040  * tree based on the current path and the cache_only and min_trans
4041  * parameters.
4042  *
4043  * 0 is returned if another key is found, < 0 if there are any errors
4044  * and 1 is returned if there are no higher keys in the tree
4045  *
4046  * path->keep_locks should be set to 1 on the search made before
4047  * calling this function.
4048  */
4049 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4050                         struct btrfs_key *key, int level,
4051                         int cache_only, u64 min_trans)
4052 {
4053         int slot;
4054         struct extent_buffer *c;
4055
4056         WARN_ON(!path->keep_locks);
4057         while (level < BTRFS_MAX_LEVEL) {
4058                 if (!path->nodes[level])
4059                         return 1;
4060
4061                 slot = path->slots[level] + 1;
4062                 c = path->nodes[level];
4063 next:
4064                 if (slot >= btrfs_header_nritems(c)) {
4065                         int ret;
4066                         int orig_lowest;
4067                         struct btrfs_key cur_key;
4068                         if (level + 1 >= BTRFS_MAX_LEVEL ||
4069                             !path->nodes[level + 1])
4070                                 return 1;
4071
4072                         if (path->locks[level + 1]) {
4073                                 level++;
4074                                 continue;
4075                         }
4076
4077                         slot = btrfs_header_nritems(c) - 1;
4078                         if (level == 0)
4079                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
4080                         else
4081                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
4082
4083                         orig_lowest = path->lowest_level;
4084                         btrfs_release_path(path);
4085                         path->lowest_level = level;
4086                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
4087                                                 0, 0);
4088                         path->lowest_level = orig_lowest;
4089                         if (ret < 0)
4090                                 return ret;
4091
4092                         c = path->nodes[level];
4093                         slot = path->slots[level];
4094                         if (ret == 0)
4095                                 slot++;
4096                         goto next;
4097                 }
4098
4099                 if (level == 0)
4100                         btrfs_item_key_to_cpu(c, key, slot);
4101                 else {
4102                         u64 blockptr = btrfs_node_blockptr(c, slot);
4103                         u64 gen = btrfs_node_ptr_generation(c, slot);
4104
4105                         if (cache_only) {
4106                                 struct extent_buffer *cur;
4107                                 cur = btrfs_find_tree_block(root, blockptr,
4108                                             btrfs_level_size(root, level - 1));
4109                                 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
4110                                         slot++;
4111                                         if (cur)
4112                                                 free_extent_buffer(cur);
4113                                         goto next;
4114                                 }
4115                                 free_extent_buffer(cur);
4116                         }
4117                         if (gen < min_trans) {
4118                                 slot++;
4119                                 goto next;
4120                         }
4121                         btrfs_node_key_to_cpu(c, key, slot);
4122                 }
4123                 return 0;
4124         }
4125         return 1;
4126 }
4127
4128 /*
4129  * search the tree again to find a leaf with greater keys
4130  * returns 0 if it found something or 1 if there are no greater leaves.
4131  * returns < 0 on io errors.
4132  */
4133 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4134 {
4135         int slot;
4136         int level;
4137         struct extent_buffer *c;
4138         struct extent_buffer *next;
4139         struct btrfs_key key;
4140         u32 nritems;
4141         int ret;
4142         int old_spinning = path->leave_spinning;
4143         int next_rw_lock = 0;
4144
4145         nritems = btrfs_header_nritems(path->nodes[0]);
4146         if (nritems == 0)
4147                 return 1;
4148
4149         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4150 again:
4151         level = 1;
4152         next = NULL;
4153         next_rw_lock = 0;
4154         btrfs_release_path(path);
4155
4156         path->keep_locks = 1;
4157         path->leave_spinning = 1;
4158
4159         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4160         path->keep_locks = 0;
4161
4162         if (ret < 0)
4163                 return ret;
4164
4165         nritems = btrfs_header_nritems(path->nodes[0]);
4166         /*
4167          * by releasing the path above we dropped all our locks.  A balance
4168          * could have added more items next to the key that used to be
4169          * at the very end of the block.  So, check again here and
4170          * advance the path if there are now more items available.
4171          */
4172         if (nritems > 0 && path->slots[0] < nritems - 1) {
4173                 if (ret == 0)
4174                         path->slots[0]++;
4175                 ret = 0;
4176                 goto done;
4177         }
4178
4179         while (level < BTRFS_MAX_LEVEL) {
4180                 if (!path->nodes[level]) {
4181                         ret = 1;
4182                         goto done;
4183                 }
4184
4185                 slot = path->slots[level] + 1;
4186                 c = path->nodes[level];
4187                 if (slot >= btrfs_header_nritems(c)) {
4188                         level++;
4189                         if (level == BTRFS_MAX_LEVEL) {
4190                                 ret = 1;
4191                                 goto done;
4192                         }
4193                         continue;
4194                 }
4195
4196                 if (next) {
4197                         btrfs_tree_unlock_rw(next, next_rw_lock);
4198                         free_extent_buffer(next);
4199                 }
4200
4201                 next = c;
4202                 next_rw_lock = path->locks[level];
4203                 ret = read_block_for_search(NULL, root, path, &next, level,
4204                                             slot, &key);
4205                 if (ret == -EAGAIN)
4206                         goto again;
4207
4208                 if (ret < 0) {
4209                         btrfs_release_path(path);
4210                         goto done;
4211                 }
4212
4213                 if (!path->skip_locking) {
4214                         ret = btrfs_try_tree_read_lock(next);
4215                         if (!ret) {
4216                                 btrfs_set_path_blocking(path);
4217                                 btrfs_tree_read_lock(next);
4218                                 btrfs_clear_path_blocking(path, next,
4219                                                           BTRFS_READ_LOCK);
4220                         }
4221                         next_rw_lock = BTRFS_READ_LOCK;
4222                 }
4223                 break;
4224         }
4225         path->slots[level] = slot;
4226         while (1) {
4227                 level--;
4228                 c = path->nodes[level];
4229                 if (path->locks[level])
4230                         btrfs_tree_unlock_rw(c, path->locks[level]);
4231
4232                 free_extent_buffer(c);
4233                 path->nodes[level] = next;
4234                 path->slots[level] = 0;
4235                 if (!path->skip_locking)
4236                         path->locks[level] = next_rw_lock;
4237                 if (!level)
4238                         break;
4239
4240                 ret = read_block_for_search(NULL, root, path, &next, level,
4241                                             0, &key);
4242                 if (ret == -EAGAIN)
4243                         goto again;
4244
4245                 if (ret < 0) {
4246                         btrfs_release_path(path);
4247                         goto done;
4248                 }
4249
4250                 if (!path->skip_locking) {
4251                         ret = btrfs_try_tree_read_lock(next);
4252                         if (!ret) {
4253                                 btrfs_set_path_blocking(path);
4254                                 btrfs_tree_read_lock(next);
4255                                 btrfs_clear_path_blocking(path, next,
4256                                                           BTRFS_READ_LOCK);
4257                         }
4258                         next_rw_lock = BTRFS_READ_LOCK;
4259                 }
4260         }
4261         ret = 0;
4262 done:
4263         unlock_up(path, 0, 1);
4264         path->leave_spinning = old_spinning;
4265         if (!old_spinning)
4266                 btrfs_set_path_blocking(path);
4267
4268         return ret;
4269 }
4270
4271 /*
4272  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4273  * searching until it gets past min_objectid or finds an item of 'type'
4274  *
4275  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4276  */
4277 int btrfs_previous_item(struct btrfs_root *root,
4278                         struct btrfs_path *path, u64 min_objectid,
4279                         int type)
4280 {
4281         struct btrfs_key found_key;
4282         struct extent_buffer *leaf;
4283         u32 nritems;
4284         int ret;
4285
4286         while (1) {
4287                 if (path->slots[0] == 0) {
4288                         btrfs_set_path_blocking(path);
4289                         ret = btrfs_prev_leaf(root, path);
4290                         if (ret != 0)
4291                                 return ret;
4292                 } else {
4293                         path->slots[0]--;
4294                 }
4295                 leaf = path->nodes[0];
4296                 nritems = btrfs_header_nritems(leaf);
4297                 if (nritems == 0)
4298                         return 1;
4299                 if (path->slots[0] == nritems)
4300                         path->slots[0]--;
4301
4302                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4303                 if (found_key.objectid < min_objectid)
4304                         break;
4305                 if (found_key.type == type)
4306                         return 0;
4307                 if (found_key.objectid == min_objectid &&
4308                     found_key.type < type)
4309                         break;
4310         }
4311         return 1;
4312 }