btrfs: fix uninitialized parent in insert_state
[platform/kernel/linux-starfive.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include <linux/error-injection.h>
11 #include "ctree.h"
12 #include "disk-io.h"
13 #include "transaction.h"
14 #include "print-tree.h"
15 #include "locking.h"
16 #include "volumes.h"
17 #include "qgroup.h"
18 #include "tree-mod-log.h"
19 #include "tree-checker.h"
20
21 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
22                       *root, struct btrfs_path *path, int level);
23 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
24                       const struct btrfs_key *ins_key, struct btrfs_path *path,
25                       int data_size, int extend);
26 static int push_node_left(struct btrfs_trans_handle *trans,
27                           struct extent_buffer *dst,
28                           struct extent_buffer *src, int empty);
29 static int balance_node_right(struct btrfs_trans_handle *trans,
30                               struct extent_buffer *dst_buf,
31                               struct extent_buffer *src_buf);
32 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
33                     int level, int slot);
34
35 static const struct btrfs_csums {
36         u16             size;
37         const char      name[10];
38         const char      driver[12];
39 } btrfs_csums[] = {
40         [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
41         [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
42         [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
43         [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
44                                      .driver = "blake2b-256" },
45 };
46
47 int btrfs_super_csum_size(const struct btrfs_super_block *s)
48 {
49         u16 t = btrfs_super_csum_type(s);
50         /*
51          * csum type is validated at mount time
52          */
53         return btrfs_csums[t].size;
54 }
55
56 const char *btrfs_super_csum_name(u16 csum_type)
57 {
58         /* csum type is validated at mount time */
59         return btrfs_csums[csum_type].name;
60 }
61
62 /*
63  * Return driver name if defined, otherwise the name that's also a valid driver
64  * name
65  */
66 const char *btrfs_super_csum_driver(u16 csum_type)
67 {
68         /* csum type is validated at mount time */
69         return btrfs_csums[csum_type].driver[0] ?
70                 btrfs_csums[csum_type].driver :
71                 btrfs_csums[csum_type].name;
72 }
73
74 size_t __attribute_const__ btrfs_get_num_csums(void)
75 {
76         return ARRAY_SIZE(btrfs_csums);
77 }
78
79 struct btrfs_path *btrfs_alloc_path(void)
80 {
81         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
82 }
83
84 /* this also releases the path */
85 void btrfs_free_path(struct btrfs_path *p)
86 {
87         if (!p)
88                 return;
89         btrfs_release_path(p);
90         kmem_cache_free(btrfs_path_cachep, p);
91 }
92
93 /*
94  * path release drops references on the extent buffers in the path
95  * and it drops any locks held by this path
96  *
97  * It is safe to call this on paths that no locks or extent buffers held.
98  */
99 noinline void btrfs_release_path(struct btrfs_path *p)
100 {
101         int i;
102
103         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
104                 p->slots[i] = 0;
105                 if (!p->nodes[i])
106                         continue;
107                 if (p->locks[i]) {
108                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
109                         p->locks[i] = 0;
110                 }
111                 free_extent_buffer(p->nodes[i]);
112                 p->nodes[i] = NULL;
113         }
114 }
115
116 /*
117  * We want the transaction abort to print stack trace only for errors where the
118  * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
119  * caused by external factors.
120  */
121 bool __cold abort_should_print_stack(int errno)
122 {
123         switch (errno) {
124         case -EIO:
125         case -EROFS:
126         case -ENOMEM:
127                 return false;
128         }
129         return true;
130 }
131
132 /*
133  * safely gets a reference on the root node of a tree.  A lock
134  * is not taken, so a concurrent writer may put a different node
135  * at the root of the tree.  See btrfs_lock_root_node for the
136  * looping required.
137  *
138  * The extent buffer returned by this has a reference taken, so
139  * it won't disappear.  It may stop being the root of the tree
140  * at any time because there are no locks held.
141  */
142 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
143 {
144         struct extent_buffer *eb;
145
146         while (1) {
147                 rcu_read_lock();
148                 eb = rcu_dereference(root->node);
149
150                 /*
151                  * RCU really hurts here, we could free up the root node because
152                  * it was COWed but we may not get the new root node yet so do
153                  * the inc_not_zero dance and if it doesn't work then
154                  * synchronize_rcu and try again.
155                  */
156                 if (atomic_inc_not_zero(&eb->refs)) {
157                         rcu_read_unlock();
158                         break;
159                 }
160                 rcu_read_unlock();
161                 synchronize_rcu();
162         }
163         return eb;
164 }
165
166 /*
167  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
168  * just get put onto a simple dirty list.  Transaction walks this list to make
169  * sure they get properly updated on disk.
170  */
171 static void add_root_to_dirty_list(struct btrfs_root *root)
172 {
173         struct btrfs_fs_info *fs_info = root->fs_info;
174
175         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
176             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
177                 return;
178
179         spin_lock(&fs_info->trans_lock);
180         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
181                 /* Want the extent tree to be the last on the list */
182                 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
183                         list_move_tail(&root->dirty_list,
184                                        &fs_info->dirty_cowonly_roots);
185                 else
186                         list_move(&root->dirty_list,
187                                   &fs_info->dirty_cowonly_roots);
188         }
189         spin_unlock(&fs_info->trans_lock);
190 }
191
192 /*
193  * used by snapshot creation to make a copy of a root for a tree with
194  * a given objectid.  The buffer with the new root node is returned in
195  * cow_ret, and this func returns zero on success or a negative error code.
196  */
197 int btrfs_copy_root(struct btrfs_trans_handle *trans,
198                       struct btrfs_root *root,
199                       struct extent_buffer *buf,
200                       struct extent_buffer **cow_ret, u64 new_root_objectid)
201 {
202         struct btrfs_fs_info *fs_info = root->fs_info;
203         struct extent_buffer *cow;
204         int ret = 0;
205         int level;
206         struct btrfs_disk_key disk_key;
207
208         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
209                 trans->transid != fs_info->running_transaction->transid);
210         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
211                 trans->transid != root->last_trans);
212
213         level = btrfs_header_level(buf);
214         if (level == 0)
215                 btrfs_item_key(buf, &disk_key, 0);
216         else
217                 btrfs_node_key(buf, &disk_key, 0);
218
219         cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
220                                      &disk_key, level, buf->start, 0,
221                                      BTRFS_NESTING_NEW_ROOT);
222         if (IS_ERR(cow))
223                 return PTR_ERR(cow);
224
225         copy_extent_buffer_full(cow, buf);
226         btrfs_set_header_bytenr(cow, cow->start);
227         btrfs_set_header_generation(cow, trans->transid);
228         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
229         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
230                                      BTRFS_HEADER_FLAG_RELOC);
231         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
232                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
233         else
234                 btrfs_set_header_owner(cow, new_root_objectid);
235
236         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
237
238         WARN_ON(btrfs_header_generation(buf) > trans->transid);
239         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
240                 ret = btrfs_inc_ref(trans, root, cow, 1);
241         else
242                 ret = btrfs_inc_ref(trans, root, cow, 0);
243         if (ret) {
244                 btrfs_tree_unlock(cow);
245                 free_extent_buffer(cow);
246                 btrfs_abort_transaction(trans, ret);
247                 return ret;
248         }
249
250         btrfs_mark_buffer_dirty(cow);
251         *cow_ret = cow;
252         return 0;
253 }
254
255 /*
256  * check if the tree block can be shared by multiple trees
257  */
258 int btrfs_block_can_be_shared(struct btrfs_root *root,
259                               struct extent_buffer *buf)
260 {
261         /*
262          * Tree blocks not in shareable trees and tree roots are never shared.
263          * If a block was allocated after the last snapshot and the block was
264          * not allocated by tree relocation, we know the block is not shared.
265          */
266         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
267             buf != root->node && buf != root->commit_root &&
268             (btrfs_header_generation(buf) <=
269              btrfs_root_last_snapshot(&root->root_item) ||
270              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
271                 return 1;
272
273         return 0;
274 }
275
276 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
277                                        struct btrfs_root *root,
278                                        struct extent_buffer *buf,
279                                        struct extent_buffer *cow,
280                                        int *last_ref)
281 {
282         struct btrfs_fs_info *fs_info = root->fs_info;
283         u64 refs;
284         u64 owner;
285         u64 flags;
286         u64 new_flags = 0;
287         int ret;
288
289         /*
290          * Backrefs update rules:
291          *
292          * Always use full backrefs for extent pointers in tree block
293          * allocated by tree relocation.
294          *
295          * If a shared tree block is no longer referenced by its owner
296          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
297          * use full backrefs for extent pointers in tree block.
298          *
299          * If a tree block is been relocating
300          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
301          * use full backrefs for extent pointers in tree block.
302          * The reason for this is some operations (such as drop tree)
303          * are only allowed for blocks use full backrefs.
304          */
305
306         if (btrfs_block_can_be_shared(root, buf)) {
307                 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
308                                                btrfs_header_level(buf), 1,
309                                                &refs, &flags);
310                 if (ret)
311                         return ret;
312                 if (refs == 0) {
313                         ret = -EROFS;
314                         btrfs_handle_fs_error(fs_info, ret, NULL);
315                         return ret;
316                 }
317         } else {
318                 refs = 1;
319                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
320                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
321                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
322                 else
323                         flags = 0;
324         }
325
326         owner = btrfs_header_owner(buf);
327         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
328                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
329
330         if (refs > 1) {
331                 if ((owner == root->root_key.objectid ||
332                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
333                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
334                         ret = btrfs_inc_ref(trans, root, buf, 1);
335                         if (ret)
336                                 return ret;
337
338                         if (root->root_key.objectid ==
339                             BTRFS_TREE_RELOC_OBJECTID) {
340                                 ret = btrfs_dec_ref(trans, root, buf, 0);
341                                 if (ret)
342                                         return ret;
343                                 ret = btrfs_inc_ref(trans, root, cow, 1);
344                                 if (ret)
345                                         return ret;
346                         }
347                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
348                 } else {
349
350                         if (root->root_key.objectid ==
351                             BTRFS_TREE_RELOC_OBJECTID)
352                                 ret = btrfs_inc_ref(trans, root, cow, 1);
353                         else
354                                 ret = btrfs_inc_ref(trans, root, cow, 0);
355                         if (ret)
356                                 return ret;
357                 }
358                 if (new_flags != 0) {
359                         int level = btrfs_header_level(buf);
360
361                         ret = btrfs_set_disk_extent_flags(trans, buf,
362                                                           new_flags, level);
363                         if (ret)
364                                 return ret;
365                 }
366         } else {
367                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
368                         if (root->root_key.objectid ==
369                             BTRFS_TREE_RELOC_OBJECTID)
370                                 ret = btrfs_inc_ref(trans, root, cow, 1);
371                         else
372                                 ret = btrfs_inc_ref(trans, root, cow, 0);
373                         if (ret)
374                                 return ret;
375                         ret = btrfs_dec_ref(trans, root, buf, 1);
376                         if (ret)
377                                 return ret;
378                 }
379                 btrfs_clean_tree_block(buf);
380                 *last_ref = 1;
381         }
382         return 0;
383 }
384
385 /*
386  * does the dirty work in cow of a single block.  The parent block (if
387  * supplied) is updated to point to the new cow copy.  The new buffer is marked
388  * dirty and returned locked.  If you modify the block it needs to be marked
389  * dirty again.
390  *
391  * search_start -- an allocation hint for the new block
392  *
393  * empty_size -- a hint that you plan on doing more cow.  This is the size in
394  * bytes the allocator should try to find free next to the block it returns.
395  * This is just a hint and may be ignored by the allocator.
396  */
397 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
398                              struct btrfs_root *root,
399                              struct extent_buffer *buf,
400                              struct extent_buffer *parent, int parent_slot,
401                              struct extent_buffer **cow_ret,
402                              u64 search_start, u64 empty_size,
403                              enum btrfs_lock_nesting nest)
404 {
405         struct btrfs_fs_info *fs_info = root->fs_info;
406         struct btrfs_disk_key disk_key;
407         struct extent_buffer *cow;
408         int level, ret;
409         int last_ref = 0;
410         int unlock_orig = 0;
411         u64 parent_start = 0;
412
413         if (*cow_ret == buf)
414                 unlock_orig = 1;
415
416         btrfs_assert_tree_write_locked(buf);
417
418         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
419                 trans->transid != fs_info->running_transaction->transid);
420         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
421                 trans->transid != root->last_trans);
422
423         level = btrfs_header_level(buf);
424
425         if (level == 0)
426                 btrfs_item_key(buf, &disk_key, 0);
427         else
428                 btrfs_node_key(buf, &disk_key, 0);
429
430         if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
431                 parent_start = parent->start;
432
433         cow = btrfs_alloc_tree_block(trans, root, parent_start,
434                                      root->root_key.objectid, &disk_key, level,
435                                      search_start, empty_size, nest);
436         if (IS_ERR(cow))
437                 return PTR_ERR(cow);
438
439         /* cow is set to blocking by btrfs_init_new_buffer */
440
441         copy_extent_buffer_full(cow, buf);
442         btrfs_set_header_bytenr(cow, cow->start);
443         btrfs_set_header_generation(cow, trans->transid);
444         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
445         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
446                                      BTRFS_HEADER_FLAG_RELOC);
447         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
448                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
449         else
450                 btrfs_set_header_owner(cow, root->root_key.objectid);
451
452         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
453
454         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
455         if (ret) {
456                 btrfs_tree_unlock(cow);
457                 free_extent_buffer(cow);
458                 btrfs_abort_transaction(trans, ret);
459                 return ret;
460         }
461
462         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
463                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
464                 if (ret) {
465                         btrfs_tree_unlock(cow);
466                         free_extent_buffer(cow);
467                         btrfs_abort_transaction(trans, ret);
468                         return ret;
469                 }
470         }
471
472         if (buf == root->node) {
473                 WARN_ON(parent && parent != buf);
474                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
475                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
476                         parent_start = buf->start;
477
478                 atomic_inc(&cow->refs);
479                 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
480                 BUG_ON(ret < 0);
481                 rcu_assign_pointer(root->node, cow);
482
483                 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
484                                       parent_start, last_ref);
485                 free_extent_buffer(buf);
486                 add_root_to_dirty_list(root);
487         } else {
488                 WARN_ON(trans->transid != btrfs_header_generation(parent));
489                 btrfs_tree_mod_log_insert_key(parent, parent_slot,
490                                               BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
491                 btrfs_set_node_blockptr(parent, parent_slot,
492                                         cow->start);
493                 btrfs_set_node_ptr_generation(parent, parent_slot,
494                                               trans->transid);
495                 btrfs_mark_buffer_dirty(parent);
496                 if (last_ref) {
497                         ret = btrfs_tree_mod_log_free_eb(buf);
498                         if (ret) {
499                                 btrfs_tree_unlock(cow);
500                                 free_extent_buffer(cow);
501                                 btrfs_abort_transaction(trans, ret);
502                                 return ret;
503                         }
504                 }
505                 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
506                                       parent_start, last_ref);
507         }
508         if (unlock_orig)
509                 btrfs_tree_unlock(buf);
510         free_extent_buffer_stale(buf);
511         btrfs_mark_buffer_dirty(cow);
512         *cow_ret = cow;
513         return 0;
514 }
515
516 static inline int should_cow_block(struct btrfs_trans_handle *trans,
517                                    struct btrfs_root *root,
518                                    struct extent_buffer *buf)
519 {
520         if (btrfs_is_testing(root->fs_info))
521                 return 0;
522
523         /* Ensure we can see the FORCE_COW bit */
524         smp_mb__before_atomic();
525
526         /*
527          * We do not need to cow a block if
528          * 1) this block is not created or changed in this transaction;
529          * 2) this block does not belong to TREE_RELOC tree;
530          * 3) the root is not forced COW.
531          *
532          * What is forced COW:
533          *    when we create snapshot during committing the transaction,
534          *    after we've finished copying src root, we must COW the shared
535          *    block to ensure the metadata consistency.
536          */
537         if (btrfs_header_generation(buf) == trans->transid &&
538             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
539             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
540               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
541             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
542                 return 0;
543         return 1;
544 }
545
546 /*
547  * cows a single block, see __btrfs_cow_block for the real work.
548  * This version of it has extra checks so that a block isn't COWed more than
549  * once per transaction, as long as it hasn't been written yet
550  */
551 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
552                     struct btrfs_root *root, struct extent_buffer *buf,
553                     struct extent_buffer *parent, int parent_slot,
554                     struct extent_buffer **cow_ret,
555                     enum btrfs_lock_nesting nest)
556 {
557         struct btrfs_fs_info *fs_info = root->fs_info;
558         u64 search_start;
559         int ret;
560
561         if (test_bit(BTRFS_ROOT_DELETING, &root->state))
562                 btrfs_err(fs_info,
563                         "COW'ing blocks on a fs root that's being dropped");
564
565         if (trans->transaction != fs_info->running_transaction)
566                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
567                        trans->transid,
568                        fs_info->running_transaction->transid);
569
570         if (trans->transid != fs_info->generation)
571                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
572                        trans->transid, fs_info->generation);
573
574         if (!should_cow_block(trans, root, buf)) {
575                 *cow_ret = buf;
576                 return 0;
577         }
578
579         search_start = buf->start & ~((u64)SZ_1G - 1);
580
581         /*
582          * Before CoWing this block for later modification, check if it's
583          * the subtree root and do the delayed subtree trace if needed.
584          *
585          * Also We don't care about the error, as it's handled internally.
586          */
587         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
588         ret = __btrfs_cow_block(trans, root, buf, parent,
589                                  parent_slot, cow_ret, search_start, 0, nest);
590
591         trace_btrfs_cow_block(root, buf, *cow_ret);
592
593         return ret;
594 }
595 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
596
597 /*
598  * helper function for defrag to decide if two blocks pointed to by a
599  * node are actually close by
600  */
601 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
602 {
603         if (blocknr < other && other - (blocknr + blocksize) < 32768)
604                 return 1;
605         if (blocknr > other && blocknr - (other + blocksize) < 32768)
606                 return 1;
607         return 0;
608 }
609
610 #ifdef __LITTLE_ENDIAN
611
612 /*
613  * Compare two keys, on little-endian the disk order is same as CPU order and
614  * we can avoid the conversion.
615  */
616 static int comp_keys(const struct btrfs_disk_key *disk_key,
617                      const struct btrfs_key *k2)
618 {
619         const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
620
621         return btrfs_comp_cpu_keys(k1, k2);
622 }
623
624 #else
625
626 /*
627  * compare two keys in a memcmp fashion
628  */
629 static int comp_keys(const struct btrfs_disk_key *disk,
630                      const struct btrfs_key *k2)
631 {
632         struct btrfs_key k1;
633
634         btrfs_disk_key_to_cpu(&k1, disk);
635
636         return btrfs_comp_cpu_keys(&k1, k2);
637 }
638 #endif
639
640 /*
641  * same as comp_keys only with two btrfs_key's
642  */
643 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
644 {
645         if (k1->objectid > k2->objectid)
646                 return 1;
647         if (k1->objectid < k2->objectid)
648                 return -1;
649         if (k1->type > k2->type)
650                 return 1;
651         if (k1->type < k2->type)
652                 return -1;
653         if (k1->offset > k2->offset)
654                 return 1;
655         if (k1->offset < k2->offset)
656                 return -1;
657         return 0;
658 }
659
660 /*
661  * this is used by the defrag code to go through all the
662  * leaves pointed to by a node and reallocate them so that
663  * disk order is close to key order
664  */
665 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
666                        struct btrfs_root *root, struct extent_buffer *parent,
667                        int start_slot, u64 *last_ret,
668                        struct btrfs_key *progress)
669 {
670         struct btrfs_fs_info *fs_info = root->fs_info;
671         struct extent_buffer *cur;
672         u64 blocknr;
673         u64 search_start = *last_ret;
674         u64 last_block = 0;
675         u64 other;
676         u32 parent_nritems;
677         int end_slot;
678         int i;
679         int err = 0;
680         u32 blocksize;
681         int progress_passed = 0;
682         struct btrfs_disk_key disk_key;
683
684         WARN_ON(trans->transaction != fs_info->running_transaction);
685         WARN_ON(trans->transid != fs_info->generation);
686
687         parent_nritems = btrfs_header_nritems(parent);
688         blocksize = fs_info->nodesize;
689         end_slot = parent_nritems - 1;
690
691         if (parent_nritems <= 1)
692                 return 0;
693
694         for (i = start_slot; i <= end_slot; i++) {
695                 int close = 1;
696
697                 btrfs_node_key(parent, &disk_key, i);
698                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
699                         continue;
700
701                 progress_passed = 1;
702                 blocknr = btrfs_node_blockptr(parent, i);
703                 if (last_block == 0)
704                         last_block = blocknr;
705
706                 if (i > 0) {
707                         other = btrfs_node_blockptr(parent, i - 1);
708                         close = close_blocks(blocknr, other, blocksize);
709                 }
710                 if (!close && i < end_slot) {
711                         other = btrfs_node_blockptr(parent, i + 1);
712                         close = close_blocks(blocknr, other, blocksize);
713                 }
714                 if (close) {
715                         last_block = blocknr;
716                         continue;
717                 }
718
719                 cur = btrfs_read_node_slot(parent, i);
720                 if (IS_ERR(cur))
721                         return PTR_ERR(cur);
722                 if (search_start == 0)
723                         search_start = last_block;
724
725                 btrfs_tree_lock(cur);
726                 err = __btrfs_cow_block(trans, root, cur, parent, i,
727                                         &cur, search_start,
728                                         min(16 * blocksize,
729                                             (end_slot - i) * blocksize),
730                                         BTRFS_NESTING_COW);
731                 if (err) {
732                         btrfs_tree_unlock(cur);
733                         free_extent_buffer(cur);
734                         break;
735                 }
736                 search_start = cur->start;
737                 last_block = cur->start;
738                 *last_ret = search_start;
739                 btrfs_tree_unlock(cur);
740                 free_extent_buffer(cur);
741         }
742         return err;
743 }
744
745 /*
746  * Search for a key in the given extent_buffer.
747  *
748  * The lower boundary for the search is specified by the slot number @low. Use a
749  * value of 0 to search over the whole extent buffer.
750  *
751  * The slot in the extent buffer is returned via @slot. If the key exists in the
752  * extent buffer, then @slot will point to the slot where the key is, otherwise
753  * it points to the slot where you would insert the key.
754  *
755  * Slot may point to the total number of items (i.e. one position beyond the last
756  * key) if the key is bigger than the last key in the extent buffer.
757  */
758 static noinline int generic_bin_search(struct extent_buffer *eb, int low,
759                                        const struct btrfs_key *key, int *slot)
760 {
761         unsigned long p;
762         int item_size;
763         int high = btrfs_header_nritems(eb);
764         int ret;
765         const int key_size = sizeof(struct btrfs_disk_key);
766
767         if (low > high) {
768                 btrfs_err(eb->fs_info,
769                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
770                           __func__, low, high, eb->start,
771                           btrfs_header_owner(eb), btrfs_header_level(eb));
772                 return -EINVAL;
773         }
774
775         if (btrfs_header_level(eb) == 0) {
776                 p = offsetof(struct btrfs_leaf, items);
777                 item_size = sizeof(struct btrfs_item);
778         } else {
779                 p = offsetof(struct btrfs_node, ptrs);
780                 item_size = sizeof(struct btrfs_key_ptr);
781         }
782
783         while (low < high) {
784                 unsigned long oip;
785                 unsigned long offset;
786                 struct btrfs_disk_key *tmp;
787                 struct btrfs_disk_key unaligned;
788                 int mid;
789
790                 mid = (low + high) / 2;
791                 offset = p + mid * item_size;
792                 oip = offset_in_page(offset);
793
794                 if (oip + key_size <= PAGE_SIZE) {
795                         const unsigned long idx = get_eb_page_index(offset);
796                         char *kaddr = page_address(eb->pages[idx]);
797
798                         oip = get_eb_offset_in_page(eb, offset);
799                         tmp = (struct btrfs_disk_key *)(kaddr + oip);
800                 } else {
801                         read_extent_buffer(eb, &unaligned, offset, key_size);
802                         tmp = &unaligned;
803                 }
804
805                 ret = comp_keys(tmp, key);
806
807                 if (ret < 0)
808                         low = mid + 1;
809                 else if (ret > 0)
810                         high = mid;
811                 else {
812                         *slot = mid;
813                         return 0;
814                 }
815         }
816         *slot = low;
817         return 1;
818 }
819
820 /*
821  * Simple binary search on an extent buffer. Works for both leaves and nodes, and
822  * always searches over the whole range of keys (slot 0 to slot 'nritems - 1').
823  */
824 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
825                      int *slot)
826 {
827         return generic_bin_search(eb, 0, key, slot);
828 }
829
830 static void root_add_used(struct btrfs_root *root, u32 size)
831 {
832         spin_lock(&root->accounting_lock);
833         btrfs_set_root_used(&root->root_item,
834                             btrfs_root_used(&root->root_item) + size);
835         spin_unlock(&root->accounting_lock);
836 }
837
838 static void root_sub_used(struct btrfs_root *root, u32 size)
839 {
840         spin_lock(&root->accounting_lock);
841         btrfs_set_root_used(&root->root_item,
842                             btrfs_root_used(&root->root_item) - size);
843         spin_unlock(&root->accounting_lock);
844 }
845
846 /* given a node and slot number, this reads the blocks it points to.  The
847  * extent buffer is returned with a reference taken (but unlocked).
848  */
849 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
850                                            int slot)
851 {
852         int level = btrfs_header_level(parent);
853         struct extent_buffer *eb;
854         struct btrfs_key first_key;
855
856         if (slot < 0 || slot >= btrfs_header_nritems(parent))
857                 return ERR_PTR(-ENOENT);
858
859         BUG_ON(level == 0);
860
861         btrfs_node_key_to_cpu(parent, &first_key, slot);
862         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
863                              btrfs_header_owner(parent),
864                              btrfs_node_ptr_generation(parent, slot),
865                              level - 1, &first_key);
866         if (IS_ERR(eb))
867                 return eb;
868         if (!extent_buffer_uptodate(eb)) {
869                 free_extent_buffer(eb);
870                 return ERR_PTR(-EIO);
871         }
872
873         return eb;
874 }
875
876 /*
877  * node level balancing, used to make sure nodes are in proper order for
878  * item deletion.  We balance from the top down, so we have to make sure
879  * that a deletion won't leave an node completely empty later on.
880  */
881 static noinline int balance_level(struct btrfs_trans_handle *trans,
882                          struct btrfs_root *root,
883                          struct btrfs_path *path, int level)
884 {
885         struct btrfs_fs_info *fs_info = root->fs_info;
886         struct extent_buffer *right = NULL;
887         struct extent_buffer *mid;
888         struct extent_buffer *left = NULL;
889         struct extent_buffer *parent = NULL;
890         int ret = 0;
891         int wret;
892         int pslot;
893         int orig_slot = path->slots[level];
894         u64 orig_ptr;
895
896         ASSERT(level > 0);
897
898         mid = path->nodes[level];
899
900         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
901         WARN_ON(btrfs_header_generation(mid) != trans->transid);
902
903         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
904
905         if (level < BTRFS_MAX_LEVEL - 1) {
906                 parent = path->nodes[level + 1];
907                 pslot = path->slots[level + 1];
908         }
909
910         /*
911          * deal with the case where there is only one pointer in the root
912          * by promoting the node below to a root
913          */
914         if (!parent) {
915                 struct extent_buffer *child;
916
917                 if (btrfs_header_nritems(mid) != 1)
918                         return 0;
919
920                 /* promote the child to a root */
921                 child = btrfs_read_node_slot(mid, 0);
922                 if (IS_ERR(child)) {
923                         ret = PTR_ERR(child);
924                         btrfs_handle_fs_error(fs_info, ret, NULL);
925                         goto enospc;
926                 }
927
928                 btrfs_tree_lock(child);
929                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
930                                       BTRFS_NESTING_COW);
931                 if (ret) {
932                         btrfs_tree_unlock(child);
933                         free_extent_buffer(child);
934                         goto enospc;
935                 }
936
937                 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
938                 BUG_ON(ret < 0);
939                 rcu_assign_pointer(root->node, child);
940
941                 add_root_to_dirty_list(root);
942                 btrfs_tree_unlock(child);
943
944                 path->locks[level] = 0;
945                 path->nodes[level] = NULL;
946                 btrfs_clean_tree_block(mid);
947                 btrfs_tree_unlock(mid);
948                 /* once for the path */
949                 free_extent_buffer(mid);
950
951                 root_sub_used(root, mid->len);
952                 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
953                 /* once for the root ptr */
954                 free_extent_buffer_stale(mid);
955                 return 0;
956         }
957         if (btrfs_header_nritems(mid) >
958             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
959                 return 0;
960
961         left = btrfs_read_node_slot(parent, pslot - 1);
962         if (IS_ERR(left))
963                 left = NULL;
964
965         if (left) {
966                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
967                 wret = btrfs_cow_block(trans, root, left,
968                                        parent, pslot - 1, &left,
969                                        BTRFS_NESTING_LEFT_COW);
970                 if (wret) {
971                         ret = wret;
972                         goto enospc;
973                 }
974         }
975
976         right = btrfs_read_node_slot(parent, pslot + 1);
977         if (IS_ERR(right))
978                 right = NULL;
979
980         if (right) {
981                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
982                 wret = btrfs_cow_block(trans, root, right,
983                                        parent, pslot + 1, &right,
984                                        BTRFS_NESTING_RIGHT_COW);
985                 if (wret) {
986                         ret = wret;
987                         goto enospc;
988                 }
989         }
990
991         /* first, try to make some room in the middle buffer */
992         if (left) {
993                 orig_slot += btrfs_header_nritems(left);
994                 wret = push_node_left(trans, left, mid, 1);
995                 if (wret < 0)
996                         ret = wret;
997         }
998
999         /*
1000          * then try to empty the right most buffer into the middle
1001          */
1002         if (right) {
1003                 wret = push_node_left(trans, mid, right, 1);
1004                 if (wret < 0 && wret != -ENOSPC)
1005                         ret = wret;
1006                 if (btrfs_header_nritems(right) == 0) {
1007                         btrfs_clean_tree_block(right);
1008                         btrfs_tree_unlock(right);
1009                         del_ptr(root, path, level + 1, pslot + 1);
1010                         root_sub_used(root, right->len);
1011                         btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1012                                               0, 1);
1013                         free_extent_buffer_stale(right);
1014                         right = NULL;
1015                 } else {
1016                         struct btrfs_disk_key right_key;
1017                         btrfs_node_key(right, &right_key, 0);
1018                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1019                                         BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1020                         BUG_ON(ret < 0);
1021                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1022                         btrfs_mark_buffer_dirty(parent);
1023                 }
1024         }
1025         if (btrfs_header_nritems(mid) == 1) {
1026                 /*
1027                  * we're not allowed to leave a node with one item in the
1028                  * tree during a delete.  A deletion from lower in the tree
1029                  * could try to delete the only pointer in this node.
1030                  * So, pull some keys from the left.
1031                  * There has to be a left pointer at this point because
1032                  * otherwise we would have pulled some pointers from the
1033                  * right
1034                  */
1035                 if (!left) {
1036                         ret = -EROFS;
1037                         btrfs_handle_fs_error(fs_info, ret, NULL);
1038                         goto enospc;
1039                 }
1040                 wret = balance_node_right(trans, mid, left);
1041                 if (wret < 0) {
1042                         ret = wret;
1043                         goto enospc;
1044                 }
1045                 if (wret == 1) {
1046                         wret = push_node_left(trans, left, mid, 1);
1047                         if (wret < 0)
1048                                 ret = wret;
1049                 }
1050                 BUG_ON(wret == 1);
1051         }
1052         if (btrfs_header_nritems(mid) == 0) {
1053                 btrfs_clean_tree_block(mid);
1054                 btrfs_tree_unlock(mid);
1055                 del_ptr(root, path, level + 1, pslot);
1056                 root_sub_used(root, mid->len);
1057                 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1058                 free_extent_buffer_stale(mid);
1059                 mid = NULL;
1060         } else {
1061                 /* update the parent key to reflect our changes */
1062                 struct btrfs_disk_key mid_key;
1063                 btrfs_node_key(mid, &mid_key, 0);
1064                 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1065                                 BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1066                 BUG_ON(ret < 0);
1067                 btrfs_set_node_key(parent, &mid_key, pslot);
1068                 btrfs_mark_buffer_dirty(parent);
1069         }
1070
1071         /* update the path */
1072         if (left) {
1073                 if (btrfs_header_nritems(left) > orig_slot) {
1074                         atomic_inc(&left->refs);
1075                         /* left was locked after cow */
1076                         path->nodes[level] = left;
1077                         path->slots[level + 1] -= 1;
1078                         path->slots[level] = orig_slot;
1079                         if (mid) {
1080                                 btrfs_tree_unlock(mid);
1081                                 free_extent_buffer(mid);
1082                         }
1083                 } else {
1084                         orig_slot -= btrfs_header_nritems(left);
1085                         path->slots[level] = orig_slot;
1086                 }
1087         }
1088         /* double check we haven't messed things up */
1089         if (orig_ptr !=
1090             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1091                 BUG();
1092 enospc:
1093         if (right) {
1094                 btrfs_tree_unlock(right);
1095                 free_extent_buffer(right);
1096         }
1097         if (left) {
1098                 if (path->nodes[level] != left)
1099                         btrfs_tree_unlock(left);
1100                 free_extent_buffer(left);
1101         }
1102         return ret;
1103 }
1104
1105 /* Node balancing for insertion.  Here we only split or push nodes around
1106  * when they are completely full.  This is also done top down, so we
1107  * have to be pessimistic.
1108  */
1109 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1110                                           struct btrfs_root *root,
1111                                           struct btrfs_path *path, int level)
1112 {
1113         struct btrfs_fs_info *fs_info = root->fs_info;
1114         struct extent_buffer *right = NULL;
1115         struct extent_buffer *mid;
1116         struct extent_buffer *left = NULL;
1117         struct extent_buffer *parent = NULL;
1118         int ret = 0;
1119         int wret;
1120         int pslot;
1121         int orig_slot = path->slots[level];
1122
1123         if (level == 0)
1124                 return 1;
1125
1126         mid = path->nodes[level];
1127         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1128
1129         if (level < BTRFS_MAX_LEVEL - 1) {
1130                 parent = path->nodes[level + 1];
1131                 pslot = path->slots[level + 1];
1132         }
1133
1134         if (!parent)
1135                 return 1;
1136
1137         left = btrfs_read_node_slot(parent, pslot - 1);
1138         if (IS_ERR(left))
1139                 left = NULL;
1140
1141         /* first, try to make some room in the middle buffer */
1142         if (left) {
1143                 u32 left_nr;
1144
1145                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1146
1147                 left_nr = btrfs_header_nritems(left);
1148                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1149                         wret = 1;
1150                 } else {
1151                         ret = btrfs_cow_block(trans, root, left, parent,
1152                                               pslot - 1, &left,
1153                                               BTRFS_NESTING_LEFT_COW);
1154                         if (ret)
1155                                 wret = 1;
1156                         else {
1157                                 wret = push_node_left(trans, left, mid, 0);
1158                         }
1159                 }
1160                 if (wret < 0)
1161                         ret = wret;
1162                 if (wret == 0) {
1163                         struct btrfs_disk_key disk_key;
1164                         orig_slot += left_nr;
1165                         btrfs_node_key(mid, &disk_key, 0);
1166                         ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1167                                         BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1168                         BUG_ON(ret < 0);
1169                         btrfs_set_node_key(parent, &disk_key, pslot);
1170                         btrfs_mark_buffer_dirty(parent);
1171                         if (btrfs_header_nritems(left) > orig_slot) {
1172                                 path->nodes[level] = left;
1173                                 path->slots[level + 1] -= 1;
1174                                 path->slots[level] = orig_slot;
1175                                 btrfs_tree_unlock(mid);
1176                                 free_extent_buffer(mid);
1177                         } else {
1178                                 orig_slot -=
1179                                         btrfs_header_nritems(left);
1180                                 path->slots[level] = orig_slot;
1181                                 btrfs_tree_unlock(left);
1182                                 free_extent_buffer(left);
1183                         }
1184                         return 0;
1185                 }
1186                 btrfs_tree_unlock(left);
1187                 free_extent_buffer(left);
1188         }
1189         right = btrfs_read_node_slot(parent, pslot + 1);
1190         if (IS_ERR(right))
1191                 right = NULL;
1192
1193         /*
1194          * then try to empty the right most buffer into the middle
1195          */
1196         if (right) {
1197                 u32 right_nr;
1198
1199                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1200
1201                 right_nr = btrfs_header_nritems(right);
1202                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1203                         wret = 1;
1204                 } else {
1205                         ret = btrfs_cow_block(trans, root, right,
1206                                               parent, pslot + 1,
1207                                               &right, BTRFS_NESTING_RIGHT_COW);
1208                         if (ret)
1209                                 wret = 1;
1210                         else {
1211                                 wret = balance_node_right(trans, right, mid);
1212                         }
1213                 }
1214                 if (wret < 0)
1215                         ret = wret;
1216                 if (wret == 0) {
1217                         struct btrfs_disk_key disk_key;
1218
1219                         btrfs_node_key(right, &disk_key, 0);
1220                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1221                                         BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1222                         BUG_ON(ret < 0);
1223                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1224                         btrfs_mark_buffer_dirty(parent);
1225
1226                         if (btrfs_header_nritems(mid) <= orig_slot) {
1227                                 path->nodes[level] = right;
1228                                 path->slots[level + 1] += 1;
1229                                 path->slots[level] = orig_slot -
1230                                         btrfs_header_nritems(mid);
1231                                 btrfs_tree_unlock(mid);
1232                                 free_extent_buffer(mid);
1233                         } else {
1234                                 btrfs_tree_unlock(right);
1235                                 free_extent_buffer(right);
1236                         }
1237                         return 0;
1238                 }
1239                 btrfs_tree_unlock(right);
1240                 free_extent_buffer(right);
1241         }
1242         return 1;
1243 }
1244
1245 /*
1246  * readahead one full node of leaves, finding things that are close
1247  * to the block in 'slot', and triggering ra on them.
1248  */
1249 static void reada_for_search(struct btrfs_fs_info *fs_info,
1250                              struct btrfs_path *path,
1251                              int level, int slot, u64 objectid)
1252 {
1253         struct extent_buffer *node;
1254         struct btrfs_disk_key disk_key;
1255         u32 nritems;
1256         u64 search;
1257         u64 target;
1258         u64 nread = 0;
1259         u64 nread_max;
1260         u32 nr;
1261         u32 blocksize;
1262         u32 nscan = 0;
1263
1264         if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1265                 return;
1266
1267         if (!path->nodes[level])
1268                 return;
1269
1270         node = path->nodes[level];
1271
1272         /*
1273          * Since the time between visiting leaves is much shorter than the time
1274          * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1275          * much IO at once (possibly random).
1276          */
1277         if (path->reada == READA_FORWARD_ALWAYS) {
1278                 if (level > 1)
1279                         nread_max = node->fs_info->nodesize;
1280                 else
1281                         nread_max = SZ_128K;
1282         } else {
1283                 nread_max = SZ_64K;
1284         }
1285
1286         search = btrfs_node_blockptr(node, slot);
1287         blocksize = fs_info->nodesize;
1288         if (path->reada != READA_FORWARD_ALWAYS) {
1289                 struct extent_buffer *eb;
1290
1291                 eb = find_extent_buffer(fs_info, search);
1292                 if (eb) {
1293                         free_extent_buffer(eb);
1294                         return;
1295                 }
1296         }
1297
1298         target = search;
1299
1300         nritems = btrfs_header_nritems(node);
1301         nr = slot;
1302
1303         while (1) {
1304                 if (path->reada == READA_BACK) {
1305                         if (nr == 0)
1306                                 break;
1307                         nr--;
1308                 } else if (path->reada == READA_FORWARD ||
1309                            path->reada == READA_FORWARD_ALWAYS) {
1310                         nr++;
1311                         if (nr >= nritems)
1312                                 break;
1313                 }
1314                 if (path->reada == READA_BACK && objectid) {
1315                         btrfs_node_key(node, &disk_key, nr);
1316                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1317                                 break;
1318                 }
1319                 search = btrfs_node_blockptr(node, nr);
1320                 if (path->reada == READA_FORWARD_ALWAYS ||
1321                     (search <= target && target - search <= 65536) ||
1322                     (search > target && search - target <= 65536)) {
1323                         btrfs_readahead_node_child(node, nr);
1324                         nread += blocksize;
1325                 }
1326                 nscan++;
1327                 if (nread > nread_max || nscan > 32)
1328                         break;
1329         }
1330 }
1331
1332 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1333 {
1334         struct extent_buffer *parent;
1335         int slot;
1336         int nritems;
1337
1338         parent = path->nodes[level + 1];
1339         if (!parent)
1340                 return;
1341
1342         nritems = btrfs_header_nritems(parent);
1343         slot = path->slots[level + 1];
1344
1345         if (slot > 0)
1346                 btrfs_readahead_node_child(parent, slot - 1);
1347         if (slot + 1 < nritems)
1348                 btrfs_readahead_node_child(parent, slot + 1);
1349 }
1350
1351
1352 /*
1353  * when we walk down the tree, it is usually safe to unlock the higher layers
1354  * in the tree.  The exceptions are when our path goes through slot 0, because
1355  * operations on the tree might require changing key pointers higher up in the
1356  * tree.
1357  *
1358  * callers might also have set path->keep_locks, which tells this code to keep
1359  * the lock if the path points to the last slot in the block.  This is part of
1360  * walking through the tree, and selecting the next slot in the higher block.
1361  *
1362  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1363  * if lowest_unlock is 1, level 0 won't be unlocked
1364  */
1365 static noinline void unlock_up(struct btrfs_path *path, int level,
1366                                int lowest_unlock, int min_write_lock_level,
1367                                int *write_lock_level)
1368 {
1369         int i;
1370         int skip_level = level;
1371         bool check_skip = true;
1372
1373         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1374                 if (!path->nodes[i])
1375                         break;
1376                 if (!path->locks[i])
1377                         break;
1378
1379                 if (check_skip) {
1380                         if (path->slots[i] == 0) {
1381                                 skip_level = i + 1;
1382                                 continue;
1383                         }
1384
1385                         if (path->keep_locks) {
1386                                 u32 nritems;
1387
1388                                 nritems = btrfs_header_nritems(path->nodes[i]);
1389                                 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1390                                         skip_level = i + 1;
1391                                         continue;
1392                                 }
1393                         }
1394                 }
1395
1396                 if (i >= lowest_unlock && i > skip_level) {
1397                         check_skip = false;
1398                         btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1399                         path->locks[i] = 0;
1400                         if (write_lock_level &&
1401                             i > min_write_lock_level &&
1402                             i <= *write_lock_level) {
1403                                 *write_lock_level = i - 1;
1404                         }
1405                 }
1406         }
1407 }
1408
1409 /*
1410  * Helper function for btrfs_search_slot() and other functions that do a search
1411  * on a btree. The goal is to find a tree block in the cache (the radix tree at
1412  * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1413  * its pages from disk.
1414  *
1415  * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1416  * whole btree search, starting again from the current root node.
1417  */
1418 static int
1419 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1420                       struct extent_buffer **eb_ret, int level, int slot,
1421                       const struct btrfs_key *key)
1422 {
1423         struct btrfs_fs_info *fs_info = root->fs_info;
1424         u64 blocknr;
1425         u64 gen;
1426         struct extent_buffer *tmp;
1427         struct btrfs_key first_key;
1428         int ret;
1429         int parent_level;
1430         bool unlock_up;
1431
1432         unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
1433         blocknr = btrfs_node_blockptr(*eb_ret, slot);
1434         gen = btrfs_node_ptr_generation(*eb_ret, slot);
1435         parent_level = btrfs_header_level(*eb_ret);
1436         btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
1437
1438         /*
1439          * If we need to read an extent buffer from disk and we are holding locks
1440          * on upper level nodes, we unlock all the upper nodes before reading the
1441          * extent buffer, and then return -EAGAIN to the caller as it needs to
1442          * restart the search. We don't release the lock on the current level
1443          * because we need to walk this node to figure out which blocks to read.
1444          */
1445         tmp = find_extent_buffer(fs_info, blocknr);
1446         if (tmp) {
1447                 if (p->reada == READA_FORWARD_ALWAYS)
1448                         reada_for_search(fs_info, p, level, slot, key->objectid);
1449
1450                 /* first we do an atomic uptodate check */
1451                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1452                         /*
1453                          * Do extra check for first_key, eb can be stale due to
1454                          * being cached, read from scrub, or have multiple
1455                          * parents (shared tree blocks).
1456                          */
1457                         if (btrfs_verify_level_key(tmp,
1458                                         parent_level - 1, &first_key, gen)) {
1459                                 free_extent_buffer(tmp);
1460                                 return -EUCLEAN;
1461                         }
1462                         *eb_ret = tmp;
1463                         return 0;
1464                 }
1465
1466                 if (p->nowait) {
1467                         free_extent_buffer(tmp);
1468                         return -EAGAIN;
1469                 }
1470
1471                 if (unlock_up)
1472                         btrfs_unlock_up_safe(p, level + 1);
1473
1474                 /* now we're allowed to do a blocking uptodate check */
1475                 ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
1476                 if (ret) {
1477                         free_extent_buffer(tmp);
1478                         btrfs_release_path(p);
1479                         return -EIO;
1480                 }
1481                 if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1482                         free_extent_buffer(tmp);
1483                         btrfs_release_path(p);
1484                         return -EUCLEAN;
1485                 }
1486
1487                 if (unlock_up)
1488                         ret = -EAGAIN;
1489
1490                 goto out;
1491         } else if (p->nowait) {
1492                 return -EAGAIN;
1493         }
1494
1495         if (unlock_up) {
1496                 btrfs_unlock_up_safe(p, level + 1);
1497                 ret = -EAGAIN;
1498         } else {
1499                 ret = 0;
1500         }
1501
1502         if (p->reada != READA_NONE)
1503                 reada_for_search(fs_info, p, level, slot, key->objectid);
1504
1505         tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
1506                               gen, parent_level - 1, &first_key);
1507         if (IS_ERR(tmp)) {
1508                 btrfs_release_path(p);
1509                 return PTR_ERR(tmp);
1510         }
1511         /*
1512          * If the read above didn't mark this buffer up to date,
1513          * it will never end up being up to date.  Set ret to EIO now
1514          * and give up so that our caller doesn't loop forever
1515          * on our EAGAINs.
1516          */
1517         if (!extent_buffer_uptodate(tmp))
1518                 ret = -EIO;
1519
1520 out:
1521         if (ret == 0) {
1522                 *eb_ret = tmp;
1523         } else {
1524                 free_extent_buffer(tmp);
1525                 btrfs_release_path(p);
1526         }
1527
1528         return ret;
1529 }
1530
1531 /*
1532  * helper function for btrfs_search_slot.  This does all of the checks
1533  * for node-level blocks and does any balancing required based on
1534  * the ins_len.
1535  *
1536  * If no extra work was required, zero is returned.  If we had to
1537  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1538  * start over
1539  */
1540 static int
1541 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1542                        struct btrfs_root *root, struct btrfs_path *p,
1543                        struct extent_buffer *b, int level, int ins_len,
1544                        int *write_lock_level)
1545 {
1546         struct btrfs_fs_info *fs_info = root->fs_info;
1547         int ret = 0;
1548
1549         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1550             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1551
1552                 if (*write_lock_level < level + 1) {
1553                         *write_lock_level = level + 1;
1554                         btrfs_release_path(p);
1555                         return -EAGAIN;
1556                 }
1557
1558                 reada_for_balance(p, level);
1559                 ret = split_node(trans, root, p, level);
1560
1561                 b = p->nodes[level];
1562         } else if (ins_len < 0 && btrfs_header_nritems(b) <
1563                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1564
1565                 if (*write_lock_level < level + 1) {
1566                         *write_lock_level = level + 1;
1567                         btrfs_release_path(p);
1568                         return -EAGAIN;
1569                 }
1570
1571                 reada_for_balance(p, level);
1572                 ret = balance_level(trans, root, p, level);
1573                 if (ret)
1574                         return ret;
1575
1576                 b = p->nodes[level];
1577                 if (!b) {
1578                         btrfs_release_path(p);
1579                         return -EAGAIN;
1580                 }
1581                 BUG_ON(btrfs_header_nritems(b) == 1);
1582         }
1583         return ret;
1584 }
1585
1586 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1587                 u64 iobjectid, u64 ioff, u8 key_type,
1588                 struct btrfs_key *found_key)
1589 {
1590         int ret;
1591         struct btrfs_key key;
1592         struct extent_buffer *eb;
1593
1594         ASSERT(path);
1595         ASSERT(found_key);
1596
1597         key.type = key_type;
1598         key.objectid = iobjectid;
1599         key.offset = ioff;
1600
1601         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1602         if (ret < 0)
1603                 return ret;
1604
1605         eb = path->nodes[0];
1606         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1607                 ret = btrfs_next_leaf(fs_root, path);
1608                 if (ret)
1609                         return ret;
1610                 eb = path->nodes[0];
1611         }
1612
1613         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1614         if (found_key->type != key.type ||
1615                         found_key->objectid != key.objectid)
1616                 return 1;
1617
1618         return 0;
1619 }
1620
1621 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1622                                                         struct btrfs_path *p,
1623                                                         int write_lock_level)
1624 {
1625         struct extent_buffer *b;
1626         int root_lock = 0;
1627         int level = 0;
1628
1629         if (p->search_commit_root) {
1630                 b = root->commit_root;
1631                 atomic_inc(&b->refs);
1632                 level = btrfs_header_level(b);
1633                 /*
1634                  * Ensure that all callers have set skip_locking when
1635                  * p->search_commit_root = 1.
1636                  */
1637                 ASSERT(p->skip_locking == 1);
1638
1639                 goto out;
1640         }
1641
1642         if (p->skip_locking) {
1643                 b = btrfs_root_node(root);
1644                 level = btrfs_header_level(b);
1645                 goto out;
1646         }
1647
1648         /* We try very hard to do read locks on the root */
1649         root_lock = BTRFS_READ_LOCK;
1650
1651         /*
1652          * If the level is set to maximum, we can skip trying to get the read
1653          * lock.
1654          */
1655         if (write_lock_level < BTRFS_MAX_LEVEL) {
1656                 /*
1657                  * We don't know the level of the root node until we actually
1658                  * have it read locked
1659                  */
1660                 if (p->nowait) {
1661                         b = btrfs_try_read_lock_root_node(root);
1662                         if (IS_ERR(b))
1663                                 return b;
1664                 } else {
1665                         b = btrfs_read_lock_root_node(root);
1666                 }
1667                 level = btrfs_header_level(b);
1668                 if (level > write_lock_level)
1669                         goto out;
1670
1671                 /* Whoops, must trade for write lock */
1672                 btrfs_tree_read_unlock(b);
1673                 free_extent_buffer(b);
1674         }
1675
1676         b = btrfs_lock_root_node(root);
1677         root_lock = BTRFS_WRITE_LOCK;
1678
1679         /* The level might have changed, check again */
1680         level = btrfs_header_level(b);
1681
1682 out:
1683         /*
1684          * The root may have failed to write out at some point, and thus is no
1685          * longer valid, return an error in this case.
1686          */
1687         if (!extent_buffer_uptodate(b)) {
1688                 if (root_lock)
1689                         btrfs_tree_unlock_rw(b, root_lock);
1690                 free_extent_buffer(b);
1691                 return ERR_PTR(-EIO);
1692         }
1693
1694         p->nodes[level] = b;
1695         if (!p->skip_locking)
1696                 p->locks[level] = root_lock;
1697         /*
1698          * Callers are responsible for dropping b's references.
1699          */
1700         return b;
1701 }
1702
1703 /*
1704  * Replace the extent buffer at the lowest level of the path with a cloned
1705  * version. The purpose is to be able to use it safely, after releasing the
1706  * commit root semaphore, even if relocation is happening in parallel, the
1707  * transaction used for relocation is committed and the extent buffer is
1708  * reallocated in the next transaction.
1709  *
1710  * This is used in a context where the caller does not prevent transaction
1711  * commits from happening, either by holding a transaction handle or holding
1712  * some lock, while it's doing searches through a commit root.
1713  * At the moment it's only used for send operations.
1714  */
1715 static int finish_need_commit_sem_search(struct btrfs_path *path)
1716 {
1717         const int i = path->lowest_level;
1718         const int slot = path->slots[i];
1719         struct extent_buffer *lowest = path->nodes[i];
1720         struct extent_buffer *clone;
1721
1722         ASSERT(path->need_commit_sem);
1723
1724         if (!lowest)
1725                 return 0;
1726
1727         lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1728
1729         clone = btrfs_clone_extent_buffer(lowest);
1730         if (!clone)
1731                 return -ENOMEM;
1732
1733         btrfs_release_path(path);
1734         path->nodes[i] = clone;
1735         path->slots[i] = slot;
1736
1737         return 0;
1738 }
1739
1740 static inline int search_for_key_slot(struct extent_buffer *eb,
1741                                       int search_low_slot,
1742                                       const struct btrfs_key *key,
1743                                       int prev_cmp,
1744                                       int *slot)
1745 {
1746         /*
1747          * If a previous call to btrfs_bin_search() on a parent node returned an
1748          * exact match (prev_cmp == 0), we can safely assume the target key will
1749          * always be at slot 0 on lower levels, since each key pointer
1750          * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1751          * subtree it points to. Thus we can skip searching lower levels.
1752          */
1753         if (prev_cmp == 0) {
1754                 *slot = 0;
1755                 return 0;
1756         }
1757
1758         return generic_bin_search(eb, search_low_slot, key, slot);
1759 }
1760
1761 static int search_leaf(struct btrfs_trans_handle *trans,
1762                        struct btrfs_root *root,
1763                        const struct btrfs_key *key,
1764                        struct btrfs_path *path,
1765                        int ins_len,
1766                        int prev_cmp)
1767 {
1768         struct extent_buffer *leaf = path->nodes[0];
1769         int leaf_free_space = -1;
1770         int search_low_slot = 0;
1771         int ret;
1772         bool do_bin_search = true;
1773
1774         /*
1775          * If we are doing an insertion, the leaf has enough free space and the
1776          * destination slot for the key is not slot 0, then we can unlock our
1777          * write lock on the parent, and any other upper nodes, before doing the
1778          * binary search on the leaf (with search_for_key_slot()), allowing other
1779          * tasks to lock the parent and any other upper nodes.
1780          */
1781         if (ins_len > 0) {
1782                 /*
1783                  * Cache the leaf free space, since we will need it later and it
1784                  * will not change until then.
1785                  */
1786                 leaf_free_space = btrfs_leaf_free_space(leaf);
1787
1788                 /*
1789                  * !path->locks[1] means we have a single node tree, the leaf is
1790                  * the root of the tree.
1791                  */
1792                 if (path->locks[1] && leaf_free_space >= ins_len) {
1793                         struct btrfs_disk_key first_key;
1794
1795                         ASSERT(btrfs_header_nritems(leaf) > 0);
1796                         btrfs_item_key(leaf, &first_key, 0);
1797
1798                         /*
1799                          * Doing the extra comparison with the first key is cheap,
1800                          * taking into account that the first key is very likely
1801                          * already in a cache line because it immediately follows
1802                          * the extent buffer's header and we have recently accessed
1803                          * the header's level field.
1804                          */
1805                         ret = comp_keys(&first_key, key);
1806                         if (ret < 0) {
1807                                 /*
1808                                  * The first key is smaller than the key we want
1809                                  * to insert, so we are safe to unlock all upper
1810                                  * nodes and we have to do the binary search.
1811                                  *
1812                                  * We do use btrfs_unlock_up_safe() and not
1813                                  * unlock_up() because the later does not unlock
1814                                  * nodes with a slot of 0 - we can safely unlock
1815                                  * any node even if its slot is 0 since in this
1816                                  * case the key does not end up at slot 0 of the
1817                                  * leaf and there's no need to split the leaf.
1818                                  */
1819                                 btrfs_unlock_up_safe(path, 1);
1820                                 search_low_slot = 1;
1821                         } else {
1822                                 /*
1823                                  * The first key is >= then the key we want to
1824                                  * insert, so we can skip the binary search as
1825                                  * the target key will be at slot 0.
1826                                  *
1827                                  * We can not unlock upper nodes when the key is
1828                                  * less than the first key, because we will need
1829                                  * to update the key at slot 0 of the parent node
1830                                  * and possibly of other upper nodes too.
1831                                  * If the key matches the first key, then we can
1832                                  * unlock all the upper nodes, using
1833                                  * btrfs_unlock_up_safe() instead of unlock_up()
1834                                  * as stated above.
1835                                  */
1836                                 if (ret == 0)
1837                                         btrfs_unlock_up_safe(path, 1);
1838                                 /*
1839                                  * ret is already 0 or 1, matching the result of
1840                                  * a btrfs_bin_search() call, so there is no need
1841                                  * to adjust it.
1842                                  */
1843                                 do_bin_search = false;
1844                                 path->slots[0] = 0;
1845                         }
1846                 }
1847         }
1848
1849         if (do_bin_search) {
1850                 ret = search_for_key_slot(leaf, search_low_slot, key,
1851                                           prev_cmp, &path->slots[0]);
1852                 if (ret < 0)
1853                         return ret;
1854         }
1855
1856         if (ins_len > 0) {
1857                 /*
1858                  * Item key already exists. In this case, if we are allowed to
1859                  * insert the item (for example, in dir_item case, item key
1860                  * collision is allowed), it will be merged with the original
1861                  * item. Only the item size grows, no new btrfs item will be
1862                  * added. If search_for_extension is not set, ins_len already
1863                  * accounts the size btrfs_item, deduct it here so leaf space
1864                  * check will be correct.
1865                  */
1866                 if (ret == 0 && !path->search_for_extension) {
1867                         ASSERT(ins_len >= sizeof(struct btrfs_item));
1868                         ins_len -= sizeof(struct btrfs_item);
1869                 }
1870
1871                 ASSERT(leaf_free_space >= 0);
1872
1873                 if (leaf_free_space < ins_len) {
1874                         int err;
1875
1876                         err = split_leaf(trans, root, key, path, ins_len,
1877                                          (ret == 0));
1878                         ASSERT(err <= 0);
1879                         if (WARN_ON(err > 0))
1880                                 err = -EUCLEAN;
1881                         if (err)
1882                                 ret = err;
1883                 }
1884         }
1885
1886         return ret;
1887 }
1888
1889 /*
1890  * btrfs_search_slot - look for a key in a tree and perform necessary
1891  * modifications to preserve tree invariants.
1892  *
1893  * @trans:      Handle of transaction, used when modifying the tree
1894  * @p:          Holds all btree nodes along the search path
1895  * @root:       The root node of the tree
1896  * @key:        The key we are looking for
1897  * @ins_len:    Indicates purpose of search:
1898  *              >0  for inserts it's size of item inserted (*)
1899  *              <0  for deletions
1900  *               0  for plain searches, not modifying the tree
1901  *
1902  *              (*) If size of item inserted doesn't include
1903  *              sizeof(struct btrfs_item), then p->search_for_extension must
1904  *              be set.
1905  * @cow:        boolean should CoW operations be performed. Must always be 1
1906  *              when modifying the tree.
1907  *
1908  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1909  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1910  *
1911  * If @key is found, 0 is returned and you can find the item in the leaf level
1912  * of the path (level 0)
1913  *
1914  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1915  * points to the slot where it should be inserted
1916  *
1917  * If an error is encountered while searching the tree a negative error number
1918  * is returned
1919  */
1920 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1921                       const struct btrfs_key *key, struct btrfs_path *p,
1922                       int ins_len, int cow)
1923 {
1924         struct btrfs_fs_info *fs_info = root->fs_info;
1925         struct extent_buffer *b;
1926         int slot;
1927         int ret;
1928         int err;
1929         int level;
1930         int lowest_unlock = 1;
1931         /* everything at write_lock_level or lower must be write locked */
1932         int write_lock_level = 0;
1933         u8 lowest_level = 0;
1934         int min_write_lock_level;
1935         int prev_cmp;
1936
1937         lowest_level = p->lowest_level;
1938         WARN_ON(lowest_level && ins_len > 0);
1939         WARN_ON(p->nodes[0] != NULL);
1940         BUG_ON(!cow && ins_len);
1941
1942         /*
1943          * For now only allow nowait for read only operations.  There's no
1944          * strict reason why we can't, we just only need it for reads so it's
1945          * only implemented for reads.
1946          */
1947         ASSERT(!p->nowait || !cow);
1948
1949         if (ins_len < 0) {
1950                 lowest_unlock = 2;
1951
1952                 /* when we are removing items, we might have to go up to level
1953                  * two as we update tree pointers  Make sure we keep write
1954                  * for those levels as well
1955                  */
1956                 write_lock_level = 2;
1957         } else if (ins_len > 0) {
1958                 /*
1959                  * for inserting items, make sure we have a write lock on
1960                  * level 1 so we can update keys
1961                  */
1962                 write_lock_level = 1;
1963         }
1964
1965         if (!cow)
1966                 write_lock_level = -1;
1967
1968         if (cow && (p->keep_locks || p->lowest_level))
1969                 write_lock_level = BTRFS_MAX_LEVEL;
1970
1971         min_write_lock_level = write_lock_level;
1972
1973         if (p->need_commit_sem) {
1974                 ASSERT(p->search_commit_root);
1975                 if (p->nowait) {
1976                         if (!down_read_trylock(&fs_info->commit_root_sem))
1977                                 return -EAGAIN;
1978                 } else {
1979                         down_read(&fs_info->commit_root_sem);
1980                 }
1981         }
1982
1983 again:
1984         prev_cmp = -1;
1985         b = btrfs_search_slot_get_root(root, p, write_lock_level);
1986         if (IS_ERR(b)) {
1987                 ret = PTR_ERR(b);
1988                 goto done;
1989         }
1990
1991         while (b) {
1992                 int dec = 0;
1993
1994                 level = btrfs_header_level(b);
1995
1996                 if (cow) {
1997                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
1998
1999                         /*
2000                          * if we don't really need to cow this block
2001                          * then we don't want to set the path blocking,
2002                          * so we test it here
2003                          */
2004                         if (!should_cow_block(trans, root, b))
2005                                 goto cow_done;
2006
2007                         /*
2008                          * must have write locks on this node and the
2009                          * parent
2010                          */
2011                         if (level > write_lock_level ||
2012                             (level + 1 > write_lock_level &&
2013                             level + 1 < BTRFS_MAX_LEVEL &&
2014                             p->nodes[level + 1])) {
2015                                 write_lock_level = level + 1;
2016                                 btrfs_release_path(p);
2017                                 goto again;
2018                         }
2019
2020                         if (last_level)
2021                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2022                                                       &b,
2023                                                       BTRFS_NESTING_COW);
2024                         else
2025                                 err = btrfs_cow_block(trans, root, b,
2026                                                       p->nodes[level + 1],
2027                                                       p->slots[level + 1], &b,
2028                                                       BTRFS_NESTING_COW);
2029                         if (err) {
2030                                 ret = err;
2031                                 goto done;
2032                         }
2033                 }
2034 cow_done:
2035                 p->nodes[level] = b;
2036
2037                 /*
2038                  * we have a lock on b and as long as we aren't changing
2039                  * the tree, there is no way to for the items in b to change.
2040                  * It is safe to drop the lock on our parent before we
2041                  * go through the expensive btree search on b.
2042                  *
2043                  * If we're inserting or deleting (ins_len != 0), then we might
2044                  * be changing slot zero, which may require changing the parent.
2045                  * So, we can't drop the lock until after we know which slot
2046                  * we're operating on.
2047                  */
2048                 if (!ins_len && !p->keep_locks) {
2049                         int u = level + 1;
2050
2051                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2052                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2053                                 p->locks[u] = 0;
2054                         }
2055                 }
2056
2057                 if (level == 0) {
2058                         if (ins_len > 0)
2059                                 ASSERT(write_lock_level >= 1);
2060
2061                         ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2062                         if (!p->search_for_split)
2063                                 unlock_up(p, level, lowest_unlock,
2064                                           min_write_lock_level, NULL);
2065                         goto done;
2066                 }
2067
2068                 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2069                 if (ret < 0)
2070                         goto done;
2071                 prev_cmp = ret;
2072
2073                 if (ret && slot > 0) {
2074                         dec = 1;
2075                         slot--;
2076                 }
2077                 p->slots[level] = slot;
2078                 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2079                                              &write_lock_level);
2080                 if (err == -EAGAIN)
2081                         goto again;
2082                 if (err) {
2083                         ret = err;
2084                         goto done;
2085                 }
2086                 b = p->nodes[level];
2087                 slot = p->slots[level];
2088
2089                 /*
2090                  * Slot 0 is special, if we change the key we have to update
2091                  * the parent pointer which means we must have a write lock on
2092                  * the parent
2093                  */
2094                 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2095                         write_lock_level = level + 1;
2096                         btrfs_release_path(p);
2097                         goto again;
2098                 }
2099
2100                 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2101                           &write_lock_level);
2102
2103                 if (level == lowest_level) {
2104                         if (dec)
2105                                 p->slots[level]++;
2106                         goto done;
2107                 }
2108
2109                 err = read_block_for_search(root, p, &b, level, slot, key);
2110                 if (err == -EAGAIN)
2111                         goto again;
2112                 if (err) {
2113                         ret = err;
2114                         goto done;
2115                 }
2116
2117                 if (!p->skip_locking) {
2118                         level = btrfs_header_level(b);
2119
2120                         btrfs_maybe_reset_lockdep_class(root, b);
2121
2122                         if (level <= write_lock_level) {
2123                                 btrfs_tree_lock(b);
2124                                 p->locks[level] = BTRFS_WRITE_LOCK;
2125                         } else {
2126                                 if (p->nowait) {
2127                                         if (!btrfs_try_tree_read_lock(b)) {
2128                                                 free_extent_buffer(b);
2129                                                 ret = -EAGAIN;
2130                                                 goto done;
2131                                         }
2132                                 } else {
2133                                         btrfs_tree_read_lock(b);
2134                                 }
2135                                 p->locks[level] = BTRFS_READ_LOCK;
2136                         }
2137                         p->nodes[level] = b;
2138                 }
2139         }
2140         ret = 1;
2141 done:
2142         if (ret < 0 && !p->skip_release_on_error)
2143                 btrfs_release_path(p);
2144
2145         if (p->need_commit_sem) {
2146                 int ret2;
2147
2148                 ret2 = finish_need_commit_sem_search(p);
2149                 up_read(&fs_info->commit_root_sem);
2150                 if (ret2)
2151                         ret = ret2;
2152         }
2153
2154         return ret;
2155 }
2156 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2157
2158 /*
2159  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2160  * current state of the tree together with the operations recorded in the tree
2161  * modification log to search for the key in a previous version of this tree, as
2162  * denoted by the time_seq parameter.
2163  *
2164  * Naturally, there is no support for insert, delete or cow operations.
2165  *
2166  * The resulting path and return value will be set up as if we called
2167  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2168  */
2169 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2170                           struct btrfs_path *p, u64 time_seq)
2171 {
2172         struct btrfs_fs_info *fs_info = root->fs_info;
2173         struct extent_buffer *b;
2174         int slot;
2175         int ret;
2176         int err;
2177         int level;
2178         int lowest_unlock = 1;
2179         u8 lowest_level = 0;
2180
2181         lowest_level = p->lowest_level;
2182         WARN_ON(p->nodes[0] != NULL);
2183         ASSERT(!p->nowait);
2184
2185         if (p->search_commit_root) {
2186                 BUG_ON(time_seq);
2187                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2188         }
2189
2190 again:
2191         b = btrfs_get_old_root(root, time_seq);
2192         if (!b) {
2193                 ret = -EIO;
2194                 goto done;
2195         }
2196         level = btrfs_header_level(b);
2197         p->locks[level] = BTRFS_READ_LOCK;
2198
2199         while (b) {
2200                 int dec = 0;
2201
2202                 level = btrfs_header_level(b);
2203                 p->nodes[level] = b;
2204
2205                 /*
2206                  * we have a lock on b and as long as we aren't changing
2207                  * the tree, there is no way to for the items in b to change.
2208                  * It is safe to drop the lock on our parent before we
2209                  * go through the expensive btree search on b.
2210                  */
2211                 btrfs_unlock_up_safe(p, level + 1);
2212
2213                 ret = btrfs_bin_search(b, key, &slot);
2214                 if (ret < 0)
2215                         goto done;
2216
2217                 if (level == 0) {
2218                         p->slots[level] = slot;
2219                         unlock_up(p, level, lowest_unlock, 0, NULL);
2220                         goto done;
2221                 }
2222
2223                 if (ret && slot > 0) {
2224                         dec = 1;
2225                         slot--;
2226                 }
2227                 p->slots[level] = slot;
2228                 unlock_up(p, level, lowest_unlock, 0, NULL);
2229
2230                 if (level == lowest_level) {
2231                         if (dec)
2232                                 p->slots[level]++;
2233                         goto done;
2234                 }
2235
2236                 err = read_block_for_search(root, p, &b, level, slot, key);
2237                 if (err == -EAGAIN)
2238                         goto again;
2239                 if (err) {
2240                         ret = err;
2241                         goto done;
2242                 }
2243
2244                 level = btrfs_header_level(b);
2245                 btrfs_tree_read_lock(b);
2246                 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2247                 if (!b) {
2248                         ret = -ENOMEM;
2249                         goto done;
2250                 }
2251                 p->locks[level] = BTRFS_READ_LOCK;
2252                 p->nodes[level] = b;
2253         }
2254         ret = 1;
2255 done:
2256         if (ret < 0)
2257                 btrfs_release_path(p);
2258
2259         return ret;
2260 }
2261
2262 /*
2263  * helper to use instead of search slot if no exact match is needed but
2264  * instead the next or previous item should be returned.
2265  * When find_higher is true, the next higher item is returned, the next lower
2266  * otherwise.
2267  * When return_any and find_higher are both true, and no higher item is found,
2268  * return the next lower instead.
2269  * When return_any is true and find_higher is false, and no lower item is found,
2270  * return the next higher instead.
2271  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2272  * < 0 on error
2273  */
2274 int btrfs_search_slot_for_read(struct btrfs_root *root,
2275                                const struct btrfs_key *key,
2276                                struct btrfs_path *p, int find_higher,
2277                                int return_any)
2278 {
2279         int ret;
2280         struct extent_buffer *leaf;
2281
2282 again:
2283         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2284         if (ret <= 0)
2285                 return ret;
2286         /*
2287          * a return value of 1 means the path is at the position where the
2288          * item should be inserted. Normally this is the next bigger item,
2289          * but in case the previous item is the last in a leaf, path points
2290          * to the first free slot in the previous leaf, i.e. at an invalid
2291          * item.
2292          */
2293         leaf = p->nodes[0];
2294
2295         if (find_higher) {
2296                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2297                         ret = btrfs_next_leaf(root, p);
2298                         if (ret <= 0)
2299                                 return ret;
2300                         if (!return_any)
2301                                 return 1;
2302                         /*
2303                          * no higher item found, return the next
2304                          * lower instead
2305                          */
2306                         return_any = 0;
2307                         find_higher = 0;
2308                         btrfs_release_path(p);
2309                         goto again;
2310                 }
2311         } else {
2312                 if (p->slots[0] == 0) {
2313                         ret = btrfs_prev_leaf(root, p);
2314                         if (ret < 0)
2315                                 return ret;
2316                         if (!ret) {
2317                                 leaf = p->nodes[0];
2318                                 if (p->slots[0] == btrfs_header_nritems(leaf))
2319                                         p->slots[0]--;
2320                                 return 0;
2321                         }
2322                         if (!return_any)
2323                                 return 1;
2324                         /*
2325                          * no lower item found, return the next
2326                          * higher instead
2327                          */
2328                         return_any = 0;
2329                         find_higher = 1;
2330                         btrfs_release_path(p);
2331                         goto again;
2332                 } else {
2333                         --p->slots[0];
2334                 }
2335         }
2336         return 0;
2337 }
2338
2339 /*
2340  * Execute search and call btrfs_previous_item to traverse backwards if the item
2341  * was not found.
2342  *
2343  * Return 0 if found, 1 if not found and < 0 if error.
2344  */
2345 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2346                            struct btrfs_path *path)
2347 {
2348         int ret;
2349
2350         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2351         if (ret > 0)
2352                 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2353
2354         if (ret == 0)
2355                 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2356
2357         return ret;
2358 }
2359
2360 /**
2361  * Search for a valid slot for the given path.
2362  *
2363  * @root:       The root node of the tree.
2364  * @key:        Will contain a valid item if found.
2365  * @path:       The starting point to validate the slot.
2366  *
2367  * Return: 0  if the item is valid
2368  *         1  if not found
2369  *         <0 if error.
2370  */
2371 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2372                               struct btrfs_path *path)
2373 {
2374         while (1) {
2375                 int ret;
2376                 const int slot = path->slots[0];
2377                 const struct extent_buffer *leaf = path->nodes[0];
2378
2379                 /* This is where we start walking the path. */
2380                 if (slot >= btrfs_header_nritems(leaf)) {
2381                         /*
2382                          * If we've reached the last slot in this leaf we need
2383                          * to go to the next leaf and reset the path.
2384                          */
2385                         ret = btrfs_next_leaf(root, path);
2386                         if (ret)
2387                                 return ret;
2388                         continue;
2389                 }
2390                 /* Store the found, valid item in @key. */
2391                 btrfs_item_key_to_cpu(leaf, key, slot);
2392                 break;
2393         }
2394         return 0;
2395 }
2396
2397 /*
2398  * adjust the pointers going up the tree, starting at level
2399  * making sure the right key of each node is points to 'key'.
2400  * This is used after shifting pointers to the left, so it stops
2401  * fixing up pointers when a given leaf/node is not in slot 0 of the
2402  * higher levels
2403  *
2404  */
2405 static void fixup_low_keys(struct btrfs_path *path,
2406                            struct btrfs_disk_key *key, int level)
2407 {
2408         int i;
2409         struct extent_buffer *t;
2410         int ret;
2411
2412         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2413                 int tslot = path->slots[i];
2414
2415                 if (!path->nodes[i])
2416                         break;
2417                 t = path->nodes[i];
2418                 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2419                                 BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2420                 BUG_ON(ret < 0);
2421                 btrfs_set_node_key(t, key, tslot);
2422                 btrfs_mark_buffer_dirty(path->nodes[i]);
2423                 if (tslot != 0)
2424                         break;
2425         }
2426 }
2427
2428 /*
2429  * update item key.
2430  *
2431  * This function isn't completely safe. It's the caller's responsibility
2432  * that the new key won't break the order
2433  */
2434 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2435                              struct btrfs_path *path,
2436                              const struct btrfs_key *new_key)
2437 {
2438         struct btrfs_disk_key disk_key;
2439         struct extent_buffer *eb;
2440         int slot;
2441
2442         eb = path->nodes[0];
2443         slot = path->slots[0];
2444         if (slot > 0) {
2445                 btrfs_item_key(eb, &disk_key, slot - 1);
2446                 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2447                         btrfs_crit(fs_info,
2448                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2449                                    slot, btrfs_disk_key_objectid(&disk_key),
2450                                    btrfs_disk_key_type(&disk_key),
2451                                    btrfs_disk_key_offset(&disk_key),
2452                                    new_key->objectid, new_key->type,
2453                                    new_key->offset);
2454                         btrfs_print_leaf(eb);
2455                         BUG();
2456                 }
2457         }
2458         if (slot < btrfs_header_nritems(eb) - 1) {
2459                 btrfs_item_key(eb, &disk_key, slot + 1);
2460                 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2461                         btrfs_crit(fs_info,
2462                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2463                                    slot, btrfs_disk_key_objectid(&disk_key),
2464                                    btrfs_disk_key_type(&disk_key),
2465                                    btrfs_disk_key_offset(&disk_key),
2466                                    new_key->objectid, new_key->type,
2467                                    new_key->offset);
2468                         btrfs_print_leaf(eb);
2469                         BUG();
2470                 }
2471         }
2472
2473         btrfs_cpu_key_to_disk(&disk_key, new_key);
2474         btrfs_set_item_key(eb, &disk_key, slot);
2475         btrfs_mark_buffer_dirty(eb);
2476         if (slot == 0)
2477                 fixup_low_keys(path, &disk_key, 1);
2478 }
2479
2480 /*
2481  * Check key order of two sibling extent buffers.
2482  *
2483  * Return true if something is wrong.
2484  * Return false if everything is fine.
2485  *
2486  * Tree-checker only works inside one tree block, thus the following
2487  * corruption can not be detected by tree-checker:
2488  *
2489  * Leaf @left                   | Leaf @right
2490  * --------------------------------------------------------------
2491  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2492  *
2493  * Key f6 in leaf @left itself is valid, but not valid when the next
2494  * key in leaf @right is 7.
2495  * This can only be checked at tree block merge time.
2496  * And since tree checker has ensured all key order in each tree block
2497  * is correct, we only need to bother the last key of @left and the first
2498  * key of @right.
2499  */
2500 static bool check_sibling_keys(struct extent_buffer *left,
2501                                struct extent_buffer *right)
2502 {
2503         struct btrfs_key left_last;
2504         struct btrfs_key right_first;
2505         int level = btrfs_header_level(left);
2506         int nr_left = btrfs_header_nritems(left);
2507         int nr_right = btrfs_header_nritems(right);
2508
2509         /* No key to check in one of the tree blocks */
2510         if (!nr_left || !nr_right)
2511                 return false;
2512
2513         if (level) {
2514                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2515                 btrfs_node_key_to_cpu(right, &right_first, 0);
2516         } else {
2517                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2518                 btrfs_item_key_to_cpu(right, &right_first, 0);
2519         }
2520
2521         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2522                 btrfs_crit(left->fs_info,
2523 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2524                            left_last.objectid, left_last.type,
2525                            left_last.offset, right_first.objectid,
2526                            right_first.type, right_first.offset);
2527                 return true;
2528         }
2529         return false;
2530 }
2531
2532 /*
2533  * try to push data from one node into the next node left in the
2534  * tree.
2535  *
2536  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2537  * error, and > 0 if there was no room in the left hand block.
2538  */
2539 static int push_node_left(struct btrfs_trans_handle *trans,
2540                           struct extent_buffer *dst,
2541                           struct extent_buffer *src, int empty)
2542 {
2543         struct btrfs_fs_info *fs_info = trans->fs_info;
2544         int push_items = 0;
2545         int src_nritems;
2546         int dst_nritems;
2547         int ret = 0;
2548
2549         src_nritems = btrfs_header_nritems(src);
2550         dst_nritems = btrfs_header_nritems(dst);
2551         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2552         WARN_ON(btrfs_header_generation(src) != trans->transid);
2553         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2554
2555         if (!empty && src_nritems <= 8)
2556                 return 1;
2557
2558         if (push_items <= 0)
2559                 return 1;
2560
2561         if (empty) {
2562                 push_items = min(src_nritems, push_items);
2563                 if (push_items < src_nritems) {
2564                         /* leave at least 8 pointers in the node if
2565                          * we aren't going to empty it
2566                          */
2567                         if (src_nritems - push_items < 8) {
2568                                 if (push_items <= 8)
2569                                         return 1;
2570                                 push_items -= 8;
2571                         }
2572                 }
2573         } else
2574                 push_items = min(src_nritems - 8, push_items);
2575
2576         /* dst is the left eb, src is the middle eb */
2577         if (check_sibling_keys(dst, src)) {
2578                 ret = -EUCLEAN;
2579                 btrfs_abort_transaction(trans, ret);
2580                 return ret;
2581         }
2582         ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2583         if (ret) {
2584                 btrfs_abort_transaction(trans, ret);
2585                 return ret;
2586         }
2587         copy_extent_buffer(dst, src,
2588                            btrfs_node_key_ptr_offset(dst_nritems),
2589                            btrfs_node_key_ptr_offset(0),
2590                            push_items * sizeof(struct btrfs_key_ptr));
2591
2592         if (push_items < src_nritems) {
2593                 /*
2594                  * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2595                  * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
2596                  */
2597                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2598                                       btrfs_node_key_ptr_offset(push_items),
2599                                       (src_nritems - push_items) *
2600                                       sizeof(struct btrfs_key_ptr));
2601         }
2602         btrfs_set_header_nritems(src, src_nritems - push_items);
2603         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2604         btrfs_mark_buffer_dirty(src);
2605         btrfs_mark_buffer_dirty(dst);
2606
2607         return ret;
2608 }
2609
2610 /*
2611  * try to push data from one node into the next node right in the
2612  * tree.
2613  *
2614  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2615  * error, and > 0 if there was no room in the right hand block.
2616  *
2617  * this will  only push up to 1/2 the contents of the left node over
2618  */
2619 static int balance_node_right(struct btrfs_trans_handle *trans,
2620                               struct extent_buffer *dst,
2621                               struct extent_buffer *src)
2622 {
2623         struct btrfs_fs_info *fs_info = trans->fs_info;
2624         int push_items = 0;
2625         int max_push;
2626         int src_nritems;
2627         int dst_nritems;
2628         int ret = 0;
2629
2630         WARN_ON(btrfs_header_generation(src) != trans->transid);
2631         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2632
2633         src_nritems = btrfs_header_nritems(src);
2634         dst_nritems = btrfs_header_nritems(dst);
2635         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2636         if (push_items <= 0)
2637                 return 1;
2638
2639         if (src_nritems < 4)
2640                 return 1;
2641
2642         max_push = src_nritems / 2 + 1;
2643         /* don't try to empty the node */
2644         if (max_push >= src_nritems)
2645                 return 1;
2646
2647         if (max_push < push_items)
2648                 push_items = max_push;
2649
2650         /* dst is the right eb, src is the middle eb */
2651         if (check_sibling_keys(src, dst)) {
2652                 ret = -EUCLEAN;
2653                 btrfs_abort_transaction(trans, ret);
2654                 return ret;
2655         }
2656         ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2657         BUG_ON(ret < 0);
2658         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2659                                       btrfs_node_key_ptr_offset(0),
2660                                       (dst_nritems) *
2661                                       sizeof(struct btrfs_key_ptr));
2662
2663         ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2664                                          push_items);
2665         if (ret) {
2666                 btrfs_abort_transaction(trans, ret);
2667                 return ret;
2668         }
2669         copy_extent_buffer(dst, src,
2670                            btrfs_node_key_ptr_offset(0),
2671                            btrfs_node_key_ptr_offset(src_nritems - push_items),
2672                            push_items * sizeof(struct btrfs_key_ptr));
2673
2674         btrfs_set_header_nritems(src, src_nritems - push_items);
2675         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2676
2677         btrfs_mark_buffer_dirty(src);
2678         btrfs_mark_buffer_dirty(dst);
2679
2680         return ret;
2681 }
2682
2683 /*
2684  * helper function to insert a new root level in the tree.
2685  * A new node is allocated, and a single item is inserted to
2686  * point to the existing root
2687  *
2688  * returns zero on success or < 0 on failure.
2689  */
2690 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2691                            struct btrfs_root *root,
2692                            struct btrfs_path *path, int level)
2693 {
2694         struct btrfs_fs_info *fs_info = root->fs_info;
2695         u64 lower_gen;
2696         struct extent_buffer *lower;
2697         struct extent_buffer *c;
2698         struct extent_buffer *old;
2699         struct btrfs_disk_key lower_key;
2700         int ret;
2701
2702         BUG_ON(path->nodes[level]);
2703         BUG_ON(path->nodes[level-1] != root->node);
2704
2705         lower = path->nodes[level-1];
2706         if (level == 1)
2707                 btrfs_item_key(lower, &lower_key, 0);
2708         else
2709                 btrfs_node_key(lower, &lower_key, 0);
2710
2711         c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2712                                    &lower_key, level, root->node->start, 0,
2713                                    BTRFS_NESTING_NEW_ROOT);
2714         if (IS_ERR(c))
2715                 return PTR_ERR(c);
2716
2717         root_add_used(root, fs_info->nodesize);
2718
2719         btrfs_set_header_nritems(c, 1);
2720         btrfs_set_node_key(c, &lower_key, 0);
2721         btrfs_set_node_blockptr(c, 0, lower->start);
2722         lower_gen = btrfs_header_generation(lower);
2723         WARN_ON(lower_gen != trans->transid);
2724
2725         btrfs_set_node_ptr_generation(c, 0, lower_gen);
2726
2727         btrfs_mark_buffer_dirty(c);
2728
2729         old = root->node;
2730         ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2731         BUG_ON(ret < 0);
2732         rcu_assign_pointer(root->node, c);
2733
2734         /* the super has an extra ref to root->node */
2735         free_extent_buffer(old);
2736
2737         add_root_to_dirty_list(root);
2738         atomic_inc(&c->refs);
2739         path->nodes[level] = c;
2740         path->locks[level] = BTRFS_WRITE_LOCK;
2741         path->slots[level] = 0;
2742         return 0;
2743 }
2744
2745 /*
2746  * worker function to insert a single pointer in a node.
2747  * the node should have enough room for the pointer already
2748  *
2749  * slot and level indicate where you want the key to go, and
2750  * blocknr is the block the key points to.
2751  */
2752 static void insert_ptr(struct btrfs_trans_handle *trans,
2753                        struct btrfs_path *path,
2754                        struct btrfs_disk_key *key, u64 bytenr,
2755                        int slot, int level)
2756 {
2757         struct extent_buffer *lower;
2758         int nritems;
2759         int ret;
2760
2761         BUG_ON(!path->nodes[level]);
2762         btrfs_assert_tree_write_locked(path->nodes[level]);
2763         lower = path->nodes[level];
2764         nritems = btrfs_header_nritems(lower);
2765         BUG_ON(slot > nritems);
2766         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2767         if (slot != nritems) {
2768                 if (level) {
2769                         ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2770                                         slot, nritems - slot);
2771                         BUG_ON(ret < 0);
2772                 }
2773                 memmove_extent_buffer(lower,
2774                               btrfs_node_key_ptr_offset(slot + 1),
2775                               btrfs_node_key_ptr_offset(slot),
2776                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2777         }
2778         if (level) {
2779                 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2780                                             BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
2781                 BUG_ON(ret < 0);
2782         }
2783         btrfs_set_node_key(lower, key, slot);
2784         btrfs_set_node_blockptr(lower, slot, bytenr);
2785         WARN_ON(trans->transid == 0);
2786         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2787         btrfs_set_header_nritems(lower, nritems + 1);
2788         btrfs_mark_buffer_dirty(lower);
2789 }
2790
2791 /*
2792  * split the node at the specified level in path in two.
2793  * The path is corrected to point to the appropriate node after the split
2794  *
2795  * Before splitting this tries to make some room in the node by pushing
2796  * left and right, if either one works, it returns right away.
2797  *
2798  * returns 0 on success and < 0 on failure
2799  */
2800 static noinline int split_node(struct btrfs_trans_handle *trans,
2801                                struct btrfs_root *root,
2802                                struct btrfs_path *path, int level)
2803 {
2804         struct btrfs_fs_info *fs_info = root->fs_info;
2805         struct extent_buffer *c;
2806         struct extent_buffer *split;
2807         struct btrfs_disk_key disk_key;
2808         int mid;
2809         int ret;
2810         u32 c_nritems;
2811
2812         c = path->nodes[level];
2813         WARN_ON(btrfs_header_generation(c) != trans->transid);
2814         if (c == root->node) {
2815                 /*
2816                  * trying to split the root, lets make a new one
2817                  *
2818                  * tree mod log: We don't log_removal old root in
2819                  * insert_new_root, because that root buffer will be kept as a
2820                  * normal node. We are going to log removal of half of the
2821                  * elements below with btrfs_tree_mod_log_eb_copy(). We're
2822                  * holding a tree lock on the buffer, which is why we cannot
2823                  * race with other tree_mod_log users.
2824                  */
2825                 ret = insert_new_root(trans, root, path, level + 1);
2826                 if (ret)
2827                         return ret;
2828         } else {
2829                 ret = push_nodes_for_insert(trans, root, path, level);
2830                 c = path->nodes[level];
2831                 if (!ret && btrfs_header_nritems(c) <
2832                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2833                         return 0;
2834                 if (ret < 0)
2835                         return ret;
2836         }
2837
2838         c_nritems = btrfs_header_nritems(c);
2839         mid = (c_nritems + 1) / 2;
2840         btrfs_node_key(c, &disk_key, mid);
2841
2842         split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2843                                        &disk_key, level, c->start, 0,
2844                                        BTRFS_NESTING_SPLIT);
2845         if (IS_ERR(split))
2846                 return PTR_ERR(split);
2847
2848         root_add_used(root, fs_info->nodesize);
2849         ASSERT(btrfs_header_level(c) == level);
2850
2851         ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2852         if (ret) {
2853                 btrfs_abort_transaction(trans, ret);
2854                 return ret;
2855         }
2856         copy_extent_buffer(split, c,
2857                            btrfs_node_key_ptr_offset(0),
2858                            btrfs_node_key_ptr_offset(mid),
2859                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2860         btrfs_set_header_nritems(split, c_nritems - mid);
2861         btrfs_set_header_nritems(c, mid);
2862
2863         btrfs_mark_buffer_dirty(c);
2864         btrfs_mark_buffer_dirty(split);
2865
2866         insert_ptr(trans, path, &disk_key, split->start,
2867                    path->slots[level + 1] + 1, level + 1);
2868
2869         if (path->slots[level] >= mid) {
2870                 path->slots[level] -= mid;
2871                 btrfs_tree_unlock(c);
2872                 free_extent_buffer(c);
2873                 path->nodes[level] = split;
2874                 path->slots[level + 1] += 1;
2875         } else {
2876                 btrfs_tree_unlock(split);
2877                 free_extent_buffer(split);
2878         }
2879         return 0;
2880 }
2881
2882 /*
2883  * how many bytes are required to store the items in a leaf.  start
2884  * and nr indicate which items in the leaf to check.  This totals up the
2885  * space used both by the item structs and the item data
2886  */
2887 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2888 {
2889         int data_len;
2890         int nritems = btrfs_header_nritems(l);
2891         int end = min(nritems, start + nr) - 1;
2892
2893         if (!nr)
2894                 return 0;
2895         data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
2896         data_len = data_len - btrfs_item_offset(l, end);
2897         data_len += sizeof(struct btrfs_item) * nr;
2898         WARN_ON(data_len < 0);
2899         return data_len;
2900 }
2901
2902 /*
2903  * The space between the end of the leaf items and
2904  * the start of the leaf data.  IOW, how much room
2905  * the leaf has left for both items and data
2906  */
2907 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2908 {
2909         struct btrfs_fs_info *fs_info = leaf->fs_info;
2910         int nritems = btrfs_header_nritems(leaf);
2911         int ret;
2912
2913         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2914         if (ret < 0) {
2915                 btrfs_crit(fs_info,
2916                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2917                            ret,
2918                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2919                            leaf_space_used(leaf, 0, nritems), nritems);
2920         }
2921         return ret;
2922 }
2923
2924 /*
2925  * min slot controls the lowest index we're willing to push to the
2926  * right.  We'll push up to and including min_slot, but no lower
2927  */
2928 static noinline int __push_leaf_right(struct btrfs_path *path,
2929                                       int data_size, int empty,
2930                                       struct extent_buffer *right,
2931                                       int free_space, u32 left_nritems,
2932                                       u32 min_slot)
2933 {
2934         struct btrfs_fs_info *fs_info = right->fs_info;
2935         struct extent_buffer *left = path->nodes[0];
2936         struct extent_buffer *upper = path->nodes[1];
2937         struct btrfs_map_token token;
2938         struct btrfs_disk_key disk_key;
2939         int slot;
2940         u32 i;
2941         int push_space = 0;
2942         int push_items = 0;
2943         u32 nr;
2944         u32 right_nritems;
2945         u32 data_end;
2946         u32 this_item_size;
2947
2948         if (empty)
2949                 nr = 0;
2950         else
2951                 nr = max_t(u32, 1, min_slot);
2952
2953         if (path->slots[0] >= left_nritems)
2954                 push_space += data_size;
2955
2956         slot = path->slots[1];
2957         i = left_nritems - 1;
2958         while (i >= nr) {
2959                 if (!empty && push_items > 0) {
2960                         if (path->slots[0] > i)
2961                                 break;
2962                         if (path->slots[0] == i) {
2963                                 int space = btrfs_leaf_free_space(left);
2964
2965                                 if (space + push_space * 2 > free_space)
2966                                         break;
2967                         }
2968                 }
2969
2970                 if (path->slots[0] == i)
2971                         push_space += data_size;
2972
2973                 this_item_size = btrfs_item_size(left, i);
2974                 if (this_item_size + sizeof(struct btrfs_item) +
2975                     push_space > free_space)
2976                         break;
2977
2978                 push_items++;
2979                 push_space += this_item_size + sizeof(struct btrfs_item);
2980                 if (i == 0)
2981                         break;
2982                 i--;
2983         }
2984
2985         if (push_items == 0)
2986                 goto out_unlock;
2987
2988         WARN_ON(!empty && push_items == left_nritems);
2989
2990         /* push left to right */
2991         right_nritems = btrfs_header_nritems(right);
2992
2993         push_space = btrfs_item_data_end(left, left_nritems - push_items);
2994         push_space -= leaf_data_end(left);
2995
2996         /* make room in the right data area */
2997         data_end = leaf_data_end(right);
2998         memmove_extent_buffer(right,
2999                               BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3000                               BTRFS_LEAF_DATA_OFFSET + data_end,
3001                               BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3002
3003         /* copy from the left data area */
3004         copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3005                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3006                      BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3007                      push_space);
3008
3009         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3010                               btrfs_item_nr_offset(0),
3011                               right_nritems * sizeof(struct btrfs_item));
3012
3013         /* copy the items from left to right */
3014         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3015                    btrfs_item_nr_offset(left_nritems - push_items),
3016                    push_items * sizeof(struct btrfs_item));
3017
3018         /* update the item pointers */
3019         btrfs_init_map_token(&token, right);
3020         right_nritems += push_items;
3021         btrfs_set_header_nritems(right, right_nritems);
3022         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3023         for (i = 0; i < right_nritems; i++) {
3024                 push_space -= btrfs_token_item_size(&token, i);
3025                 btrfs_set_token_item_offset(&token, i, push_space);
3026         }
3027
3028         left_nritems -= push_items;
3029         btrfs_set_header_nritems(left, left_nritems);
3030
3031         if (left_nritems)
3032                 btrfs_mark_buffer_dirty(left);
3033         else
3034                 btrfs_clean_tree_block(left);
3035
3036         btrfs_mark_buffer_dirty(right);
3037
3038         btrfs_item_key(right, &disk_key, 0);
3039         btrfs_set_node_key(upper, &disk_key, slot + 1);
3040         btrfs_mark_buffer_dirty(upper);
3041
3042         /* then fixup the leaf pointer in the path */
3043         if (path->slots[0] >= left_nritems) {
3044                 path->slots[0] -= left_nritems;
3045                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3046                         btrfs_clean_tree_block(path->nodes[0]);
3047                 btrfs_tree_unlock(path->nodes[0]);
3048                 free_extent_buffer(path->nodes[0]);
3049                 path->nodes[0] = right;
3050                 path->slots[1] += 1;
3051         } else {
3052                 btrfs_tree_unlock(right);
3053                 free_extent_buffer(right);
3054         }
3055         return 0;
3056
3057 out_unlock:
3058         btrfs_tree_unlock(right);
3059         free_extent_buffer(right);
3060         return 1;
3061 }
3062
3063 /*
3064  * push some data in the path leaf to the right, trying to free up at
3065  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3066  *
3067  * returns 1 if the push failed because the other node didn't have enough
3068  * room, 0 if everything worked out and < 0 if there were major errors.
3069  *
3070  * this will push starting from min_slot to the end of the leaf.  It won't
3071  * push any slot lower than min_slot
3072  */
3073 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3074                            *root, struct btrfs_path *path,
3075                            int min_data_size, int data_size,
3076                            int empty, u32 min_slot)
3077 {
3078         struct extent_buffer *left = path->nodes[0];
3079         struct extent_buffer *right;
3080         struct extent_buffer *upper;
3081         int slot;
3082         int free_space;
3083         u32 left_nritems;
3084         int ret;
3085
3086         if (!path->nodes[1])
3087                 return 1;
3088
3089         slot = path->slots[1];
3090         upper = path->nodes[1];
3091         if (slot >= btrfs_header_nritems(upper) - 1)
3092                 return 1;
3093
3094         btrfs_assert_tree_write_locked(path->nodes[1]);
3095
3096         right = btrfs_read_node_slot(upper, slot + 1);
3097         /*
3098          * slot + 1 is not valid or we fail to read the right node,
3099          * no big deal, just return.
3100          */
3101         if (IS_ERR(right))
3102                 return 1;
3103
3104         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3105
3106         free_space = btrfs_leaf_free_space(right);
3107         if (free_space < data_size)
3108                 goto out_unlock;
3109
3110         ret = btrfs_cow_block(trans, root, right, upper,
3111                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3112         if (ret)
3113                 goto out_unlock;
3114
3115         left_nritems = btrfs_header_nritems(left);
3116         if (left_nritems == 0)
3117                 goto out_unlock;
3118
3119         if (check_sibling_keys(left, right)) {
3120                 ret = -EUCLEAN;
3121                 btrfs_tree_unlock(right);
3122                 free_extent_buffer(right);
3123                 return ret;
3124         }
3125         if (path->slots[0] == left_nritems && !empty) {
3126                 /* Key greater than all keys in the leaf, right neighbor has
3127                  * enough room for it and we're not emptying our leaf to delete
3128                  * it, therefore use right neighbor to insert the new item and
3129                  * no need to touch/dirty our left leaf. */
3130                 btrfs_tree_unlock(left);
3131                 free_extent_buffer(left);
3132                 path->nodes[0] = right;
3133                 path->slots[0] = 0;
3134                 path->slots[1]++;
3135                 return 0;
3136         }
3137
3138         return __push_leaf_right(path, min_data_size, empty,
3139                                 right, free_space, left_nritems, min_slot);
3140 out_unlock:
3141         btrfs_tree_unlock(right);
3142         free_extent_buffer(right);
3143         return 1;
3144 }
3145
3146 /*
3147  * push some data in the path leaf to the left, trying to free up at
3148  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3149  *
3150  * max_slot can put a limit on how far into the leaf we'll push items.  The
3151  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3152  * items
3153  */
3154 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3155                                      int empty, struct extent_buffer *left,
3156                                      int free_space, u32 right_nritems,
3157                                      u32 max_slot)
3158 {
3159         struct btrfs_fs_info *fs_info = left->fs_info;
3160         struct btrfs_disk_key disk_key;
3161         struct extent_buffer *right = path->nodes[0];
3162         int i;
3163         int push_space = 0;
3164         int push_items = 0;
3165         u32 old_left_nritems;
3166         u32 nr;
3167         int ret = 0;
3168         u32 this_item_size;
3169         u32 old_left_item_size;
3170         struct btrfs_map_token token;
3171
3172         if (empty)
3173                 nr = min(right_nritems, max_slot);
3174         else
3175                 nr = min(right_nritems - 1, max_slot);
3176
3177         for (i = 0; i < nr; i++) {
3178                 if (!empty && push_items > 0) {
3179                         if (path->slots[0] < i)
3180                                 break;
3181                         if (path->slots[0] == i) {
3182                                 int space = btrfs_leaf_free_space(right);
3183
3184                                 if (space + push_space * 2 > free_space)
3185                                         break;
3186                         }
3187                 }
3188
3189                 if (path->slots[0] == i)
3190                         push_space += data_size;
3191
3192                 this_item_size = btrfs_item_size(right, i);
3193                 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3194                     free_space)
3195                         break;
3196
3197                 push_items++;
3198                 push_space += this_item_size + sizeof(struct btrfs_item);
3199         }
3200
3201         if (push_items == 0) {
3202                 ret = 1;
3203                 goto out;
3204         }
3205         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3206
3207         /* push data from right to left */
3208         copy_extent_buffer(left, right,
3209                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3210                            btrfs_item_nr_offset(0),
3211                            push_items * sizeof(struct btrfs_item));
3212
3213         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3214                      btrfs_item_offset(right, push_items - 1);
3215
3216         copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3217                      leaf_data_end(left) - push_space,
3218                      BTRFS_LEAF_DATA_OFFSET +
3219                      btrfs_item_offset(right, push_items - 1),
3220                      push_space);
3221         old_left_nritems = btrfs_header_nritems(left);
3222         BUG_ON(old_left_nritems <= 0);
3223
3224         btrfs_init_map_token(&token, left);
3225         old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3226         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3227                 u32 ioff;
3228
3229                 ioff = btrfs_token_item_offset(&token, i);
3230                 btrfs_set_token_item_offset(&token, i,
3231                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3232         }
3233         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3234
3235         /* fixup right node */
3236         if (push_items > right_nritems)
3237                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3238                        right_nritems);
3239
3240         if (push_items < right_nritems) {
3241                 push_space = btrfs_item_offset(right, push_items - 1) -
3242                                                   leaf_data_end(right);
3243                 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3244                                       BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3245                                       BTRFS_LEAF_DATA_OFFSET +
3246                                       leaf_data_end(right), push_space);
3247
3248                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3249                               btrfs_item_nr_offset(push_items),
3250                              (btrfs_header_nritems(right) - push_items) *
3251                              sizeof(struct btrfs_item));
3252         }
3253
3254         btrfs_init_map_token(&token, right);
3255         right_nritems -= push_items;
3256         btrfs_set_header_nritems(right, right_nritems);
3257         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3258         for (i = 0; i < right_nritems; i++) {
3259                 push_space = push_space - btrfs_token_item_size(&token, i);
3260                 btrfs_set_token_item_offset(&token, i, push_space);
3261         }
3262
3263         btrfs_mark_buffer_dirty(left);
3264         if (right_nritems)
3265                 btrfs_mark_buffer_dirty(right);
3266         else
3267                 btrfs_clean_tree_block(right);
3268
3269         btrfs_item_key(right, &disk_key, 0);
3270         fixup_low_keys(path, &disk_key, 1);
3271
3272         /* then fixup the leaf pointer in the path */
3273         if (path->slots[0] < push_items) {
3274                 path->slots[0] += old_left_nritems;
3275                 btrfs_tree_unlock(path->nodes[0]);
3276                 free_extent_buffer(path->nodes[0]);
3277                 path->nodes[0] = left;
3278                 path->slots[1] -= 1;
3279         } else {
3280                 btrfs_tree_unlock(left);
3281                 free_extent_buffer(left);
3282                 path->slots[0] -= push_items;
3283         }
3284         BUG_ON(path->slots[0] < 0);
3285         return ret;
3286 out:
3287         btrfs_tree_unlock(left);
3288         free_extent_buffer(left);
3289         return ret;
3290 }
3291
3292 /*
3293  * push some data in the path leaf to the left, trying to free up at
3294  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3295  *
3296  * max_slot can put a limit on how far into the leaf we'll push items.  The
3297  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3298  * items
3299  */
3300 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3301                           *root, struct btrfs_path *path, int min_data_size,
3302                           int data_size, int empty, u32 max_slot)
3303 {
3304         struct extent_buffer *right = path->nodes[0];
3305         struct extent_buffer *left;
3306         int slot;
3307         int free_space;
3308         u32 right_nritems;
3309         int ret = 0;
3310
3311         slot = path->slots[1];
3312         if (slot == 0)
3313                 return 1;
3314         if (!path->nodes[1])
3315                 return 1;
3316
3317         right_nritems = btrfs_header_nritems(right);
3318         if (right_nritems == 0)
3319                 return 1;
3320
3321         btrfs_assert_tree_write_locked(path->nodes[1]);
3322
3323         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3324         /*
3325          * slot - 1 is not valid or we fail to read the left node,
3326          * no big deal, just return.
3327          */
3328         if (IS_ERR(left))
3329                 return 1;
3330
3331         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3332
3333         free_space = btrfs_leaf_free_space(left);
3334         if (free_space < data_size) {
3335                 ret = 1;
3336                 goto out;
3337         }
3338
3339         ret = btrfs_cow_block(trans, root, left,
3340                               path->nodes[1], slot - 1, &left,
3341                               BTRFS_NESTING_LEFT_COW);
3342         if (ret) {
3343                 /* we hit -ENOSPC, but it isn't fatal here */
3344                 if (ret == -ENOSPC)
3345                         ret = 1;
3346                 goto out;
3347         }
3348
3349         if (check_sibling_keys(left, right)) {
3350                 ret = -EUCLEAN;
3351                 goto out;
3352         }
3353         return __push_leaf_left(path, min_data_size,
3354                                empty, left, free_space, right_nritems,
3355                                max_slot);
3356 out:
3357         btrfs_tree_unlock(left);
3358         free_extent_buffer(left);
3359         return ret;
3360 }
3361
3362 /*
3363  * split the path's leaf in two, making sure there is at least data_size
3364  * available for the resulting leaf level of the path.
3365  */
3366 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3367                                     struct btrfs_path *path,
3368                                     struct extent_buffer *l,
3369                                     struct extent_buffer *right,
3370                                     int slot, int mid, int nritems)
3371 {
3372         struct btrfs_fs_info *fs_info = trans->fs_info;
3373         int data_copy_size;
3374         int rt_data_off;
3375         int i;
3376         struct btrfs_disk_key disk_key;
3377         struct btrfs_map_token token;
3378
3379         nritems = nritems - mid;
3380         btrfs_set_header_nritems(right, nritems);
3381         data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3382
3383         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3384                            btrfs_item_nr_offset(mid),
3385                            nritems * sizeof(struct btrfs_item));
3386
3387         copy_extent_buffer(right, l,
3388                      BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3389                      data_copy_size, BTRFS_LEAF_DATA_OFFSET +
3390                      leaf_data_end(l), data_copy_size);
3391
3392         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3393
3394         btrfs_init_map_token(&token, right);
3395         for (i = 0; i < nritems; i++) {
3396                 u32 ioff;
3397
3398                 ioff = btrfs_token_item_offset(&token, i);
3399                 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
3400         }
3401
3402         btrfs_set_header_nritems(l, mid);
3403         btrfs_item_key(right, &disk_key, 0);
3404         insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3405
3406         btrfs_mark_buffer_dirty(right);
3407         btrfs_mark_buffer_dirty(l);
3408         BUG_ON(path->slots[0] != slot);
3409
3410         if (mid <= slot) {
3411                 btrfs_tree_unlock(path->nodes[0]);
3412                 free_extent_buffer(path->nodes[0]);
3413                 path->nodes[0] = right;
3414                 path->slots[0] -= mid;
3415                 path->slots[1] += 1;
3416         } else {
3417                 btrfs_tree_unlock(right);
3418                 free_extent_buffer(right);
3419         }
3420
3421         BUG_ON(path->slots[0] < 0);
3422 }
3423
3424 /*
3425  * double splits happen when we need to insert a big item in the middle
3426  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3427  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3428  *          A                 B                 C
3429  *
3430  * We avoid this by trying to push the items on either side of our target
3431  * into the adjacent leaves.  If all goes well we can avoid the double split
3432  * completely.
3433  */
3434 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3435                                           struct btrfs_root *root,
3436                                           struct btrfs_path *path,
3437                                           int data_size)
3438 {
3439         int ret;
3440         int progress = 0;
3441         int slot;
3442         u32 nritems;
3443         int space_needed = data_size;
3444
3445         slot = path->slots[0];
3446         if (slot < btrfs_header_nritems(path->nodes[0]))
3447                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3448
3449         /*
3450          * try to push all the items after our slot into the
3451          * right leaf
3452          */
3453         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3454         if (ret < 0)
3455                 return ret;
3456
3457         if (ret == 0)
3458                 progress++;
3459
3460         nritems = btrfs_header_nritems(path->nodes[0]);
3461         /*
3462          * our goal is to get our slot at the start or end of a leaf.  If
3463          * we've done so we're done
3464          */
3465         if (path->slots[0] == 0 || path->slots[0] == nritems)
3466                 return 0;
3467
3468         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3469                 return 0;
3470
3471         /* try to push all the items before our slot into the next leaf */
3472         slot = path->slots[0];
3473         space_needed = data_size;
3474         if (slot > 0)
3475                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3476         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3477         if (ret < 0)
3478                 return ret;
3479
3480         if (ret == 0)
3481                 progress++;
3482
3483         if (progress)
3484                 return 0;
3485         return 1;
3486 }
3487
3488 /*
3489  * split the path's leaf in two, making sure there is at least data_size
3490  * available for the resulting leaf level of the path.
3491  *
3492  * returns 0 if all went well and < 0 on failure.
3493  */
3494 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3495                                struct btrfs_root *root,
3496                                const struct btrfs_key *ins_key,
3497                                struct btrfs_path *path, int data_size,
3498                                int extend)
3499 {
3500         struct btrfs_disk_key disk_key;
3501         struct extent_buffer *l;
3502         u32 nritems;
3503         int mid;
3504         int slot;
3505         struct extent_buffer *right;
3506         struct btrfs_fs_info *fs_info = root->fs_info;
3507         int ret = 0;
3508         int wret;
3509         int split;
3510         int num_doubles = 0;
3511         int tried_avoid_double = 0;
3512
3513         l = path->nodes[0];
3514         slot = path->slots[0];
3515         if (extend && data_size + btrfs_item_size(l, slot) +
3516             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3517                 return -EOVERFLOW;
3518
3519         /* first try to make some room by pushing left and right */
3520         if (data_size && path->nodes[1]) {
3521                 int space_needed = data_size;
3522
3523                 if (slot < btrfs_header_nritems(l))
3524                         space_needed -= btrfs_leaf_free_space(l);
3525
3526                 wret = push_leaf_right(trans, root, path, space_needed,
3527                                        space_needed, 0, 0);
3528                 if (wret < 0)
3529                         return wret;
3530                 if (wret) {
3531                         space_needed = data_size;
3532                         if (slot > 0)
3533                                 space_needed -= btrfs_leaf_free_space(l);
3534                         wret = push_leaf_left(trans, root, path, space_needed,
3535                                               space_needed, 0, (u32)-1);
3536                         if (wret < 0)
3537                                 return wret;
3538                 }
3539                 l = path->nodes[0];
3540
3541                 /* did the pushes work? */
3542                 if (btrfs_leaf_free_space(l) >= data_size)
3543                         return 0;
3544         }
3545
3546         if (!path->nodes[1]) {
3547                 ret = insert_new_root(trans, root, path, 1);
3548                 if (ret)
3549                         return ret;
3550         }
3551 again:
3552         split = 1;
3553         l = path->nodes[0];
3554         slot = path->slots[0];
3555         nritems = btrfs_header_nritems(l);
3556         mid = (nritems + 1) / 2;
3557
3558         if (mid <= slot) {
3559                 if (nritems == 1 ||
3560                     leaf_space_used(l, mid, nritems - mid) + data_size >
3561                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3562                         if (slot >= nritems) {
3563                                 split = 0;
3564                         } else {
3565                                 mid = slot;
3566                                 if (mid != nritems &&
3567                                     leaf_space_used(l, mid, nritems - mid) +
3568                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3569                                         if (data_size && !tried_avoid_double)
3570                                                 goto push_for_double;
3571                                         split = 2;
3572                                 }
3573                         }
3574                 }
3575         } else {
3576                 if (leaf_space_used(l, 0, mid) + data_size >
3577                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3578                         if (!extend && data_size && slot == 0) {
3579                                 split = 0;
3580                         } else if ((extend || !data_size) && slot == 0) {
3581                                 mid = 1;
3582                         } else {
3583                                 mid = slot;
3584                                 if (mid != nritems &&
3585                                     leaf_space_used(l, mid, nritems - mid) +
3586                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3587                                         if (data_size && !tried_avoid_double)
3588                                                 goto push_for_double;
3589                                         split = 2;
3590                                 }
3591                         }
3592                 }
3593         }
3594
3595         if (split == 0)
3596                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3597         else
3598                 btrfs_item_key(l, &disk_key, mid);
3599
3600         /*
3601          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3602          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3603          * subclasses, which is 8 at the time of this patch, and we've maxed it
3604          * out.  In the future we could add a
3605          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3606          * use BTRFS_NESTING_NEW_ROOT.
3607          */
3608         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3609                                        &disk_key, 0, l->start, 0,
3610                                        num_doubles ? BTRFS_NESTING_NEW_ROOT :
3611                                        BTRFS_NESTING_SPLIT);
3612         if (IS_ERR(right))
3613                 return PTR_ERR(right);
3614
3615         root_add_used(root, fs_info->nodesize);
3616
3617         if (split == 0) {
3618                 if (mid <= slot) {
3619                         btrfs_set_header_nritems(right, 0);
3620                         insert_ptr(trans, path, &disk_key,
3621                                    right->start, path->slots[1] + 1, 1);
3622                         btrfs_tree_unlock(path->nodes[0]);
3623                         free_extent_buffer(path->nodes[0]);
3624                         path->nodes[0] = right;
3625                         path->slots[0] = 0;
3626                         path->slots[1] += 1;
3627                 } else {
3628                         btrfs_set_header_nritems(right, 0);
3629                         insert_ptr(trans, path, &disk_key,
3630                                    right->start, path->slots[1], 1);
3631                         btrfs_tree_unlock(path->nodes[0]);
3632                         free_extent_buffer(path->nodes[0]);
3633                         path->nodes[0] = right;
3634                         path->slots[0] = 0;
3635                         if (path->slots[1] == 0)
3636                                 fixup_low_keys(path, &disk_key, 1);
3637                 }
3638                 /*
3639                  * We create a new leaf 'right' for the required ins_len and
3640                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3641                  * the content of ins_len to 'right'.
3642                  */
3643                 return ret;
3644         }
3645
3646         copy_for_split(trans, path, l, right, slot, mid, nritems);
3647
3648         if (split == 2) {
3649                 BUG_ON(num_doubles != 0);
3650                 num_doubles++;
3651                 goto again;
3652         }
3653
3654         return 0;
3655
3656 push_for_double:
3657         push_for_double_split(trans, root, path, data_size);
3658         tried_avoid_double = 1;
3659         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3660                 return 0;
3661         goto again;
3662 }
3663
3664 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3665                                          struct btrfs_root *root,
3666                                          struct btrfs_path *path, int ins_len)
3667 {
3668         struct btrfs_key key;
3669         struct extent_buffer *leaf;
3670         struct btrfs_file_extent_item *fi;
3671         u64 extent_len = 0;
3672         u32 item_size;
3673         int ret;
3674
3675         leaf = path->nodes[0];
3676         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3677
3678         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3679                key.type != BTRFS_EXTENT_CSUM_KEY);
3680
3681         if (btrfs_leaf_free_space(leaf) >= ins_len)
3682                 return 0;
3683
3684         item_size = btrfs_item_size(leaf, path->slots[0]);
3685         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3686                 fi = btrfs_item_ptr(leaf, path->slots[0],
3687                                     struct btrfs_file_extent_item);
3688                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3689         }
3690         btrfs_release_path(path);
3691
3692         path->keep_locks = 1;
3693         path->search_for_split = 1;
3694         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3695         path->search_for_split = 0;
3696         if (ret > 0)
3697                 ret = -EAGAIN;
3698         if (ret < 0)
3699                 goto err;
3700
3701         ret = -EAGAIN;
3702         leaf = path->nodes[0];
3703         /* if our item isn't there, return now */
3704         if (item_size != btrfs_item_size(leaf, path->slots[0]))
3705                 goto err;
3706
3707         /* the leaf has  changed, it now has room.  return now */
3708         if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3709                 goto err;
3710
3711         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3712                 fi = btrfs_item_ptr(leaf, path->slots[0],
3713                                     struct btrfs_file_extent_item);
3714                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3715                         goto err;
3716         }
3717
3718         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3719         if (ret)
3720                 goto err;
3721
3722         path->keep_locks = 0;
3723         btrfs_unlock_up_safe(path, 1);
3724         return 0;
3725 err:
3726         path->keep_locks = 0;
3727         return ret;
3728 }
3729
3730 static noinline int split_item(struct btrfs_path *path,
3731                                const struct btrfs_key *new_key,
3732                                unsigned long split_offset)
3733 {
3734         struct extent_buffer *leaf;
3735         int orig_slot, slot;
3736         char *buf;
3737         u32 nritems;
3738         u32 item_size;
3739         u32 orig_offset;
3740         struct btrfs_disk_key disk_key;
3741
3742         leaf = path->nodes[0];
3743         BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
3744
3745         orig_slot = path->slots[0];
3746         orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3747         item_size = btrfs_item_size(leaf, path->slots[0]);
3748
3749         buf = kmalloc(item_size, GFP_NOFS);
3750         if (!buf)
3751                 return -ENOMEM;
3752
3753         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3754                             path->slots[0]), item_size);
3755
3756         slot = path->slots[0] + 1;
3757         nritems = btrfs_header_nritems(leaf);
3758         if (slot != nritems) {
3759                 /* shift the items */
3760                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3761                                 btrfs_item_nr_offset(slot),
3762                                 (nritems - slot) * sizeof(struct btrfs_item));
3763         }
3764
3765         btrfs_cpu_key_to_disk(&disk_key, new_key);
3766         btrfs_set_item_key(leaf, &disk_key, slot);
3767
3768         btrfs_set_item_offset(leaf, slot, orig_offset);
3769         btrfs_set_item_size(leaf, slot, item_size - split_offset);
3770
3771         btrfs_set_item_offset(leaf, orig_slot,
3772                                  orig_offset + item_size - split_offset);
3773         btrfs_set_item_size(leaf, orig_slot, split_offset);
3774
3775         btrfs_set_header_nritems(leaf, nritems + 1);
3776
3777         /* write the data for the start of the original item */
3778         write_extent_buffer(leaf, buf,
3779                             btrfs_item_ptr_offset(leaf, path->slots[0]),
3780                             split_offset);
3781
3782         /* write the data for the new item */
3783         write_extent_buffer(leaf, buf + split_offset,
3784                             btrfs_item_ptr_offset(leaf, slot),
3785                             item_size - split_offset);
3786         btrfs_mark_buffer_dirty(leaf);
3787
3788         BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3789         kfree(buf);
3790         return 0;
3791 }
3792
3793 /*
3794  * This function splits a single item into two items,
3795  * giving 'new_key' to the new item and splitting the
3796  * old one at split_offset (from the start of the item).
3797  *
3798  * The path may be released by this operation.  After
3799  * the split, the path is pointing to the old item.  The
3800  * new item is going to be in the same node as the old one.
3801  *
3802  * Note, the item being split must be smaller enough to live alone on
3803  * a tree block with room for one extra struct btrfs_item
3804  *
3805  * This allows us to split the item in place, keeping a lock on the
3806  * leaf the entire time.
3807  */
3808 int btrfs_split_item(struct btrfs_trans_handle *trans,
3809                      struct btrfs_root *root,
3810                      struct btrfs_path *path,
3811                      const struct btrfs_key *new_key,
3812                      unsigned long split_offset)
3813 {
3814         int ret;
3815         ret = setup_leaf_for_split(trans, root, path,
3816                                    sizeof(struct btrfs_item));
3817         if (ret)
3818                 return ret;
3819
3820         ret = split_item(path, new_key, split_offset);
3821         return ret;
3822 }
3823
3824 /*
3825  * make the item pointed to by the path smaller.  new_size indicates
3826  * how small to make it, and from_end tells us if we just chop bytes
3827  * off the end of the item or if we shift the item to chop bytes off
3828  * the front.
3829  */
3830 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3831 {
3832         int slot;
3833         struct extent_buffer *leaf;
3834         u32 nritems;
3835         unsigned int data_end;
3836         unsigned int old_data_start;
3837         unsigned int old_size;
3838         unsigned int size_diff;
3839         int i;
3840         struct btrfs_map_token token;
3841
3842         leaf = path->nodes[0];
3843         slot = path->slots[0];
3844
3845         old_size = btrfs_item_size(leaf, slot);
3846         if (old_size == new_size)
3847                 return;
3848
3849         nritems = btrfs_header_nritems(leaf);
3850         data_end = leaf_data_end(leaf);
3851
3852         old_data_start = btrfs_item_offset(leaf, slot);
3853
3854         size_diff = old_size - new_size;
3855
3856         BUG_ON(slot < 0);
3857         BUG_ON(slot >= nritems);
3858
3859         /*
3860          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3861          */
3862         /* first correct the data pointers */
3863         btrfs_init_map_token(&token, leaf);
3864         for (i = slot; i < nritems; i++) {
3865                 u32 ioff;
3866
3867                 ioff = btrfs_token_item_offset(&token, i);
3868                 btrfs_set_token_item_offset(&token, i, ioff + size_diff);
3869         }
3870
3871         /* shift the data */
3872         if (from_end) {
3873                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3874                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3875                               data_end, old_data_start + new_size - data_end);
3876         } else {
3877                 struct btrfs_disk_key disk_key;
3878                 u64 offset;
3879
3880                 btrfs_item_key(leaf, &disk_key, slot);
3881
3882                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3883                         unsigned long ptr;
3884                         struct btrfs_file_extent_item *fi;
3885
3886                         fi = btrfs_item_ptr(leaf, slot,
3887                                             struct btrfs_file_extent_item);
3888                         fi = (struct btrfs_file_extent_item *)(
3889                              (unsigned long)fi - size_diff);
3890
3891                         if (btrfs_file_extent_type(leaf, fi) ==
3892                             BTRFS_FILE_EXTENT_INLINE) {
3893                                 ptr = btrfs_item_ptr_offset(leaf, slot);
3894                                 memmove_extent_buffer(leaf, ptr,
3895                                       (unsigned long)fi,
3896                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
3897                         }
3898                 }
3899
3900                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3901                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3902                               data_end, old_data_start - data_end);
3903
3904                 offset = btrfs_disk_key_offset(&disk_key);
3905                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3906                 btrfs_set_item_key(leaf, &disk_key, slot);
3907                 if (slot == 0)
3908                         fixup_low_keys(path, &disk_key, 1);
3909         }
3910
3911         btrfs_set_item_size(leaf, slot, new_size);
3912         btrfs_mark_buffer_dirty(leaf);
3913
3914         if (btrfs_leaf_free_space(leaf) < 0) {
3915                 btrfs_print_leaf(leaf);
3916                 BUG();
3917         }
3918 }
3919
3920 /*
3921  * make the item pointed to by the path bigger, data_size is the added size.
3922  */
3923 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
3924 {
3925         int slot;
3926         struct extent_buffer *leaf;
3927         u32 nritems;
3928         unsigned int data_end;
3929         unsigned int old_data;
3930         unsigned int old_size;
3931         int i;
3932         struct btrfs_map_token token;
3933
3934         leaf = path->nodes[0];
3935
3936         nritems = btrfs_header_nritems(leaf);
3937         data_end = leaf_data_end(leaf);
3938
3939         if (btrfs_leaf_free_space(leaf) < data_size) {
3940                 btrfs_print_leaf(leaf);
3941                 BUG();
3942         }
3943         slot = path->slots[0];
3944         old_data = btrfs_item_data_end(leaf, slot);
3945
3946         BUG_ON(slot < 0);
3947         if (slot >= nritems) {
3948                 btrfs_print_leaf(leaf);
3949                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
3950                            slot, nritems);
3951                 BUG();
3952         }
3953
3954         /*
3955          * item0..itemN ... dataN.offset..dataN.size .. data0.size
3956          */
3957         /* first correct the data pointers */
3958         btrfs_init_map_token(&token, leaf);
3959         for (i = slot; i < nritems; i++) {
3960                 u32 ioff;
3961
3962                 ioff = btrfs_token_item_offset(&token, i);
3963                 btrfs_set_token_item_offset(&token, i, ioff - data_size);
3964         }
3965
3966         /* shift the data */
3967         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3968                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
3969                       data_end, old_data - data_end);
3970
3971         data_end = old_data;
3972         old_size = btrfs_item_size(leaf, slot);
3973         btrfs_set_item_size(leaf, slot, old_size + data_size);
3974         btrfs_mark_buffer_dirty(leaf);
3975
3976         if (btrfs_leaf_free_space(leaf) < 0) {
3977                 btrfs_print_leaf(leaf);
3978                 BUG();
3979         }
3980 }
3981
3982 /**
3983  * setup_items_for_insert - Helper called before inserting one or more items
3984  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3985  * in a function that doesn't call btrfs_search_slot
3986  *
3987  * @root:       root we are inserting items to
3988  * @path:       points to the leaf/slot where we are going to insert new items
3989  * @batch:      information about the batch of items to insert
3990  */
3991 static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3992                                    const struct btrfs_item_batch *batch)
3993 {
3994         struct btrfs_fs_info *fs_info = root->fs_info;
3995         int i;
3996         u32 nritems;
3997         unsigned int data_end;
3998         struct btrfs_disk_key disk_key;
3999         struct extent_buffer *leaf;
4000         int slot;
4001         struct btrfs_map_token token;
4002         u32 total_size;
4003
4004         /*
4005          * Before anything else, update keys in the parent and other ancestors
4006          * if needed, then release the write locks on them, so that other tasks
4007          * can use them while we modify the leaf.
4008          */
4009         if (path->slots[0] == 0) {
4010                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4011                 fixup_low_keys(path, &disk_key, 1);
4012         }
4013         btrfs_unlock_up_safe(path, 1);
4014
4015         leaf = path->nodes[0];
4016         slot = path->slots[0];
4017
4018         nritems = btrfs_header_nritems(leaf);
4019         data_end = leaf_data_end(leaf);
4020         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4021
4022         if (btrfs_leaf_free_space(leaf) < total_size) {
4023                 btrfs_print_leaf(leaf);
4024                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4025                            total_size, btrfs_leaf_free_space(leaf));
4026                 BUG();
4027         }
4028
4029         btrfs_init_map_token(&token, leaf);
4030         if (slot != nritems) {
4031                 unsigned int old_data = btrfs_item_data_end(leaf, slot);
4032
4033                 if (old_data < data_end) {
4034                         btrfs_print_leaf(leaf);
4035                         btrfs_crit(fs_info,
4036                 "item at slot %d with data offset %u beyond data end of leaf %u",
4037                                    slot, old_data, data_end);
4038                         BUG();
4039                 }
4040                 /*
4041                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4042                  */
4043                 /* first correct the data pointers */
4044                 for (i = slot; i < nritems; i++) {
4045                         u32 ioff;
4046
4047                         ioff = btrfs_token_item_offset(&token, i);
4048                         btrfs_set_token_item_offset(&token, i,
4049                                                        ioff - batch->total_data_size);
4050                 }
4051                 /* shift the items */
4052                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
4053                               btrfs_item_nr_offset(slot),
4054                               (nritems - slot) * sizeof(struct btrfs_item));
4055
4056                 /* shift the data */
4057                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4058                                       data_end - batch->total_data_size,
4059                                       BTRFS_LEAF_DATA_OFFSET + data_end,
4060                                       old_data - data_end);
4061                 data_end = old_data;
4062         }
4063
4064         /* setup the item for the new data */
4065         for (i = 0; i < batch->nr; i++) {
4066                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4067                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4068                 data_end -= batch->data_sizes[i];
4069                 btrfs_set_token_item_offset(&token, slot + i, data_end);
4070                 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
4071         }
4072
4073         btrfs_set_header_nritems(leaf, nritems + batch->nr);
4074         btrfs_mark_buffer_dirty(leaf);
4075
4076         if (btrfs_leaf_free_space(leaf) < 0) {
4077                 btrfs_print_leaf(leaf);
4078                 BUG();
4079         }
4080 }
4081
4082 /*
4083  * Insert a new item into a leaf.
4084  *
4085  * @root:      The root of the btree.
4086  * @path:      A path pointing to the target leaf and slot.
4087  * @key:       The key of the new item.
4088  * @data_size: The size of the data associated with the new key.
4089  */
4090 void btrfs_setup_item_for_insert(struct btrfs_root *root,
4091                                  struct btrfs_path *path,
4092                                  const struct btrfs_key *key,
4093                                  u32 data_size)
4094 {
4095         struct btrfs_item_batch batch;
4096
4097         batch.keys = key;
4098         batch.data_sizes = &data_size;
4099         batch.total_data_size = data_size;
4100         batch.nr = 1;
4101
4102         setup_items_for_insert(root, path, &batch);
4103 }
4104
4105 /*
4106  * Given a key and some data, insert items into the tree.
4107  * This does all the path init required, making room in the tree if needed.
4108  */
4109 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4110                             struct btrfs_root *root,
4111                             struct btrfs_path *path,
4112                             const struct btrfs_item_batch *batch)
4113 {
4114         int ret = 0;
4115         int slot;
4116         u32 total_size;
4117
4118         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4119         ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4120         if (ret == 0)
4121                 return -EEXIST;
4122         if (ret < 0)
4123                 return ret;
4124
4125         slot = path->slots[0];
4126         BUG_ON(slot < 0);
4127
4128         setup_items_for_insert(root, path, batch);
4129         return 0;
4130 }
4131
4132 /*
4133  * Given a key and some data, insert an item into the tree.
4134  * This does all the path init required, making room in the tree if needed.
4135  */
4136 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4137                       const struct btrfs_key *cpu_key, void *data,
4138                       u32 data_size)
4139 {
4140         int ret = 0;
4141         struct btrfs_path *path;
4142         struct extent_buffer *leaf;
4143         unsigned long ptr;
4144
4145         path = btrfs_alloc_path();
4146         if (!path)
4147                 return -ENOMEM;
4148         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4149         if (!ret) {
4150                 leaf = path->nodes[0];
4151                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4152                 write_extent_buffer(leaf, data, ptr, data_size);
4153                 btrfs_mark_buffer_dirty(leaf);
4154         }
4155         btrfs_free_path(path);
4156         return ret;
4157 }
4158
4159 /*
4160  * This function duplicates an item, giving 'new_key' to the new item.
4161  * It guarantees both items live in the same tree leaf and the new item is
4162  * contiguous with the original item.
4163  *
4164  * This allows us to split a file extent in place, keeping a lock on the leaf
4165  * the entire time.
4166  */
4167 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4168                          struct btrfs_root *root,
4169                          struct btrfs_path *path,
4170                          const struct btrfs_key *new_key)
4171 {
4172         struct extent_buffer *leaf;
4173         int ret;
4174         u32 item_size;
4175
4176         leaf = path->nodes[0];
4177         item_size = btrfs_item_size(leaf, path->slots[0]);
4178         ret = setup_leaf_for_split(trans, root, path,
4179                                    item_size + sizeof(struct btrfs_item));
4180         if (ret)
4181                 return ret;
4182
4183         path->slots[0]++;
4184         btrfs_setup_item_for_insert(root, path, new_key, item_size);
4185         leaf = path->nodes[0];
4186         memcpy_extent_buffer(leaf,
4187                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4188                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4189                              item_size);
4190         return 0;
4191 }
4192
4193 /*
4194  * delete the pointer from a given node.
4195  *
4196  * the tree should have been previously balanced so the deletion does not
4197  * empty a node.
4198  */
4199 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4200                     int level, int slot)
4201 {
4202         struct extent_buffer *parent = path->nodes[level];
4203         u32 nritems;
4204         int ret;
4205
4206         nritems = btrfs_header_nritems(parent);
4207         if (slot != nritems - 1) {
4208                 if (level) {
4209                         ret = btrfs_tree_mod_log_insert_move(parent, slot,
4210                                         slot + 1, nritems - slot - 1);
4211                         BUG_ON(ret < 0);
4212                 }
4213                 memmove_extent_buffer(parent,
4214                               btrfs_node_key_ptr_offset(slot),
4215                               btrfs_node_key_ptr_offset(slot + 1),
4216                               sizeof(struct btrfs_key_ptr) *
4217                               (nritems - slot - 1));
4218         } else if (level) {
4219                 ret = btrfs_tree_mod_log_insert_key(parent, slot,
4220                                 BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
4221                 BUG_ON(ret < 0);
4222         }
4223
4224         nritems--;
4225         btrfs_set_header_nritems(parent, nritems);
4226         if (nritems == 0 && parent == root->node) {
4227                 BUG_ON(btrfs_header_level(root->node) != 1);
4228                 /* just turn the root into a leaf and break */
4229                 btrfs_set_header_level(root->node, 0);
4230         } else if (slot == 0) {
4231                 struct btrfs_disk_key disk_key;
4232
4233                 btrfs_node_key(parent, &disk_key, 0);
4234                 fixup_low_keys(path, &disk_key, level + 1);
4235         }
4236         btrfs_mark_buffer_dirty(parent);
4237 }
4238
4239 /*
4240  * a helper function to delete the leaf pointed to by path->slots[1] and
4241  * path->nodes[1].
4242  *
4243  * This deletes the pointer in path->nodes[1] and frees the leaf
4244  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4245  *
4246  * The path must have already been setup for deleting the leaf, including
4247  * all the proper balancing.  path->nodes[1] must be locked.
4248  */
4249 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4250                                     struct btrfs_root *root,
4251                                     struct btrfs_path *path,
4252                                     struct extent_buffer *leaf)
4253 {
4254         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4255         del_ptr(root, path, 1, path->slots[1]);
4256
4257         /*
4258          * btrfs_free_extent is expensive, we want to make sure we
4259          * aren't holding any locks when we call it
4260          */
4261         btrfs_unlock_up_safe(path, 0);
4262
4263         root_sub_used(root, leaf->len);
4264
4265         atomic_inc(&leaf->refs);
4266         btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4267         free_extent_buffer_stale(leaf);
4268 }
4269 /*
4270  * delete the item at the leaf level in path.  If that empties
4271  * the leaf, remove it from the tree
4272  */
4273 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4274                     struct btrfs_path *path, int slot, int nr)
4275 {
4276         struct btrfs_fs_info *fs_info = root->fs_info;
4277         struct extent_buffer *leaf;
4278         int ret = 0;
4279         int wret;
4280         u32 nritems;
4281
4282         leaf = path->nodes[0];
4283         nritems = btrfs_header_nritems(leaf);
4284
4285         if (slot + nr != nritems) {
4286                 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4287                 const int data_end = leaf_data_end(leaf);
4288                 struct btrfs_map_token token;
4289                 u32 dsize = 0;
4290                 int i;
4291
4292                 for (i = 0; i < nr; i++)
4293                         dsize += btrfs_item_size(leaf, slot + i);
4294
4295                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4296                               data_end + dsize,
4297                               BTRFS_LEAF_DATA_OFFSET + data_end,
4298                               last_off - data_end);
4299
4300                 btrfs_init_map_token(&token, leaf);
4301                 for (i = slot + nr; i < nritems; i++) {
4302                         u32 ioff;
4303
4304                         ioff = btrfs_token_item_offset(&token, i);
4305                         btrfs_set_token_item_offset(&token, i, ioff + dsize);
4306                 }
4307
4308                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4309                               btrfs_item_nr_offset(slot + nr),
4310                               sizeof(struct btrfs_item) *
4311                               (nritems - slot - nr));
4312         }
4313         btrfs_set_header_nritems(leaf, nritems - nr);
4314         nritems -= nr;
4315
4316         /* delete the leaf if we've emptied it */
4317         if (nritems == 0) {
4318                 if (leaf == root->node) {
4319                         btrfs_set_header_level(leaf, 0);
4320                 } else {
4321                         btrfs_clean_tree_block(leaf);
4322                         btrfs_del_leaf(trans, root, path, leaf);
4323                 }
4324         } else {
4325                 int used = leaf_space_used(leaf, 0, nritems);
4326                 if (slot == 0) {
4327                         struct btrfs_disk_key disk_key;
4328
4329                         btrfs_item_key(leaf, &disk_key, 0);
4330                         fixup_low_keys(path, &disk_key, 1);
4331                 }
4332
4333                 /*
4334                  * Try to delete the leaf if it is mostly empty. We do this by
4335                  * trying to move all its items into its left and right neighbours.
4336                  * If we can't move all the items, then we don't delete it - it's
4337                  * not ideal, but future insertions might fill the leaf with more
4338                  * items, or items from other leaves might be moved later into our
4339                  * leaf due to deletions on those leaves.
4340                  */
4341                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4342                         u32 min_push_space;
4343
4344                         /* push_leaf_left fixes the path.
4345                          * make sure the path still points to our leaf
4346                          * for possible call to del_ptr below
4347                          */
4348                         slot = path->slots[1];
4349                         atomic_inc(&leaf->refs);
4350                         /*
4351                          * We want to be able to at least push one item to the
4352                          * left neighbour leaf, and that's the first item.
4353                          */
4354                         min_push_space = sizeof(struct btrfs_item) +
4355                                 btrfs_item_size(leaf, 0);
4356                         wret = push_leaf_left(trans, root, path, 0,
4357                                               min_push_space, 1, (u32)-1);
4358                         if (wret < 0 && wret != -ENOSPC)
4359                                 ret = wret;
4360
4361                         if (path->nodes[0] == leaf &&
4362                             btrfs_header_nritems(leaf)) {
4363                                 /*
4364                                  * If we were not able to push all items from our
4365                                  * leaf to its left neighbour, then attempt to
4366                                  * either push all the remaining items to the
4367                                  * right neighbour or none. There's no advantage
4368                                  * in pushing only some items, instead of all, as
4369                                  * it's pointless to end up with a leaf having
4370                                  * too few items while the neighbours can be full
4371                                  * or nearly full.
4372                                  */
4373                                 nritems = btrfs_header_nritems(leaf);
4374                                 min_push_space = leaf_space_used(leaf, 0, nritems);
4375                                 wret = push_leaf_right(trans, root, path, 0,
4376                                                        min_push_space, 1, 0);
4377                                 if (wret < 0 && wret != -ENOSPC)
4378                                         ret = wret;
4379                         }
4380
4381                         if (btrfs_header_nritems(leaf) == 0) {
4382                                 path->slots[1] = slot;
4383                                 btrfs_del_leaf(trans, root, path, leaf);
4384                                 free_extent_buffer(leaf);
4385                                 ret = 0;
4386                         } else {
4387                                 /* if we're still in the path, make sure
4388                                  * we're dirty.  Otherwise, one of the
4389                                  * push_leaf functions must have already
4390                                  * dirtied this buffer
4391                                  */
4392                                 if (path->nodes[0] == leaf)
4393                                         btrfs_mark_buffer_dirty(leaf);
4394                                 free_extent_buffer(leaf);
4395                         }
4396                 } else {
4397                         btrfs_mark_buffer_dirty(leaf);
4398                 }
4399         }
4400         return ret;
4401 }
4402
4403 /*
4404  * search the tree again to find a leaf with lesser keys
4405  * returns 0 if it found something or 1 if there are no lesser leaves.
4406  * returns < 0 on io errors.
4407  *
4408  * This may release the path, and so you may lose any locks held at the
4409  * time you call it.
4410  */
4411 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4412 {
4413         struct btrfs_key key;
4414         struct btrfs_disk_key found_key;
4415         int ret;
4416
4417         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4418
4419         if (key.offset > 0) {
4420                 key.offset--;
4421         } else if (key.type > 0) {
4422                 key.type--;
4423                 key.offset = (u64)-1;
4424         } else if (key.objectid > 0) {
4425                 key.objectid--;
4426                 key.type = (u8)-1;
4427                 key.offset = (u64)-1;
4428         } else {
4429                 return 1;
4430         }
4431
4432         btrfs_release_path(path);
4433         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4434         if (ret < 0)
4435                 return ret;
4436         btrfs_item_key(path->nodes[0], &found_key, 0);
4437         ret = comp_keys(&found_key, &key);
4438         /*
4439          * We might have had an item with the previous key in the tree right
4440          * before we released our path. And after we released our path, that
4441          * item might have been pushed to the first slot (0) of the leaf we
4442          * were holding due to a tree balance. Alternatively, an item with the
4443          * previous key can exist as the only element of a leaf (big fat item).
4444          * Therefore account for these 2 cases, so that our callers (like
4445          * btrfs_previous_item) don't miss an existing item with a key matching
4446          * the previous key we computed above.
4447          */
4448         if (ret <= 0)
4449                 return 0;
4450         return 1;
4451 }
4452
4453 /*
4454  * A helper function to walk down the tree starting at min_key, and looking
4455  * for nodes or leaves that are have a minimum transaction id.
4456  * This is used by the btree defrag code, and tree logging
4457  *
4458  * This does not cow, but it does stuff the starting key it finds back
4459  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4460  * key and get a writable path.
4461  *
4462  * This honors path->lowest_level to prevent descent past a given level
4463  * of the tree.
4464  *
4465  * min_trans indicates the oldest transaction that you are interested
4466  * in walking through.  Any nodes or leaves older than min_trans are
4467  * skipped over (without reading them).
4468  *
4469  * returns zero if something useful was found, < 0 on error and 1 if there
4470  * was nothing in the tree that matched the search criteria.
4471  */
4472 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4473                          struct btrfs_path *path,
4474                          u64 min_trans)
4475 {
4476         struct extent_buffer *cur;
4477         struct btrfs_key found_key;
4478         int slot;
4479         int sret;
4480         u32 nritems;
4481         int level;
4482         int ret = 1;
4483         int keep_locks = path->keep_locks;
4484
4485         ASSERT(!path->nowait);
4486         path->keep_locks = 1;
4487 again:
4488         cur = btrfs_read_lock_root_node(root);
4489         level = btrfs_header_level(cur);
4490         WARN_ON(path->nodes[level]);
4491         path->nodes[level] = cur;
4492         path->locks[level] = BTRFS_READ_LOCK;
4493
4494         if (btrfs_header_generation(cur) < min_trans) {
4495                 ret = 1;
4496                 goto out;
4497         }
4498         while (1) {
4499                 nritems = btrfs_header_nritems(cur);
4500                 level = btrfs_header_level(cur);
4501                 sret = btrfs_bin_search(cur, min_key, &slot);
4502                 if (sret < 0) {
4503                         ret = sret;
4504                         goto out;
4505                 }
4506
4507                 /* at the lowest level, we're done, setup the path and exit */
4508                 if (level == path->lowest_level) {
4509                         if (slot >= nritems)
4510                                 goto find_next_key;
4511                         ret = 0;
4512                         path->slots[level] = slot;
4513                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4514                         goto out;
4515                 }
4516                 if (sret && slot > 0)
4517                         slot--;
4518                 /*
4519                  * check this node pointer against the min_trans parameters.
4520                  * If it is too old, skip to the next one.
4521                  */
4522                 while (slot < nritems) {
4523                         u64 gen;
4524
4525                         gen = btrfs_node_ptr_generation(cur, slot);
4526                         if (gen < min_trans) {
4527                                 slot++;
4528                                 continue;
4529                         }
4530                         break;
4531                 }
4532 find_next_key:
4533                 /*
4534                  * we didn't find a candidate key in this node, walk forward
4535                  * and find another one
4536                  */
4537                 if (slot >= nritems) {
4538                         path->slots[level] = slot;
4539                         sret = btrfs_find_next_key(root, path, min_key, level,
4540                                                   min_trans);
4541                         if (sret == 0) {
4542                                 btrfs_release_path(path);
4543                                 goto again;
4544                         } else {
4545                                 goto out;
4546                         }
4547                 }
4548                 /* save our key for returning back */
4549                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4550                 path->slots[level] = slot;
4551                 if (level == path->lowest_level) {
4552                         ret = 0;
4553                         goto out;
4554                 }
4555                 cur = btrfs_read_node_slot(cur, slot);
4556                 if (IS_ERR(cur)) {
4557                         ret = PTR_ERR(cur);
4558                         goto out;
4559                 }
4560
4561                 btrfs_tree_read_lock(cur);
4562
4563                 path->locks[level - 1] = BTRFS_READ_LOCK;
4564                 path->nodes[level - 1] = cur;
4565                 unlock_up(path, level, 1, 0, NULL);
4566         }
4567 out:
4568         path->keep_locks = keep_locks;
4569         if (ret == 0) {
4570                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
4571                 memcpy(min_key, &found_key, sizeof(found_key));
4572         }
4573         return ret;
4574 }
4575
4576 /*
4577  * this is similar to btrfs_next_leaf, but does not try to preserve
4578  * and fixup the path.  It looks for and returns the next key in the
4579  * tree based on the current path and the min_trans parameters.
4580  *
4581  * 0 is returned if another key is found, < 0 if there are any errors
4582  * and 1 is returned if there are no higher keys in the tree
4583  *
4584  * path->keep_locks should be set to 1 on the search made before
4585  * calling this function.
4586  */
4587 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4588                         struct btrfs_key *key, int level, u64 min_trans)
4589 {
4590         int slot;
4591         struct extent_buffer *c;
4592
4593         WARN_ON(!path->keep_locks && !path->skip_locking);
4594         while (level < BTRFS_MAX_LEVEL) {
4595                 if (!path->nodes[level])
4596                         return 1;
4597
4598                 slot = path->slots[level] + 1;
4599                 c = path->nodes[level];
4600 next:
4601                 if (slot >= btrfs_header_nritems(c)) {
4602                         int ret;
4603                         int orig_lowest;
4604                         struct btrfs_key cur_key;
4605                         if (level + 1 >= BTRFS_MAX_LEVEL ||
4606                             !path->nodes[level + 1])
4607                                 return 1;
4608
4609                         if (path->locks[level + 1] || path->skip_locking) {
4610                                 level++;
4611                                 continue;
4612                         }
4613
4614                         slot = btrfs_header_nritems(c) - 1;
4615                         if (level == 0)
4616                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
4617                         else
4618                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
4619
4620                         orig_lowest = path->lowest_level;
4621                         btrfs_release_path(path);
4622                         path->lowest_level = level;
4623                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
4624                                                 0, 0);
4625                         path->lowest_level = orig_lowest;
4626                         if (ret < 0)
4627                                 return ret;
4628
4629                         c = path->nodes[level];
4630                         slot = path->slots[level];
4631                         if (ret == 0)
4632                                 slot++;
4633                         goto next;
4634                 }
4635
4636                 if (level == 0)
4637                         btrfs_item_key_to_cpu(c, key, slot);
4638                 else {
4639                         u64 gen = btrfs_node_ptr_generation(c, slot);
4640
4641                         if (gen < min_trans) {
4642                                 slot++;
4643                                 goto next;
4644                         }
4645                         btrfs_node_key_to_cpu(c, key, slot);
4646                 }
4647                 return 0;
4648         }
4649         return 1;
4650 }
4651
4652 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4653                         u64 time_seq)
4654 {
4655         int slot;
4656         int level;
4657         struct extent_buffer *c;
4658         struct extent_buffer *next;
4659         struct btrfs_fs_info *fs_info = root->fs_info;
4660         struct btrfs_key key;
4661         bool need_commit_sem = false;
4662         u32 nritems;
4663         int ret;
4664         int i;
4665
4666         /*
4667          * The nowait semantics are used only for write paths, where we don't
4668          * use the tree mod log and sequence numbers.
4669          */
4670         if (time_seq)
4671                 ASSERT(!path->nowait);
4672
4673         nritems = btrfs_header_nritems(path->nodes[0]);
4674         if (nritems == 0)
4675                 return 1;
4676
4677         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4678 again:
4679         level = 1;
4680         next = NULL;
4681         btrfs_release_path(path);
4682
4683         path->keep_locks = 1;
4684
4685         if (time_seq) {
4686                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4687         } else {
4688                 if (path->need_commit_sem) {
4689                         path->need_commit_sem = 0;
4690                         need_commit_sem = true;
4691                         if (path->nowait) {
4692                                 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4693                                         ret = -EAGAIN;
4694                                         goto done;
4695                                 }
4696                         } else {
4697                                 down_read(&fs_info->commit_root_sem);
4698                         }
4699                 }
4700                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4701         }
4702         path->keep_locks = 0;
4703
4704         if (ret < 0)
4705                 goto done;
4706
4707         nritems = btrfs_header_nritems(path->nodes[0]);
4708         /*
4709          * by releasing the path above we dropped all our locks.  A balance
4710          * could have added more items next to the key that used to be
4711          * at the very end of the block.  So, check again here and
4712          * advance the path if there are now more items available.
4713          */
4714         if (nritems > 0 && path->slots[0] < nritems - 1) {
4715                 if (ret == 0)
4716                         path->slots[0]++;
4717                 ret = 0;
4718                 goto done;
4719         }
4720         /*
4721          * So the above check misses one case:
4722          * - after releasing the path above, someone has removed the item that
4723          *   used to be at the very end of the block, and balance between leafs
4724          *   gets another one with bigger key.offset to replace it.
4725          *
4726          * This one should be returned as well, or we can get leaf corruption
4727          * later(esp. in __btrfs_drop_extents()).
4728          *
4729          * And a bit more explanation about this check,
4730          * with ret > 0, the key isn't found, the path points to the slot
4731          * where it should be inserted, so the path->slots[0] item must be the
4732          * bigger one.
4733          */
4734         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4735                 ret = 0;
4736                 goto done;
4737         }
4738
4739         while (level < BTRFS_MAX_LEVEL) {
4740                 if (!path->nodes[level]) {
4741                         ret = 1;
4742                         goto done;
4743                 }
4744
4745                 slot = path->slots[level] + 1;
4746                 c = path->nodes[level];
4747                 if (slot >= btrfs_header_nritems(c)) {
4748                         level++;
4749                         if (level == BTRFS_MAX_LEVEL) {
4750                                 ret = 1;
4751                                 goto done;
4752                         }
4753                         continue;
4754                 }
4755
4756
4757                 /*
4758                  * Our current level is where we're going to start from, and to
4759                  * make sure lockdep doesn't complain we need to drop our locks
4760                  * and nodes from 0 to our current level.
4761                  */
4762                 for (i = 0; i < level; i++) {
4763                         if (path->locks[level]) {
4764                                 btrfs_tree_read_unlock(path->nodes[i]);
4765                                 path->locks[i] = 0;
4766                         }
4767                         free_extent_buffer(path->nodes[i]);
4768                         path->nodes[i] = NULL;
4769                 }
4770
4771                 next = c;
4772                 ret = read_block_for_search(root, path, &next, level,
4773                                             slot, &key);
4774                 if (ret == -EAGAIN && !path->nowait)
4775                         goto again;
4776
4777                 if (ret < 0) {
4778                         btrfs_release_path(path);
4779                         goto done;
4780                 }
4781
4782                 if (!path->skip_locking) {
4783                         ret = btrfs_try_tree_read_lock(next);
4784                         if (!ret && path->nowait) {
4785                                 ret = -EAGAIN;
4786                                 goto done;
4787                         }
4788                         if (!ret && time_seq) {
4789                                 /*
4790                                  * If we don't get the lock, we may be racing
4791                                  * with push_leaf_left, holding that lock while
4792                                  * itself waiting for the leaf we've currently
4793                                  * locked. To solve this situation, we give up
4794                                  * on our lock and cycle.
4795                                  */
4796                                 free_extent_buffer(next);
4797                                 btrfs_release_path(path);
4798                                 cond_resched();
4799                                 goto again;
4800                         }
4801                         if (!ret)
4802                                 btrfs_tree_read_lock(next);
4803                 }
4804                 break;
4805         }
4806         path->slots[level] = slot;
4807         while (1) {
4808                 level--;
4809                 path->nodes[level] = next;
4810                 path->slots[level] = 0;
4811                 if (!path->skip_locking)
4812                         path->locks[level] = BTRFS_READ_LOCK;
4813                 if (!level)
4814                         break;
4815
4816                 ret = read_block_for_search(root, path, &next, level,
4817                                             0, &key);
4818                 if (ret == -EAGAIN && !path->nowait)
4819                         goto again;
4820
4821                 if (ret < 0) {
4822                         btrfs_release_path(path);
4823                         goto done;
4824                 }
4825
4826                 if (!path->skip_locking) {
4827                         if (path->nowait) {
4828                                 if (!btrfs_try_tree_read_lock(next)) {
4829                                         ret = -EAGAIN;
4830                                         goto done;
4831                                 }
4832                         } else {
4833                                 btrfs_tree_read_lock(next);
4834                         }
4835                 }
4836         }
4837         ret = 0;
4838 done:
4839         unlock_up(path, 0, 1, 0, NULL);
4840         if (need_commit_sem) {
4841                 int ret2;
4842
4843                 path->need_commit_sem = 1;
4844                 ret2 = finish_need_commit_sem_search(path);
4845                 up_read(&fs_info->commit_root_sem);
4846                 if (ret2)
4847                         ret = ret2;
4848         }
4849
4850         return ret;
4851 }
4852
4853 /*
4854  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4855  * searching until it gets past min_objectid or finds an item of 'type'
4856  *
4857  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4858  */
4859 int btrfs_previous_item(struct btrfs_root *root,
4860                         struct btrfs_path *path, u64 min_objectid,
4861                         int type)
4862 {
4863         struct btrfs_key found_key;
4864         struct extent_buffer *leaf;
4865         u32 nritems;
4866         int ret;
4867
4868         while (1) {
4869                 if (path->slots[0] == 0) {
4870                         ret = btrfs_prev_leaf(root, path);
4871                         if (ret != 0)
4872                                 return ret;
4873                 } else {
4874                         path->slots[0]--;
4875                 }
4876                 leaf = path->nodes[0];
4877                 nritems = btrfs_header_nritems(leaf);
4878                 if (nritems == 0)
4879                         return 1;
4880                 if (path->slots[0] == nritems)
4881                         path->slots[0]--;
4882
4883                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4884                 if (found_key.objectid < min_objectid)
4885                         break;
4886                 if (found_key.type == type)
4887                         return 0;
4888                 if (found_key.objectid == min_objectid &&
4889                     found_key.type < type)
4890                         break;
4891         }
4892         return 1;
4893 }
4894
4895 /*
4896  * search in extent tree to find a previous Metadata/Data extent item with
4897  * min objecitd.
4898  *
4899  * returns 0 if something is found, 1 if nothing was found and < 0 on error
4900  */
4901 int btrfs_previous_extent_item(struct btrfs_root *root,
4902                         struct btrfs_path *path, u64 min_objectid)
4903 {
4904         struct btrfs_key found_key;
4905         struct extent_buffer *leaf;
4906         u32 nritems;
4907         int ret;
4908
4909         while (1) {
4910                 if (path->slots[0] == 0) {
4911                         ret = btrfs_prev_leaf(root, path);
4912                         if (ret != 0)
4913                                 return ret;
4914                 } else {
4915                         path->slots[0]--;
4916                 }
4917                 leaf = path->nodes[0];
4918                 nritems = btrfs_header_nritems(leaf);
4919                 if (nritems == 0)
4920                         return 1;
4921                 if (path->slots[0] == nritems)
4922                         path->slots[0]--;
4923
4924                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4925                 if (found_key.objectid < min_objectid)
4926                         break;
4927                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4928                     found_key.type == BTRFS_METADATA_ITEM_KEY)
4929                         return 0;
4930                 if (found_key.objectid == min_objectid &&
4931                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
4932                         break;
4933         }
4934         return 1;
4935 }