Btrfs: don't add extent 0 to the free space cache v2
fs/btrfs/extent-tree.c (platform/adaptation/renesas_rcar/renesas_kernel.git)
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "free-space-cache.h"

static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free);
static int update_reserved_extents(struct btrfs_block_group_cache *cache,
                                   u64 num_bytes, int reserve);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
static int pin_down_bytes(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
                          struct btrfs_path *path,
                          u64 bytenr, u64 num_bytes,
                          int is_data, int reserved,
                          struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);

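/*
 * Returns non-zero once the free space in this block group has been
 * fully cached.
 */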
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
        smp_mb();
        return cache->cached == BTRFS_CACHE_FINISHED;
}

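/* check whether the block group has all of the given allocation bits set */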
static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                struct btrfs_block_group_cache *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group_cache *cache;

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group_cache,
                                 cache_node);
                if (block_group->key.objectid < cache->key.objectid) {
                        p = &(*p)->rb_left;
                } else if (block_group->key.objectid > cache->key.objectid) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);
        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                              int contains)
{
        struct btrfs_block_group_cache *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group_cache,
                                 cache_node);
                end = cache->key.objectid + cache->key.offset - 1;
                start = cache->key.objectid;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->key.objectid))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret)
                atomic_inc(&ret->count);
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}

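/*
 * Mark a byte range as excluded in both freed_extents trees so it is
 * never handed out as free space while the block group is being cached.
 */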
static int add_excluded_extent(struct btrfs_root *root,
                               u64 start, u64 num_bytes)
{
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        set_extent_bits(&root->fs_info->freed_extents[1],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        return 0;
}

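/* clear the excluded-range marks covering an entire block group */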
static void free_excluded_extents(struct btrfs_root *root,
                                  struct btrfs_block_group_cache *cache)
{
        u64 start, end;

        start = cache->key.objectid;
        end = start + cache->key.offset - 1;

        clear_extent_bits(&root->fs_info->freed_extents[0],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
        clear_extent_bits(&root->fs_info->freed_extents[1],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
}

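/*
 * Account for the superblock mirrors that fall inside this block group
 * and exclude their byte ranges from the free space cache.
 */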
static int exclude_super_stripes(struct btrfs_root *root,
                                 struct btrfs_block_group_cache *cache)
{
        u64 bytenr;
        u64 *logical;
        int stripe_len;
        int i, nr, ret;

        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
                cache->bytes_super += stripe_len;
                ret = add_excluded_extent(root, cache->key.objectid,
                                          stripe_len);
                BUG_ON(ret);
        }

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                BUG_ON(ret);

                while (nr--) {
                        cache->bytes_super += stripe_len;
                        ret = add_excluded_extent(root, logical[nr],
                                                  stripe_len);
                        BUG_ON(ret);
                }

                kfree(logical);
        }
        return 0;
}

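/*
 * Grab a reference on the caching control for a block group, or NULL if
 * caching is not in progress.
 */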
static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
        struct btrfs_caching_control *ctl;

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_STARTED) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
        if (atomic_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't
 * be used yet, because their free space is not released until the
 * transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
{
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY | EXTENT_UPTODATE);
                if (ret)
                        break;

                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }

        return total_added;
}

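/*
 * Worker thread that scans the extent tree through the commit root and
 * feeds the gaps between allocated extents into the block group's free
 * space cache.
 */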
static int caching_kthread(void *data)
{
        struct btrfs_block_group_cache *block_group = data;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        exclude_super_stripes(extent_root, block_group);
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->bytes_super += block_group->bytes_super;
        spin_unlock(&block_group->space_info->lock);

        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = 2;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
again:
        mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->extent_commit_sem);

        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                smp_mb();
                if (fs_info->closing > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        ret = find_next_key(path, 0, &key);
                        if (ret)
                                break;

                        caching_ctl->progress = last;
                        btrfs_release_path(extent_root, path);
                        up_read(&fs_info->extent_commit_sem);
                        mutex_unlock(&caching_ctl->mutex);
                        if (btrfs_transaction_in_commit(fs_info))
                                schedule_timeout(1);
                        else
                                cond_resched();
                        goto again;
                }

                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
                }

                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        last = key.objectid + key.offset;

                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
                                wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;

        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

err:
        btrfs_free_path(path);
        up_read(&fs_info->extent_commit_sem);

        free_excluded_extents(extent_root, block_group);

        mutex_unlock(&caching_ctl->mutex);
        wake_up(&caching_ctl->wait);

        put_caching_control(caching_ctl);
        atomic_dec(&block_group->space_info->caching_threads);
        return 0;
}

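/*
 * Start caching a block group's free space: set up the caching control
 * and launch the caching kthread.
 */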
static int cache_block_group(struct btrfs_block_group_cache *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        struct task_struct *tsk;
        int ret = 0;

        smp_mb();
        if (cache->cached != BTRFS_CACHE_NO)
                return 0;

        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
        BUG_ON(!caching_ctl);

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        /* one for caching kthread, one for caching block group list */
        atomic_set(&caching_ctl->count, 2);

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                kfree(caching_ctl);
                return 0;
        }
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_STARTED;
        spin_unlock(&cache->lock);

        down_write(&fs_info->extent_commit_sem);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->extent_commit_sem);

        atomic_inc(&cache->space_info->caching_threads);

        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
                          cache->key.objectid);
        if (IS_ERR(tsk)) {
                ret = PTR_ERR(tsk);
                printk(KERN_ERR "error running thread %d\n", ret);
                BUG();
        }

        return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 0);

        return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 1);

        return cache;
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
        if (atomic_dec_and_test(&cache->count))
                kfree(cache);
}

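/* find the space_info that matches the given allocation flags, if any */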
static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

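/* scale num by factor/10; used for the "mostly full" heuristics below */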
static u64 div_factor(u64 num, int factor)
{
        if (factor == 10)
                return num;
        num *= factor;
        do_div(num, 10);
        return num;
}

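/*
 * Scan the block groups for a metadata group that is less than
 * factor/10 full, starting at the hint, and return its start offset.
 */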
u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
{
        struct btrfs_block_group_cache *cache;
        u64 used;
        u64 last = max(search_hint, search_start);
        u64 group_start = 0;
        int full_search = 0;
        int factor = 9;
        int wrapped = 0;
again:
        while (1) {
                cache = btrfs_lookup_first_block_group(root->fs_info, last);
                if (!cache)
                        break;

                spin_lock(&cache->lock);
                last = cache->key.objectid + cache->key.offset;
                used = btrfs_block_group_used(&cache->item);

                if ((full_search || !cache->ro) &&
                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
                        if (used + cache->pinned + cache->reserved <
                            div_factor(cache->key.offset, factor)) {
                                group_start = cache->key.objectid;
                                spin_unlock(&cache->lock);
                                btrfs_put_block_group(cache);
                                goto found;
                        }
                }
                spin_unlock(&cache->lock);
                btrfs_put_block_group(cache);
                cond_resched();
        }
        if (!wrapped) {
                last = search_start;
                wrapped = 1;
                goto again;
        }
        if (!full_search && factor < 10) {
                last = search_start;
                full_search = 1;
                factor = 10;
                goto again;
        }
found:
        return group_start;
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        BUG_ON(!path);
        key.objectid = start;
        key.offset = len;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
                                0, 0);
        btrfs_free_path(path);
        return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually full back refs are generic, and can
 * be used in all cases where implicit back refs are used. Their major
 * shortcoming is the overhead: every time a tree block gets COWed, we
 * have to update the back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, and add implicit back refs
 * for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, and increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, and increase lower level extents' reference counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is the
 * objectid of the block's owner tree. The key offset for the full back
 * refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
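/*
 * Convert an extent item from the v0 disk format to the current format,
 * extending the item in place and filling in the new fields.
 */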
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0);
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(root, path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret);

        ret = btrfs_extend_item(trans, root, path, new_size);
        BUG_ON(ret);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif

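/*
 * Hash used as the key offset for extent data ref items: crc32c of the
 * root objectid, combined with crc32c of the inode objectid and offset.
 */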
static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
                                     struct btrfs_extent_data_ref *ref)
{
        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
                                    btrfs_extent_data_ref_objectid(leaf, ref),
                                    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
                                 struct btrfs_extent_data_ref *ref,
                                 u64 root_objectid, u64 owner, u64 offset)
{
        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
            btrfs_extent_data_ref_offset(leaf, ref) != offset)
                return 0;
        return 1;
}

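/*
 * Look up the keyed backref item for a data extent.  Shared refs are
 * keyed by the parent block; non-shared refs are keyed by a hash, so on
 * a hash collision we walk forward comparing root/owner/offset.
 */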
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid,
                                           u64 owner, u64 offset)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref;
        struct extent_buffer *leaf;
        u32 nritems;
        int ret;
        int recow;
        int err = -ENOENT;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
        }
again:
        recow = 0;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0) {
                err = ret;
                goto fail;
        }

        if (parent) {
                if (!ret)
                        return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                btrfs_release_path(root, path);
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0) {
                        err = ret;
                        goto fail;
                }
                if (!ret)
                        return 0;
#endif
                goto fail;
        }

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);
        while (1) {
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0)
                                err = ret;
                        if (ret)
                                goto fail;

                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        recow = 1;
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid != bytenr ||
                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
                        goto fail;

                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);

                if (match_extent_data_ref(leaf, ref, root_objectid,
                                          owner, offset)) {
                        if (recow) {
                                btrfs_release_path(root, path);
                                goto again;
                        }
                        err = 0;
                        break;
                }
                path->slots[0]++;
        }
fail:
        return err;
}

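/*
 * Insert a new data backref item, or bump the count on an existing one.
 * EXTENT_DATA_REF keys can collide, so on -EEXIST we probe forward by
 * incrementing the key offset until we find our ref or an empty slot.
 */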
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 bytenr, u64 parent,
                                           u64 root_objectid, u64 owner,
                                           u64 offset, int refs_to_add)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        u32 size;
        u32 num_refs;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_DATA_REF_KEY;
                key.offset = parent;
                size = sizeof(struct btrfs_shared_data_ref);
        } else {
                key.type = BTRFS_EXTENT_DATA_REF_KEY;
                key.offset = hash_extent_data_ref(root_objectid,
                                                  owner, offset);
                size = sizeof(struct btrfs_extent_data_ref);
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
        if (ret && ret != -EEXIST)
                goto fail;

        leaf = path->nodes[0];
        if (parent) {
                struct btrfs_shared_data_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_shared_data_ref);
                if (ret == 0) {
                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
                }
        } else {
                struct btrfs_extent_data_ref *ref;
                while (ret == -EEXIST) {
                        ref = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_data_ref);
                        if (match_extent_data_ref(leaf, ref, root_objectid,
                                                  owner, offset))
                                break;
                        btrfs_release_path(root, path);
                        key.offset++;
                        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                                      size);
                        if (ret && ret != -EEXIST)
                                goto fail;

                        leaf = path->nodes[0];
                }
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_data_ref);
                if (ret == 0) {
                        btrfs_set_extent_data_ref_root(leaf, ref,
                                                       root_objectid);
                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
                } else {
                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
                        num_refs += refs_to_add;
                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
                }
        }
        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
fail:
        btrfs_release_path(root, path);
        return ret;
}

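/*
 * Drop refs_to_drop references from a data backref item, deleting the
 * item once its count reaches zero.
 */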
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           int refs_to_drop)
{
        struct btrfs_key key;
        struct btrfs_extent_data_ref *ref1 = NULL;
        struct btrfs_shared_data_ref *ref2 = NULL;
        struct extent_buffer *leaf;
        u32 num_refs = 0;
        int ret = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                BUG();
        }

        BUG_ON(num_refs < refs_to_drop);
        num_refs -= refs_to_drop;

        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                else {
                        struct btrfs_extent_ref_v0 *ref0;
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_extent_ref_v0);
                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
                }
#endif
                btrfs_mark_buffer_dirty(leaf);
        }
        return ret;
}

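/* return the reference count stored in a data backref, inline or keyed */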
static noinline u32 extent_data_ref_count(struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_data_ref *ref1;
        struct btrfs_shared_data_ref *ref2;
        u32 num_refs = 0;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        if (iref) {
                if (btrfs_extent_inline_ref_type(leaf, iref) ==
                    BTRFS_EXTENT_DATA_REF_KEY) {
                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
                } else {
                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
                }
        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
                ref1 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_data_ref);
                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
                ref2 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_shared_data_ref);
                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
                struct btrfs_extent_ref_v0 *ref0;
                ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_extent_ref_v0);
                num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
        } else {
                WARN_ON(1);
        }
        return num_refs;
}

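/*
 * Look up the keyed backref item for a tree block: shared refs are keyed
 * by the parent block, non-shared refs by the owning root.
 */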
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (ret == -ENOENT && parent) {
                btrfs_release_path(root, path);
                key.type = BTRFS_EXTENT_REF_V0_KEY;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0)
                        ret = -ENOENT;
        }
#endif
        return ret;
}

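/* insert a keyed (zero-size) backref item for a tree block */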
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 root_objectid)
{
        struct btrfs_key key;
        int ret;

        key.objectid = bytenr;
        if (parent) {
                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
                key.offset = parent;
        } else {
                key.type = BTRFS_TREE_BLOCK_REF_KEY;
                key.offset = root_objectid;
        }

        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
        btrfs_release_path(root, path);
        return ret;
}

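/*
 * Pick the backref key type: tree block vs data extent, shared (keyed by
 * parent) vs non-shared.
 */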
static inline int extent_ref_type(u64 parent, u64 owner)
{
        int type;
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                if (parent > 0)
                        type = BTRFS_SHARED_BLOCK_REF_KEY;
                else
                        type = BTRFS_TREE_BLOCK_REF_KEY;
        } else {
                if (parent > 0)
                        type = BTRFS_SHARED_DATA_REF_KEY;
                else
                        type = BTRFS_EXTENT_DATA_REF_KEY;
        }
        return type;
}

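/*
 * Peek at the key of the item just after the current path position,
 * without moving the path.
 */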
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
{
        for (; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
                if (level == 0)
                        btrfs_item_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                else
                        btrfs_node_key_to_cpu(path->nodes[level], key,
                                              path->slots[level] + 1);
                return 0;
        }
        return 1;
}

/*
 * Look for an inline back ref.  If the back ref is found, *ref_ret is
 * set to the address of the inline back ref and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *       items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
                                 struct btrfs_extent_inline_ref **ref_ret,
                                 u64 bytenr, u64 num_bytes,
                                 u64 parent, u64 root_objectid,
                                 u64 owner, u64 offset, int insert)
{
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_inline_ref *iref;
        u64 flags;
        u64 item_size;
        unsigned long ptr;
        unsigned long end;
        int extra_size;
        int type;
        int want;
        int ret;
        int err = 0;

        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        key.offset = num_bytes;

        want = extent_ref_type(parent, owner);
        if (insert) {
                extra_size = btrfs_extent_inline_ref_size(want);
                path->keep_locks = 1;
        } else
                extra_size = -1;
        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
        if (ret < 0) {
                err = ret;
                goto out;
        }
        BUG_ON(ret);

        leaf = path->nodes[0];
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
        if (item_size < sizeof(*ei)) {
                if (!insert) {
                        err = -ENOENT;
                        goto out;
                }
                ret = convert_extent_item_v0(trans, root, path, owner,
                                             extra_size);
                if (ret < 0) {
                        err = ret;
                        goto out;
                }
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
        }
#endif
        BUG_ON(item_size < sizeof(*ei));

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(leaf, ei);

        ptr = (unsigned long)(ei + 1);
        end = (unsigned long)ei + item_size;

        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        } else {
                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
        }

        err = -ENOENT;
        while (1) {
                if (ptr >= end) {
                        WARN_ON(ptr > end);
                        break;
                }
                iref = (struct btrfs_extent_inline_ref *)ptr;
                type = btrfs_extent_inline_ref_type(leaf, iref);
                if (want < type)
                        break;
                if (want > type) {
                        ptr += btrfs_extent_inline_ref_size(type);
                        continue;
                }

                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                        struct btrfs_extent_data_ref *dref;
                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                        if (match_extent_data_ref(leaf, dref, root_objectid,
                                                  owner, offset)) {
                                err = 0;
                                break;
                        }
                        if (hash_extent_data_ref_item(leaf, dref) <
                            hash_extent_data_ref(root_objectid, owner, offset))
                                break;
                } else {
                        u64 ref_offset;
                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
                        if (parent > 0) {
                                if (parent == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < parent)
                                        break;
                        } else {
                                if (root_objectid == ref_offset) {
                                        err = 0;
                                        break;
                                }
                                if (ref_offset < root_objectid)
                                        break;
                        }
                }
                ptr += btrfs_extent_inline_ref_size(type);
        }
        if (err == -ENOENT && insert) {
                if (item_size + extra_size >=
                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
                        err = -EAGAIN;
                        goto out;
                }
                /*
                 * To add a new inline back ref, we have to make sure
                 * that there is no corresponding back ref item.
                 * For simplicity, we just do not add a new inline back
                 * ref if there is any kind of item for this block.
                 */
                if (find_next_key(path, 0, &key) == 0 &&
                    key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
                }
        }
        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
        if (insert) {
                path->keep_locks = 0;
                btrfs_unlock_up_safe(path, 1);
        }
        return err;
}

/*
 * helper to add a new inline back ref
 */
static noinline_for_stack
int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_path *path,
                                struct btrfs_extent_inline_ref *iref,
                                u64 parent, u64 root_objectid,
                                u64 owner, u64 offset, int refs_to_add,
                                struct btrfs_delayed_extent_op *extent_op)
{
        struct extent_buffer *leaf;
        struct btrfs_extent_item *ei;
        unsigned long ptr;
        unsigned long end;
        unsigned long item_offset;
        u64 refs;
        int size;
        int type;
        int ret;

        leaf = path->nodes[0];
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        item_offset = (unsigned long)iref - (unsigned long)ei;

        type = extent_ref_type(parent, owner);
        size = btrfs_extent_inline_ref_size(type);

        ret = btrfs_extend_item(trans, root, path, size);
        BUG_ON(ret);

        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
        refs += refs_to_add;
        btrfs_set_extent_refs(leaf, ei, refs);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, ei);

        ptr = (unsigned long)ei + item_offset;
        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
        if (ptr < end - size)
                memmove_extent_buffer(leaf, ptr + size, ptr,
                                      end - size - ptr);

        iref = (struct btrfs_extent_inline_ref *)ptr;
        btrfs_set_extent_inline_ref_type(leaf, iref, type);
        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
                struct btrfs_extent_data_ref *dref;
                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
                struct btrfs_shared_data_ref *sref;
                sref = (struct btrfs_shared_data_ref *)(iref + 1);
                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
        } else {
                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}

1415 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1416                                  struct btrfs_root *root,
1417                                  struct btrfs_path *path,
1418                                  struct btrfs_extent_inline_ref **ref_ret,
1419                                  u64 bytenr, u64 num_bytes, u64 parent,
1420                                  u64 root_objectid, u64 owner, u64 offset)
1421 {
1422         int ret;
1423
1424         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1425                                            bytenr, num_bytes, parent,
1426                                            root_objectid, owner, offset, 0);
1427         if (ret != -ENOENT)
1428                 return ret;
1429
1430         btrfs_release_path(root, path);
1431         *ref_ret = NULL;
1432
1433         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1434                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1435                                             root_objectid);
1436         } else {
1437                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1438                                              root_objectid, owner, offset);
1439         }
1440         return ret;
1441 }
1442
1443 /*
1444  * helper to update/remove inline back ref
1445  */
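     /*
      * refs_to_mod may be negative.  If the count of a data ref drops to
      * zero, the inline ref is removed by shifting the refs behind it
      * forward and truncating the extent item; tree refs have an
      * implicit count of 1, so only a full removal (-1) is allowed.
      */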
1446 static noinline_for_stack
1447 int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1448                                  struct btrfs_root *root,
1449                                  struct btrfs_path *path,
1450                                  struct btrfs_extent_inline_ref *iref,
1451                                  int refs_to_mod,
1452                                  struct btrfs_delayed_extent_op *extent_op)
1453 {
1454         struct extent_buffer *leaf;
1455         struct btrfs_extent_item *ei;
1456         struct btrfs_extent_data_ref *dref = NULL;
1457         struct btrfs_shared_data_ref *sref = NULL;
1458         unsigned long ptr;
1459         unsigned long end;
1460         u32 item_size;
1461         int size;
1462         int type;
1463         int ret;
1464         u64 refs;
1465
1466         leaf = path->nodes[0];
1467         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1468         refs = btrfs_extent_refs(leaf, ei);
1469         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1470         refs += refs_to_mod;
1471         btrfs_set_extent_refs(leaf, ei, refs);
1472         if (extent_op)
1473                 __run_delayed_extent_op(extent_op, leaf, ei);
1474
1475         type = btrfs_extent_inline_ref_type(leaf, iref);
1476
1477         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1478                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1479                 refs = btrfs_extent_data_ref_count(leaf, dref);
1480         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1481                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1482                 refs = btrfs_shared_data_ref_count(leaf, sref);
1483         } else {
1484                 refs = 1;
1485                 BUG_ON(refs_to_mod != -1);
1486         }
1487
1488         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1489         refs += refs_to_mod;
1490
1491         if (refs > 0) {
1492                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1493                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1494                 else
1495                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1496         } else {
1497                 size =  btrfs_extent_inline_ref_size(type);
1498                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1499                 ptr = (unsigned long)iref;
1500                 end = (unsigned long)ei + item_size;
1501                 if (ptr + size < end)
1502                         memmove_extent_buffer(leaf, ptr, ptr + size,
1503                                               end - ptr - size);
1504                 item_size -= size;
1505                 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1506                 BUG_ON(ret);
1507         }
1508         btrfs_mark_buffer_dirty(leaf);
1509         return 0;
1510 }
1511
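     /*
      * Try the inline path first: 0 from the lookup means an existing
      * ref of this type is updated in place, -ENOENT means a new inline
      * ref is created at the returned position.  -EAGAIN (the ref cannot
      * be added inline, either for size reasons or because a keyed item
      * already exists for this block) is passed back to the caller,
      * which then falls back to a separate keyed back ref item.
      */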
1512 static noinline_for_stack
1513 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1514                                  struct btrfs_root *root,
1515                                  struct btrfs_path *path,
1516                                  u64 bytenr, u64 num_bytes, u64 parent,
1517                                  u64 root_objectid, u64 owner,
1518                                  u64 offset, int refs_to_add,
1519                                  struct btrfs_delayed_extent_op *extent_op)
1520 {
1521         struct btrfs_extent_inline_ref *iref;
1522         int ret;
1523
1524         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1525                                            bytenr, num_bytes, parent,
1526                                            root_objectid, owner, offset, 1);
1527         if (ret == 0) {
1528                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1529                 ret = update_inline_extent_backref(trans, root, path, iref,
1530                                                    refs_to_add, extent_op);
1531         } else if (ret == -ENOENT) {
1532                 ret = setup_inline_extent_backref(trans, root, path, iref,
1533                                                   parent, root_objectid,
1534                                                   owner, offset, refs_to_add,
1535                                                   extent_op);
1536         }
1537         return ret;
1538 }
1539
1540 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1541                                  struct btrfs_root *root,
1542                                  struct btrfs_path *path,
1543                                  u64 bytenr, u64 parent, u64 root_objectid,
1544                                  u64 owner, u64 offset, int refs_to_add)
1545 {
1546         int ret;
1547         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1548                 BUG_ON(refs_to_add != 1);
1549                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1550                                             parent, root_objectid);
1551         } else {
1552                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1553                                              parent, root_objectid,
1554                                              owner, offset, refs_to_add);
1555         }
1556         return ret;
1557 }
1558
1559 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1560                                  struct btrfs_root *root,
1561                                  struct btrfs_path *path,
1562                                  struct btrfs_extent_inline_ref *iref,
1563                                  int refs_to_drop, int is_data)
1564 {
1565         int ret;
1566
1567         BUG_ON(!is_data && refs_to_drop != 1);
1568         if (iref) {
1569                 ret = update_inline_extent_backref(trans, root, path, iref,
1570                                                    -refs_to_drop, NULL);
1571         } else if (is_data) {
1572                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1573         } else {
1574                 ret = btrfs_del_item(trans, root, path);
1575         }
1576         return ret;
1577 }
1578
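     /*
      * start and len are byte values; blkdev_issue_discard() expects
      * 512-byte sectors, hence the shift by 9.
      */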
1579 static void btrfs_issue_discard(struct block_device *bdev,
1580                                 u64 start, u64 len)
1581 {
1582         blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1583                              DISCARD_FL_BARRIER);
1584 }
1585
1586 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1587                                 u64 num_bytes)
1588 {
1589         int ret;
1590         u64 map_length = num_bytes;
1591         struct btrfs_multi_bio *multi = NULL;
1592
1593         if (!btrfs_test_opt(root, DISCARD))
1594                 return 0;
1595
1596         /* Tell the block device(s) that the sectors can be discarded */
1597         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1598                               bytenr, &map_length, &multi, 0);
1599         if (!ret) {
1600                 struct btrfs_bio_stripe *stripe = multi->stripes;
1601                 int i;
1602
1603                 if (map_length > num_bytes)
1604                         map_length = num_bytes;
1605
1606                 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1607                         btrfs_issue_discard(stripe->dev->bdev,
1608                                             stripe->physical,
1609                                             map_length);
1610                 }
1611                 kfree(multi);
1612         }
1613
1614         return ret;
1615 }
1616
1617 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1618                          struct btrfs_root *root,
1619                          u64 bytenr, u64 num_bytes, u64 parent,
1620                          u64 root_objectid, u64 owner, u64 offset)
1621 {
1622         int ret;
1623         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1624                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1625
1626         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1627                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1628                                         parent, root_objectid, (int)owner,
1629                                         BTRFS_ADD_DELAYED_REF, NULL);
1630         } else {
1631                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1632                                         parent, root_objectid, owner, offset,
1633                                         BTRFS_ADD_DELAYED_REF, NULL);
1634         }
1635         return ret;
1636 }
1637
1638 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1639                                   struct btrfs_root *root,
1640                                   u64 bytenr, u64 num_bytes,
1641                                   u64 parent, u64 root_objectid,
1642                                   u64 owner, u64 offset, int refs_to_add,
1643                                   struct btrfs_delayed_extent_op *extent_op)
1644 {
1645         struct btrfs_path *path;
1646         struct extent_buffer *leaf;
1647         struct btrfs_extent_item *item;
1648         u64 refs;
1649         int ret;
1650         int err = 0;
1651
1652         path = btrfs_alloc_path();
1653         if (!path)
1654                 return -ENOMEM;
1655
1656         path->reada = 1;
1657         path->leave_spinning = 1;
1658         /* this will set up the path even if it fails to insert the back ref */
1659         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1660                                            path, bytenr, num_bytes, parent,
1661                                            root_objectid, owner, offset,
1662                                            refs_to_add, extent_op);
1663         if (ret == 0)
1664                 goto out;
1665
1666         if (ret != -EAGAIN) {
1667                 err = ret;
1668                 goto out;
1669         }
1670
1671         leaf = path->nodes[0];
1672         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1673         refs = btrfs_extent_refs(leaf, item);
1674         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1675         if (extent_op)
1676                 __run_delayed_extent_op(extent_op, leaf, item);
1677
1678         btrfs_mark_buffer_dirty(leaf);
1679         btrfs_release_path(root->fs_info->extent_root, path);
1680
1681         path->reada = 1;
1682         path->leave_spinning = 1;
1683
1684         /* now insert the actual backref */
1685         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1686                                     path, bytenr, parent, root_objectid,
1687                                     owner, offset, refs_to_add);
1688         BUG_ON(ret);
1689 out:
1690         btrfs_free_path(path);
1691         return err;
1692 }
1693
1694 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1695                                 struct btrfs_root *root,
1696                                 struct btrfs_delayed_ref_node *node,
1697                                 struct btrfs_delayed_extent_op *extent_op,
1698                                 int insert_reserved)
1699 {
1700         int ret = 0;
1701         struct btrfs_delayed_data_ref *ref;
1702         struct btrfs_key ins;
1703         u64 parent = 0;
1704         u64 ref_root = 0;
1705         u64 flags = 0;
1706
1707         ins.objectid = node->bytenr;
1708         ins.offset = node->num_bytes;
1709         ins.type = BTRFS_EXTENT_ITEM_KEY;
1710
1711         ref = btrfs_delayed_node_to_data_ref(node);
1712         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1713                 parent = ref->parent;
1714         else
1715                 ref_root = ref->root;
1716
1717         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1718                 if (extent_op) {
1719                         BUG_ON(extent_op->update_key);
1720                         flags |= extent_op->flags_to_set;
1721                 }
1722                 ret = alloc_reserved_file_extent(trans, root,
1723                                                  parent, ref_root, flags,
1724                                                  ref->objectid, ref->offset,
1725                                                  &ins, node->ref_mod);
1726         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1727                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1728                                              node->num_bytes, parent,
1729                                              ref_root, ref->objectid,
1730                                              ref->offset, node->ref_mod,
1731                                              extent_op);
1732         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1733                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1734                                           node->num_bytes, parent,
1735                                           ref_root, ref->objectid,
1736                                           ref->offset, node->ref_mod,
1737                                           extent_op);
1738         } else {
1739                 BUG();
1740         }
1741         return ret;
1742 }
1743
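     /*
      * apply a pending extent_op to an extent item in place: or in the
      * requested flags, and for tree blocks rewrite the key stored in
      * the btrfs_tree_block_info that follows the extent item.
      */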
1744 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1745                                     struct extent_buffer *leaf,
1746                                     struct btrfs_extent_item *ei)
1747 {
1748         u64 flags = btrfs_extent_flags(leaf, ei);
1749         if (extent_op->update_flags) {
1750                 flags |= extent_op->flags_to_set;
1751                 btrfs_set_extent_flags(leaf, ei, flags);
1752         }
1753
1754         if (extent_op->update_key) {
1755                 struct btrfs_tree_block_info *bi;
1756                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1757                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1758                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1759         }
1760 }
1761
1762 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1763                                  struct btrfs_root *root,
1764                                  struct btrfs_delayed_ref_node *node,
1765                                  struct btrfs_delayed_extent_op *extent_op)
1766 {
1767         struct btrfs_key key;
1768         struct btrfs_path *path;
1769         struct btrfs_extent_item *ei;
1770         struct extent_buffer *leaf;
1771         u32 item_size;
1772         int ret;
1773         int err = 0;
1774
1775         path = btrfs_alloc_path();
1776         if (!path)
1777                 return -ENOMEM;
1778
1779         key.objectid = node->bytenr;
1780         key.type = BTRFS_EXTENT_ITEM_KEY;
1781         key.offset = node->num_bytes;
1782
1783         path->reada = 1;
1784         path->leave_spinning = 1;
1785         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1786                                 path, 0, 1);
1787         if (ret < 0) {
1788                 err = ret;
1789                 goto out;
1790         }
1791         if (ret > 0) {
1792                 err = -EIO;
1793                 goto out;
1794         }
1795
1796         leaf = path->nodes[0];
1797         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1798 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1799         if (item_size < sizeof(*ei)) {
1800                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1801                                              path, (u64)-1, 0);
1802                 if (ret < 0) {
1803                         err = ret;
1804                         goto out;
1805                 }
1806                 leaf = path->nodes[0];
1807                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1808         }
1809 #endif
1810         BUG_ON(item_size < sizeof(*ei));
1811         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1812         __run_delayed_extent_op(extent_op, leaf, ei);
1813
1814         btrfs_mark_buffer_dirty(leaf);
1815 out:
1816         btrfs_free_path(path);
1817         return err;
1818 }
1819
1820 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1821                                 struct btrfs_root *root,
1822                                 struct btrfs_delayed_ref_node *node,
1823                                 struct btrfs_delayed_extent_op *extent_op,
1824                                 int insert_reserved)
1825 {
1826         int ret = 0;
1827         struct btrfs_delayed_tree_ref *ref;
1828         struct btrfs_key ins;
1829         u64 parent = 0;
1830         u64 ref_root = 0;
1831
1832         ins.objectid = node->bytenr;
1833         ins.offset = node->num_bytes;
1834         ins.type = BTRFS_EXTENT_ITEM_KEY;
1835
1836         ref = btrfs_delayed_node_to_tree_ref(node);
1837         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1838                 parent = ref->parent;
1839         else
1840                 ref_root = ref->root;
1841
1842         BUG_ON(node->ref_mod != 1);
1843         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1844                 BUG_ON(!extent_op || !extent_op->update_flags ||
1845                        !extent_op->update_key);
1846                 ret = alloc_reserved_tree_block(trans, root,
1847                                                 parent, ref_root,
1848                                                 extent_op->flags_to_set,
1849                                                 &extent_op->key,
1850                                                 ref->level, &ins);
1851         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1852                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1853                                              node->num_bytes, parent, ref_root,
1854                                              ref->level, 0, 1, extent_op);
1855         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1856                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1857                                           node->num_bytes, parent, ref_root,
1858                                           ref->level, 0, 1, extent_op);
1859         } else {
1860                 BUG();
1861         }
1862         return ret;
1863 }
1864
1866 /* helper function to actually process a single delayed ref entry */
1867 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1868                                struct btrfs_root *root,
1869                                struct btrfs_delayed_ref_node *node,
1870                                struct btrfs_delayed_extent_op *extent_op,
1871                                int insert_reserved)
1872 {
1873         int ret;
1874         if (btrfs_delayed_ref_is_head(node)) {
1875                 struct btrfs_delayed_ref_head *head;
1876                 /*
1877                  * we've hit the end of the chain and we were supposed
1878                  * to insert this extent into the tree.  But it got
1879                  * deleted before we ever needed to insert it, so all
1880                  * we have to do is clean up the accounting
1881                  */
1882                 BUG_ON(extent_op);
1883                 head = btrfs_delayed_node_to_head(node);
1884                 if (insert_reserved) {
1885                         int mark_free = 0;
1886                         struct extent_buffer *must_clean = NULL;
1887
1888                         ret = pin_down_bytes(trans, root, NULL,
1889                                              node->bytenr, node->num_bytes,
1890                                              head->is_data, 1, &must_clean);
1891                         if (ret > 0)
1892                                 mark_free = 1;
1893
1894                         if (must_clean) {
1895                                 clean_tree_block(NULL, root, must_clean);
1896                                 btrfs_tree_unlock(must_clean);
1897                                 free_extent_buffer(must_clean);
1898                         }
1899                         if (head->is_data) {
1900                                 ret = btrfs_del_csums(trans, root,
1901                                                       node->bytenr,
1902                                                       node->num_bytes);
1903                                 BUG_ON(ret);
1904                         }
1905                         if (mark_free) {
1906                                 ret = btrfs_free_reserved_extent(root,
1907                                                         node->bytenr,
1908                                                         node->num_bytes);
1909                                 BUG_ON(ret);
1910                         }
1911                 }
1912                 mutex_unlock(&head->mutex);
1913                 return 0;
1914         }
1915
1916         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1917             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1918                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
1919                                            insert_reserved);
1920         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1921                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1922                 ret = run_delayed_data_ref(trans, root, node, extent_op,
1923                                            insert_reserved);
1924         else
1925                 BUG();
1926         return ret;
1927 }
1928
1929 static noinline struct btrfs_delayed_ref_node *
1930 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1931 {
1932         struct rb_node *node;
1933         struct btrfs_delayed_ref_node *ref;
1934         int action = BTRFS_ADD_DELAYED_REF;
1935 again:
1936         /*
1937          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
1938          * this prevents the ref count from going down to zero when
1939          * there are still pending delayed refs.
1940          */
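             /*
              * e.g. if this head has both an ADD and a DROP queued,
              * running the ADD first means the ref count never
              * transiently hits zero, which would free an extent that
              * the pending ADD still expects to exist.
              */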
1941         node = rb_prev(&head->node.rb_node);
1942         while (1) {
1943                 if (!node)
1944                         break;
1945                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
1946                                 rb_node);
1947                 if (ref->bytenr != head->node.bytenr)
1948                         break;
1949                 if (ref->action == action)
1950                         return ref;
1951                 node = rb_prev(node);
1952         }
1953         if (action == BTRFS_ADD_DELAYED_REF) {
1954                 action = BTRFS_DROP_DELAYED_REF;
1955                 goto again;
1956         }
1957         return NULL;
1958 }
1959
1960 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
1961                                        struct btrfs_root *root,
1962                                        struct list_head *cluster)
1963 {
1964         struct btrfs_delayed_ref_root *delayed_refs;
1965         struct btrfs_delayed_ref_node *ref;
1966         struct btrfs_delayed_ref_head *locked_ref = NULL;
1967         struct btrfs_delayed_extent_op *extent_op;
1968         int ret;
1969         int count = 0;
1970         int must_insert_reserved = 0;
1971
1972         delayed_refs = &trans->transaction->delayed_refs;
1973         while (1) {
1974                 if (!locked_ref) {
1975                         /* pick a new head ref from the cluster list */
1976                         if (list_empty(cluster))
1977                                 break;
1978
1979                         locked_ref = list_entry(cluster->next,
1980                                      struct btrfs_delayed_ref_head, cluster);
1981
1982                         /* grab the lock that says we are going to process
1983                          * all the refs for this head */
1984                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
1985
1986                         /*
1987                          * we may have dropped the spin lock to get the head
1988                          * mutex lock, and that might have given someone else
1989                          * time to free the head.  If that's true, it has been
1990                          * removed from our list and we can move on.
1991                          */
1992                         if (ret == -EAGAIN) {
1993                                 locked_ref = NULL;
1994                                 count++;
1995                                 continue;
1996                         }
1997                 }
1998
1999                 /*
2000                  * record the must insert reserved flag before we
2001                  * drop the spin lock.
2002                  */
2003                 must_insert_reserved = locked_ref->must_insert_reserved;
2004                 locked_ref->must_insert_reserved = 0;
2005
2006                 extent_op = locked_ref->extent_op;
2007                 locked_ref->extent_op = NULL;
2008
2009                 /*
2010                  * locked_ref is the head node, so we have to go one
2011                  * node back for any delayed ref updates
2012                  */
2013                 ref = select_delayed_ref(locked_ref);
2014                 if (!ref) {
2015                         /* All delayed refs have been processed; go ahead
2016                          * and send the head node to run_one_delayed_ref,
2017                          * so that any accounting fixes can happen
2018                          */
2019                         ref = &locked_ref->node;
2020
2021                         if (extent_op && must_insert_reserved) {
2022                                 kfree(extent_op);
2023                                 extent_op = NULL;
2024                         }
2025
2026                         if (extent_op) {
2027                                 spin_unlock(&delayed_refs->lock);
2028
2029                                 ret = run_delayed_extent_op(trans, root,
2030                                                             ref, extent_op);
2031                                 BUG_ON(ret);
2032                                 kfree(extent_op);
2033
2034                                 cond_resched();
2035                                 spin_lock(&delayed_refs->lock);
2036                                 continue;
2037                         }
2038
2039                         list_del_init(&locked_ref->cluster);
2040                         locked_ref = NULL;
2041                 }
2042
2043                 ref->in_tree = 0;
2044                 rb_erase(&ref->rb_node, &delayed_refs->root);
2045                 delayed_refs->num_entries--;
2046
2047                 spin_unlock(&delayed_refs->lock);
2048
2049                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2050                                           must_insert_reserved);
2051                 BUG_ON(ret);
2052
2053                 btrfs_put_delayed_ref(ref);
2054                 kfree(extent_op);
2055                 count++;
2056
2057                 cond_resched();
2058                 spin_lock(&delayed_refs->lock);
2059         }
2060         return count;
2061 }
2062
2063 /*
2064  * this starts processing the delayed reference count updates and
2065  * extent insertions we have queued up so far.  count can be
2066  * 0, which means to process everything in the tree at the start
2067  * of the run (but not newly added entries), or it can be some target
2068  * number you'd like to process.
2069  */
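     /*
      * a count of 0 is turned into delayed_refs->num_entries * 2, which
      * covers the entries queued at the start of the run with some slack
      * for refs merged while running; (unsigned long)-1 additionally
      * loops until the whole tree is drained, blocking on any heads that
      * are currently being processed.
      */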
2070 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2071                            struct btrfs_root *root, unsigned long count)
2072 {
2073         struct rb_node *node;
2074         struct btrfs_delayed_ref_root *delayed_refs;
2075         struct btrfs_delayed_ref_node *ref;
2076         struct list_head cluster;
2077         int ret;
2078         int run_all = count == (unsigned long)-1;
2079         int run_most = 0;
2080
2081         if (root == root->fs_info->extent_root)
2082                 root = root->fs_info->tree_root;
2083
2084         delayed_refs = &trans->transaction->delayed_refs;
2085         INIT_LIST_HEAD(&cluster);
2086 again:
2087         spin_lock(&delayed_refs->lock);
2088         if (count == 0) {
2089                 count = delayed_refs->num_entries * 2;
2090                 run_most = 1;
2091         }
2092         while (1) {
2093                 if (!(run_all || run_most) &&
2094                     delayed_refs->num_heads_ready < 64)
2095                         break;
2096
2097                 /*
2098                  * go find something we can process in the rbtree.  We start at
2099                  * the beginning of the tree, and then build a cluster
2100                  * of refs to process starting at the first one we are able to
2101                  * lock
2102                  */
2103                 ret = btrfs_find_ref_cluster(trans, &cluster,
2104                                              delayed_refs->run_delayed_start);
2105                 if (ret)
2106                         break;
2107
2108                 ret = run_clustered_refs(trans, root, &cluster);
2109                 BUG_ON(ret < 0);
2110
2111                 count -= min_t(unsigned long, ret, count);
2112
2113                 if (count == 0)
2114                         break;
2115         }
2116
2117         if (run_all) {
2118                 node = rb_first(&delayed_refs->root);
2119                 if (!node)
2120                         goto out;
2121                 count = (unsigned long)-1;
2122
2123                 while (node) {
2124                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2125                                        rb_node);
2126                         if (btrfs_delayed_ref_is_head(ref)) {
2127                                 struct btrfs_delayed_ref_head *head;
2128
2129                                 head = btrfs_delayed_node_to_head(ref);
2130                                 atomic_inc(&ref->refs);
2131
2132                                 spin_unlock(&delayed_refs->lock);
2133                                 mutex_lock(&head->mutex);
2134                                 mutex_unlock(&head->mutex);
2135
2136                                 btrfs_put_delayed_ref(ref);
2137                                 cond_resched();
2138                                 goto again;
2139                         }
2140                         node = rb_next(node);
2141                 }
2142                 spin_unlock(&delayed_refs->lock);
2143                 schedule_timeout(1);
2144                 goto again;
2145         }
2146 out:
2147         spin_unlock(&delayed_refs->lock);
2148         return 0;
2149 }
2150
2151 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2152                                 struct btrfs_root *root,
2153                                 u64 bytenr, u64 num_bytes, u64 flags,
2154                                 int is_data)
2155 {
2156         struct btrfs_delayed_extent_op *extent_op;
2157         int ret;
2158
2159         extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2160         if (!extent_op)
2161                 return -ENOMEM;
2162
2163         extent_op->flags_to_set = flags;
2164         extent_op->update_flags = 1;
2165         extent_op->update_key = 0;
2166         extent_op->is_data = is_data ? 1 : 0;
2167
2168         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2169         if (ret)
2170                 kfree(extent_op);
2171         return ret;
2172 }
2173
2174 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2175                                       struct btrfs_root *root,
2176                                       struct btrfs_path *path,
2177                                       u64 objectid, u64 offset, u64 bytenr)
2178 {
2179         struct btrfs_delayed_ref_head *head;
2180         struct btrfs_delayed_ref_node *ref;
2181         struct btrfs_delayed_data_ref *data_ref;
2182         struct btrfs_delayed_ref_root *delayed_refs;
2183         struct rb_node *node;
2184         int ret = 0;
2185
2186         ret = -ENOENT;
2187         delayed_refs = &trans->transaction->delayed_refs;
2188         spin_lock(&delayed_refs->lock);
2189         head = btrfs_find_delayed_ref_head(trans, bytenr);
2190         if (!head)
2191                 goto out;
2192
2193         if (!mutex_trylock(&head->mutex)) {
2194                 atomic_inc(&head->node.refs);
2195                 spin_unlock(&delayed_refs->lock);
2196
2197                 btrfs_release_path(root->fs_info->extent_root, path);
2198
2199                 mutex_lock(&head->mutex);
2200                 mutex_unlock(&head->mutex);
2201                 btrfs_put_delayed_ref(&head->node);
2202                 return -EAGAIN;
2203         }
2204
2205         node = rb_prev(&head->node.rb_node);
2206         if (!node)
2207                 goto out_unlock;
2208
2209         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2210
2211         if (ref->bytenr != bytenr)
2212                 goto out_unlock;
2213
2214         ret = 1;
2215         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2216                 goto out_unlock;
2217
2218         data_ref = btrfs_delayed_node_to_data_ref(ref);
2219
2220         node = rb_prev(node);
2221         if (node) {
2222                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2223                 if (ref->bytenr == bytenr)
2224                         goto out_unlock;
2225         }
2226
2227         if (data_ref->root != root->root_key.objectid ||
2228             data_ref->objectid != objectid || data_ref->offset != offset)
2229                 goto out_unlock;
2230
2231         ret = 0;
2232 out_unlock:
2233         mutex_unlock(&head->mutex);
2234 out:
2235         spin_unlock(&delayed_refs->lock);
2236         return ret;
2237 }
2238
2239 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2240                                         struct btrfs_root *root,
2241                                         struct btrfs_path *path,
2242                                         u64 objectid, u64 offset, u64 bytenr)
2243 {
2244         struct btrfs_root *extent_root = root->fs_info->extent_root;
2245         struct extent_buffer *leaf;
2246         struct btrfs_extent_data_ref *ref;
2247         struct btrfs_extent_inline_ref *iref;
2248         struct btrfs_extent_item *ei;
2249         struct btrfs_key key;
2250         u32 item_size;
2251         int ret;
2252
2253         key.objectid = bytenr;
2254         key.offset = (u64)-1;
2255         key.type = BTRFS_EXTENT_ITEM_KEY;
2256
2257         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2258         if (ret < 0)
2259                 goto out;
2260         BUG_ON(ret == 0);
2261
2262         ret = -ENOENT;
2263         if (path->slots[0] == 0)
2264                 goto out;
2265
2266         path->slots[0]--;
2267         leaf = path->nodes[0];
2268         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2269
2270         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2271                 goto out;
2272
2273         ret = 1;
2274         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2275 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2276         if (item_size < sizeof(*ei)) {
2277                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2278                 goto out;
2279         }
2280 #endif
2281         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2282
2283         if (item_size != sizeof(*ei) +
2284             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2285                 goto out;
2286
2287         if (btrfs_extent_generation(leaf, ei) <=
2288             btrfs_root_last_snapshot(&root->root_item))
2289                 goto out;
2290
2291         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2292         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2293             BTRFS_EXTENT_DATA_REF_KEY)
2294                 goto out;
2295
2296         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2297         if (btrfs_extent_refs(leaf, ei) !=
2298             btrfs_extent_data_ref_count(leaf, ref) ||
2299             btrfs_extent_data_ref_root(leaf, ref) !=
2300             root->root_key.objectid ||
2301             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2302             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2303                 goto out;
2304
2305         ret = 0;
2306 out:
2307         return ret;
2308 }
2309
2310 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2311                           struct btrfs_root *root,
2312                           u64 objectid, u64 offset, u64 bytenr)
2313 {
2314         struct btrfs_path *path;
2315         int ret;
2316         int ret2;
2317
2318         path = btrfs_alloc_path();
2319         if (!path)
2320                 return -ENOMEM;
2321
2322         do {
2323                 ret = check_committed_ref(trans, root, path, objectid,
2324                                           offset, bytenr);
2325                 if (ret && ret != -ENOENT)
2326                         goto out;
2327
2328                 ret2 = check_delayed_ref(trans, root, path, objectid,
2329                                          offset, bytenr);
2330         } while (ret2 == -EAGAIN);
2331
2332         if (ret2 && ret2 != -ENOENT) {
2333                 ret = ret2;
2334                 goto out;
2335         }
2336
2337         if (ret != -ENOENT || ret2 != -ENOENT)
2338                 ret = 0;
2339 out:
2340         btrfs_free_path(path);
2341         return ret;
2342 }
2343
2344 #if 0
2345 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2346                     struct extent_buffer *buf, u32 nr_extents)
2347 {
2348         struct btrfs_key key;
2349         struct btrfs_file_extent_item *fi;
2350         u64 root_gen;
2351         u32 nritems;
2352         int i;
2353         int level;
2354         int ret = 0;
2355         int shared = 0;
2356
2357         if (!root->ref_cows)
2358                 return 0;
2359
2360         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2361                 shared = 0;
2362                 root_gen = root->root_key.offset;
2363         } else {
2364                 shared = 1;
2365                 root_gen = trans->transid - 1;
2366         }
2367
2368         level = btrfs_header_level(buf);
2369         nritems = btrfs_header_nritems(buf);
2370
2371         if (level == 0) {
2372                 struct btrfs_leaf_ref *ref;
2373                 struct btrfs_extent_info *info;
2374
2375                 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2376                 if (!ref) {
2377                         ret = -ENOMEM;
2378                         goto out;
2379                 }
2380
2381                 ref->root_gen = root_gen;
2382                 ref->bytenr = buf->start;
2383                 ref->owner = btrfs_header_owner(buf);
2384                 ref->generation = btrfs_header_generation(buf);
2385                 ref->nritems = nr_extents;
2386                 info = ref->extents;
2387
2388                 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2389                         u64 disk_bytenr;
2390                         btrfs_item_key_to_cpu(buf, &key, i);
2391                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2392                                 continue;
2393                         fi = btrfs_item_ptr(buf, i,
2394                                             struct btrfs_file_extent_item);
2395                         if (btrfs_file_extent_type(buf, fi) ==
2396                             BTRFS_FILE_EXTENT_INLINE)
2397                                 continue;
2398                         disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2399                         if (disk_bytenr == 0)
2400                                 continue;
2401
2402                         info->bytenr = disk_bytenr;
2403                         info->num_bytes =
2404                                 btrfs_file_extent_disk_num_bytes(buf, fi);
2405                         info->objectid = key.objectid;
2406                         info->offset = key.offset;
2407                         info++;
2408                 }
2409
2410                 ret = btrfs_add_leaf_ref(root, ref, shared);
2411                 if (ret == -EEXIST && shared) {
2412                         struct btrfs_leaf_ref *old;
2413                         old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2414                         BUG_ON(!old);
2415                         btrfs_remove_leaf_ref(root, old);
2416                         btrfs_free_leaf_ref(root, old);
2417                         ret = btrfs_add_leaf_ref(root, ref, shared);
2418                 }
2419                 WARN_ON(ret);
2420                 btrfs_free_leaf_ref(root, ref);
2421         }
2422 out:
2423         return ret;
2424 }
2425
2426 /* when a block goes through cow, we update the reference counts of
2427  * everything that block points to.  The internal pointers of the block
2428  * can be in just about any order, and it is likely to have clusters of
2429  * things that are close together and clusters of things that are not.
2430  *
2431  * To help reduce the seeks that come with updating all of these reference
2432  * counts, sort them by byte number before actual updates are done.
2433  *
2434  * struct refsort is used to match byte number to slot in the btree block.
2435  * we sort based on the byte number and then use the slot to actually
2436  * find the item.
2437  *
2438  * struct refsort is smaller than struct btrfs_item and smaller than
2439  * struct btrfs_key_ptr.  Since we're currently limited to the page size
2440  * for a btree block, there's no way for a kmalloc of refsorts for a
2441  * single node to be bigger than a page.
2442  */
2443 struct refsort {
2444         u64 bytenr;
2445         u32 slot;
2446 };
2447
2448 /*
2449  * for passing into sort()
2450  */
2451 static int refsort_cmp(const void *a_void, const void *b_void)
2452 {
2453         const struct refsort *a = a_void;
2454         const struct refsort *b = b_void;
2455
2456         if (a->bytenr < b->bytenr)
2457                 return -1;
2458         if (a->bytenr > b->bytenr)
2459                 return 1;
2460         return 0;
2461 }
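     /*
      * an illustrative use (the call sites are compiled out along with
      * this block): sort the refsort array with sort() from
      * linux/sort.h, then walk it in bytenr order, e.g.
      *
      *      sort(sorted, nritems, sizeof(struct refsort),
      *           refsort_cmp, NULL);
      *
      * where "sorted" and "nritems" stand in for the caller's array and
      * element count.
      */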
2462 #endif
2463
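     /*
      * walk every pointer in a cow'ed buffer and add or drop one ref on
      * each extent it points to: file extents on leaves, child blocks on
      * nodes.  full_backref selects shared back refs (parent set to the
      * buffer's bytenr) instead of keyed back refs (parent == 0).
      */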
2464 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2465                            struct btrfs_root *root,
2466                            struct extent_buffer *buf,
2467                            int full_backref, int inc)
2468 {
2469         u64 bytenr;
2470         u64 num_bytes;
2471         u64 parent;
2472         u64 ref_root;
2473         u32 nritems;
2474         struct btrfs_key key;
2475         struct btrfs_file_extent_item *fi;
2476         int i;
2477         int level;
2478         int ret = 0;
2479         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2480                             u64, u64, u64, u64, u64, u64);
2481
2482         ref_root = btrfs_header_owner(buf);
2483         nritems = btrfs_header_nritems(buf);
2484         level = btrfs_header_level(buf);
2485
2486         if (!root->ref_cows && level == 0)
2487                 return 0;
2488
2489         if (inc)
2490                 process_func = btrfs_inc_extent_ref;
2491         else
2492                 process_func = btrfs_free_extent;
2493
2494         if (full_backref)
2495                 parent = buf->start;
2496         else
2497                 parent = 0;
2498
2499         for (i = 0; i < nritems; i++) {
2500                 if (level == 0) {
2501                         btrfs_item_key_to_cpu(buf, &key, i);
2502                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2503                                 continue;
2504                         fi = btrfs_item_ptr(buf, i,
2505                                             struct btrfs_file_extent_item);
2506                         if (btrfs_file_extent_type(buf, fi) ==
2507                             BTRFS_FILE_EXTENT_INLINE)
2508                                 continue;
2509                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2510                         if (bytenr == 0)
2511                                 continue;
2512
2513                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2514                         key.offset -= btrfs_file_extent_offset(buf, fi);
2515                         ret = process_func(trans, root, bytenr, num_bytes,
2516                                            parent, ref_root, key.objectid,
2517                                            key.offset);
2518                         if (ret)
2519                                 goto fail;
2520                 } else {
2521                         bytenr = btrfs_node_blockptr(buf, i);
2522                         num_bytes = btrfs_level_size(root, level - 1);
2523                         ret = process_func(trans, root, bytenr, num_bytes,
2524                                            parent, ref_root, level - 1, 0);
2525                         if (ret)
2526                                 goto fail;
2527                 }
2528         }
2529         return 0;
2530 fail:
2531         BUG();
2532         return ret;
2533 }
2534
2535 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2536                   struct extent_buffer *buf, int full_backref)
2537 {
2538         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2539 }
2540
2541 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2542                   struct extent_buffer *buf, int full_backref)
2543 {
2544         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2545 }
2546
2547 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2548                                  struct btrfs_root *root,
2549                                  struct btrfs_path *path,
2550                                  struct btrfs_block_group_cache *cache)
2551 {
2552         int ret;
2553         struct btrfs_root *extent_root = root->fs_info->extent_root;
2554         unsigned long bi;
2555         struct extent_buffer *leaf;
2556
2557         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2558         if (ret < 0)
2559                 goto fail;
2560         BUG_ON(ret);
2561
2562         leaf = path->nodes[0];
2563         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2564         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2565         btrfs_mark_buffer_dirty(leaf);
2566         btrfs_release_path(extent_root, path);
2567 fail:
2568         if (ret)
2569                 return ret;
2570         return 0;
2572 }
2573
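     /*
      * return the block group that follows cache in bytenr order with a
      * reference held, or NULL at the end, dropping the reference the
      * caller holds on cache.
      */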
2574 static struct btrfs_block_group_cache *
2575 next_block_group(struct btrfs_root *root,
2576                  struct btrfs_block_group_cache *cache)
2577 {
2578         struct rb_node *node;
2579         spin_lock(&root->fs_info->block_group_cache_lock);
2580         node = rb_next(&cache->cache_node);
2581         btrfs_put_block_group(cache);
2582         if (node) {
2583                 cache = rb_entry(node, struct btrfs_block_group_cache,
2584                                  cache_node);
2585                 atomic_inc(&cache->count);
2586         } else
2587                 cache = NULL;
2588         spin_unlock(&root->fs_info->block_group_cache_lock);
2589         return cache;
2590 }
2591
2592 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2593                                    struct btrfs_root *root)
2594 {
2595         struct btrfs_block_group_cache *cache;
2596         int err = 0;
2597         struct btrfs_path *path;
2598         u64 last = 0;
2599
2600         path = btrfs_alloc_path();
2601         if (!path)
2602                 return -ENOMEM;
2603
2604         while (1) {
2605                 if (last == 0) {
2606                         err = btrfs_run_delayed_refs(trans, root,
2607                                                      (unsigned long)-1);
2608                         BUG_ON(err);
2609                 }
2610
2611                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2612                 while (cache) {
2613                         if (cache->dirty)
2614                                 break;
2615                         cache = next_block_group(root, cache);
2616                 }
2617                 if (!cache) {
2618                         if (last == 0)
2619                                 break;
2620                         last = 0;
2621                         continue;
2622                 }
2623
2624                 cache->dirty = 0;
2625                 last = cache->key.objectid + cache->key.offset;
2626
2627                 err = write_one_cache_group(trans, root, path, cache);
2628                 BUG_ON(err);
2629                 btrfs_put_block_group(cache);
2630         }
2631
2632         btrfs_free_path(path);
2633         return 0;
2634 }
2635
2636 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2637 {
2638         struct btrfs_block_group_cache *block_group;
2639         int readonly = 0;
2640
2641         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2642         if (!block_group || block_group->ro)
2643                 readonly = 1;
2644         if (block_group)
2645                 btrfs_put_block_group(block_group);
2646         return readonly;
2647 }
2648
2649 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2650                              u64 total_bytes, u64 bytes_used,
2651                              struct btrfs_space_info **space_info)
2652 {
2653         struct btrfs_space_info *found;
2654
2655         found = __find_space_info(info, flags);
2656         if (found) {
2657                 spin_lock(&found->lock);
2658                 found->total_bytes += total_bytes;
2659                 found->bytes_used += bytes_used;
2660                 found->full = 0;
2661                 spin_unlock(&found->lock);
2662                 *space_info = found;
2663                 return 0;
2664         }
2665         found = kzalloc(sizeof(*found), GFP_NOFS);
2666         if (!found)
2667                 return -ENOMEM;
2668
2669         INIT_LIST_HEAD(&found->block_groups);
2670         init_rwsem(&found->groups_sem);
2671         spin_lock_init(&found->lock);
2672         found->flags = flags;
2673         found->total_bytes = total_bytes;
2674         found->bytes_used = bytes_used;
2675         found->bytes_pinned = 0;
2676         found->bytes_reserved = 0;
2677         found->bytes_readonly = 0;
2678         found->bytes_delalloc = 0;
2679         found->full = 0;
2680         found->force_alloc = 0;
2681         *space_info = found;
2682         list_add_rcu(&found->list, &info->space_info);
2683         atomic_set(&found->caching_threads, 0);
2684         return 0;
2685 }
2686
2687 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2688 {
2689         u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2690                                    BTRFS_BLOCK_GROUP_RAID1 |
2691                                    BTRFS_BLOCK_GROUP_RAID10 |
2692                                    BTRFS_BLOCK_GROUP_DUP);
2693         if (extra_flags) {
2694                 if (flags & BTRFS_BLOCK_GROUP_DATA)
2695                         fs_info->avail_data_alloc_bits |= extra_flags;
2696                 if (flags & BTRFS_BLOCK_GROUP_METADATA)
2697                         fs_info->avail_metadata_alloc_bits |= extra_flags;
2698                 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2699                         fs_info->avail_system_alloc_bits |= extra_flags;
2700         }
2701 }
2702
2703 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2704 {
2705         spin_lock(&cache->space_info->lock);
2706         spin_lock(&cache->lock);
2707         if (!cache->ro) {
2708                 cache->space_info->bytes_readonly += cache->key.offset -
2709                                         btrfs_block_group_used(&cache->item);
2710                 cache->ro = 1;
2711         }
2712         spin_unlock(&cache->lock);
2713         spin_unlock(&cache->space_info->lock);
2714 }
2715
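     /*
      * mask out RAID profiles the current number of rw devices cannot
      * satisfy, then reduce to a single profile when several are set:
      * RAID10 beats RAID1 beats DUP, and RAID0 loses to any of them.
      */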
2716 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2717 {
2718         u64 num_devices = root->fs_info->fs_devices->rw_devices;
2719
2720         if (num_devices == 1)
2721                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2722         if (num_devices < 4)
2723                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2724
2725         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2726             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2727                       BTRFS_BLOCK_GROUP_RAID10))) {
2728                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
2729         }
2730
2731         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2732             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2733                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2734         }
2735
2736         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2737             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2738              (flags & BTRFS_BLOCK_GROUP_RAID10) |
2739              (flags & BTRFS_BLOCK_GROUP_DUP)))
2740                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2741         return flags;
2742 }
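
/*
 * A worked example of the reduction above, with made-up flags: on a 2-device
 * filesystem, DATA|RAID0|RAID1|RAID10|DUP reduces as follows: RAID10 is
 * cleared because num_devices < 4, DUP is cleared because RAID1 is still
 * set, and RAID0 is cleared because RAID1 is still set, leaving DATA|RAID1.
 */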
2743
2744 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2745 {
2746         struct btrfs_fs_info *info = root->fs_info;
2747         u64 alloc_profile;
2748
2749         if (data) {
2750                 alloc_profile = info->avail_data_alloc_bits &
2751                         info->data_alloc_profile;
2752                 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2753         } else if (root == root->fs_info->chunk_root) {
2754                 alloc_profile = info->avail_system_alloc_bits &
2755                         info->system_alloc_profile;
2756                 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2757         } else {
2758                 alloc_profile = info->avail_metadata_alloc_bits &
2759                         info->metadata_alloc_profile;
2760                 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2761         }
2762
2763         return btrfs_reduce_alloc_profile(root, data);
2764 }
2765
2766 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2767 {
2768         u64 alloc_target;
2769
2770         alloc_target = btrfs_get_alloc_profile(root, 1);
2771         BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2772                                                        alloc_target);
2773 }
2774
2775 static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2776 {
2777         u64 num_bytes;
2778         int level;
2779
2780         level = BTRFS_MAX_LEVEL - 2;
2781         /*
2782          * NOTE: these calculations are absolutely the worst possible case.
2783          * This assumes that _every_ item we insert will require a new leaf, and
2784          * that the tree has grown to its maximum level size.
2785          */
2786
2787         /*
2788          * for every item we insert we could insert both an extent item and an
2789          * extent ref item.  Then for every item we insert, we will need to cow
2790          * the original leaf, plus the leaves to the left and right of it.
2791          *
2792          * Unless we are talking about the extent root, in which case we just want
2793          * the number of items * 2, since we just need the extent item plus its ref.
2794          */
2795         if (root == root->fs_info->extent_root)
2796                 num_bytes = num_items * 2;
2797         else
2798                 num_bytes = (num_items + (2 * num_items)) * 3;
2799
2800         /*
2801          * num_bytes is the total number of leaves we could need times the leaf
2802          * size, and then for every leaf we could end up cow'ing 2 nodes per
2803          * level, down to the leaf level.
2804          */
2805         num_bytes = (num_bytes * root->leafsize) +
2806                 (num_bytes * (level * 2)) * root->nodesize;
2807
2808         return num_bytes;
2809 }
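
/*
 * A rough worked example, assuming BTRFS_MAX_LEVEL == 8 and 4K leaf/node
 * sizes: for num_items == 1 on a non-extent root we budget
 * (1 + 2 * 1) * 3 = 9 leaves, and with level == 6 that becomes
 * 9 * 4096 + 9 * (6 * 2) * 4096 = 479232 bytes, roughly 468K reserved for a
 * single item in the absolute worst case.
 */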
2810
2811 /*
2812  * Unreserve metadata space for delalloc.  If we have fewer reserved credits
2813  * than we have outstanding extents, this function does nothing.
2814  */
2815 int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2816                                           struct inode *inode, int num_items)
2817 {
2818         struct btrfs_fs_info *info = root->fs_info;
2819         struct btrfs_space_info *meta_sinfo;
2820         u64 num_bytes;
2821         u64 alloc_target;
2822         bool bug = false;
2823
2824         /* get the space info for where the metadata will live */
2825         alloc_target = btrfs_get_alloc_profile(root, 0);
2826         meta_sinfo = __find_space_info(info, alloc_target);
2827
2828         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2829                                            num_items);
2830
2831         spin_lock(&meta_sinfo->lock);
2832         spin_lock(&BTRFS_I(inode)->accounting_lock);
2833         if (BTRFS_I(inode)->reserved_extents <=
2834             BTRFS_I(inode)->outstanding_extents) {
2835                 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2836                 spin_unlock(&meta_sinfo->lock);
2837                 return 0;
2838         }
2839         spin_unlock(&BTRFS_I(inode)->accounting_lock);
2840
2841         BTRFS_I(inode)->reserved_extents--;
2842         BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2843
2844         if (meta_sinfo->bytes_delalloc < num_bytes) {
2845                 bug = true;
2846                 meta_sinfo->bytes_delalloc = 0;
2847         } else {
2848                 meta_sinfo->bytes_delalloc -= num_bytes;
2849         }
2850         spin_unlock(&meta_sinfo->lock);
2851
2852         BUG_ON(bug);
2853
2854         return 0;
2855 }
2856
2857 static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2858 {
2859         u64 thresh;
2860
2861         thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2862                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2863                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2864                 meta_sinfo->bytes_may_use;
2865
2866         thresh = meta_sinfo->total_bytes - thresh;
2867         thresh *= 80;
2868         do_div(thresh, 100);
2869         if (thresh <= meta_sinfo->bytes_delalloc)
2870                 meta_sinfo->force_delalloc = 1;
2871         else
2872                 meta_sinfo->force_delalloc = 0;
2873 }
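
/*
 * Illustrative numbers for the threshold above: with 1024MB of metadata
 * space and 224MB already accounted to used/reserved/pinned/readonly/
 * super/root/may_use, 800MB remain, and force_delalloc is set once
 * bytes_delalloc reaches 80% of that, i.e. 640MB.
 */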
2874
2875 struct async_flush {
2876         struct btrfs_root *root;
2877         struct btrfs_space_info *info;
2878         struct btrfs_work work;
2879 };
2880
2881 static noinline void flush_delalloc_async(struct btrfs_work *work)
2882 {
2883         struct async_flush *async;
2884         struct btrfs_root *root;
2885         struct btrfs_space_info *info;
2886
2887         async = container_of(work, struct async_flush, work);
2888         root = async->root;
2889         info = async->info;
2890
2891         btrfs_start_delalloc_inodes(root, 0);
2892         wake_up(&info->flush_wait);
2893         btrfs_wait_ordered_extents(root, 0, 0);
2894
2895         spin_lock(&info->lock);
2896         info->flushing = 0;
2897         spin_unlock(&info->lock);
2898         wake_up(&info->flush_wait);
2899
2900         kfree(async);
2901 }
2902
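/*
 * wait_on_flush() parks the caller on flush_wait until the in-flight flush
 * clears info->flushing, or until enough space has been freed that the
 * space_info is no longer fully committed; the early-exit check keeps
 * waiters from sleeping longer than the ENOSPC condition actually lasts.
 */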
2903 static void wait_on_flush(struct btrfs_space_info *info)
2904 {
2905         DEFINE_WAIT(wait);
2906         u64 used;
2907
2908         while (1) {
2909                 prepare_to_wait(&info->flush_wait, &wait,
2910                                 TASK_UNINTERRUPTIBLE);
2911                 spin_lock(&info->lock);
2912                 if (!info->flushing) {
2913                         spin_unlock(&info->lock);
2914                         break;
2915                 }
2916
2917                 used = info->bytes_used + info->bytes_reserved +
2918                         info->bytes_pinned + info->bytes_readonly +
2919                         info->bytes_super + info->bytes_root +
2920                         info->bytes_may_use + info->bytes_delalloc;
2921                 if (used < info->total_bytes) {
2922                         spin_unlock(&info->lock);
2923                         break;
2924                 }
2925                 spin_unlock(&info->lock);
2926                 schedule();
2927         }
2928         finish_wait(&info->flush_wait, &wait);
2929 }
2930
2931 static void flush_delalloc(struct btrfs_root *root,
2932                                  struct btrfs_space_info *info)
2933 {
2934         struct async_flush *async;
2935         bool wait = false;
2936
2937         spin_lock(&info->lock);
2938
2939         if (!info->flushing) {
2940                 info->flushing = 1;
2941                 init_waitqueue_head(&info->flush_wait);
2942         } else {
2943                 wait = true;
2944         }
2945
2946         spin_unlock(&info->lock);
2947
2948         if (wait) {
2949                 wait_on_flush(info);
2950                 return;
2951         }
2952
2953         async = kzalloc(sizeof(*async), GFP_NOFS);
2954         if (!async)
2955                 goto flush;
2956
2957         async->root = root;
2958         async->info = info;
2959         async->work.func = flush_delalloc_async;
2960
2961         btrfs_queue_worker(&root->fs_info->enospc_workers,
2962                            &async->work);
2963         wait_on_flush(info);
2964         return;
2965
2966 flush:
2967         btrfs_start_delalloc_inodes(root, 0);
2968         btrfs_wait_ordered_extents(root, 0, 0);
2969
2970         spin_lock(&info->lock);
2971         info->flushing = 0;
2972         spin_unlock(&info->lock);
2973         wake_up(&info->flush_wait);
2974 }
2975
2976 static int maybe_allocate_chunk(struct btrfs_root *root,
2977                                  struct btrfs_space_info *info)
2978 {
2979         struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2980         struct btrfs_trans_handle *trans;
2981         bool wait = false;
2982         int ret = 0;
2983         u64 min_metadata;
2984         u64 free_space;
2985
2986         free_space = btrfs_super_total_bytes(disk_super);
2987         /*
2988          * we allow the metadata to grow to a max of the smaller of 10GB or 5%
2989          * of the space in the volume.
2990          */
2991         min_metadata = min((u64)10 * 1024 * 1024 * 1024,
2992                              div64_u64(free_space * 5, 100));
2993         if (info->total_bytes >= min_metadata) {
2994                 spin_unlock(&info->lock);
2995                 return 0;
2996         }
2997
2998         if (info->full) {
2999                 spin_unlock(&info->lock);
3000                 return 0;
3001         }
3002
3003         if (!info->allocating_chunk) {
3004                 info->force_alloc = 1;
3005                 info->allocating_chunk = 1;
3006                 init_waitqueue_head(&info->allocate_wait);
3007         } else {
3008                 wait = true;
3009         }
3010
3011         spin_unlock(&info->lock);
3012
3013         if (wait) {
3014                 wait_event(info->allocate_wait,
3015                            !info->allocating_chunk);
3016                 return 1;
3017         }
3018
3019         trans = btrfs_start_transaction(root, 1);
3020         if (!trans) {
3021                 ret = -ENOMEM;
3022                 goto out;
3023         }
3024
3025         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3026                              4096 + 2 * 1024 * 1024,
3027                              info->flags, 0);
3028         btrfs_end_transaction(trans, root);
3029         if (ret)
3030                 goto out;
3031 out:
3032         spin_lock(&info->lock);
3033         info->allocating_chunk = 0;
3034         spin_unlock(&info->lock);
3035         wake_up(&info->allocate_wait);
3036
3037         if (ret)
3038                 return 0;
3039         return 1;
3040 }
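
/*
 * For scale, with made-up volume sizes: on a 100GB volume min_metadata is
 * min(10GB, 5GB) = 5GB, while anything over 200GB hits the 10GB cap, so
 * metadata stops growing through this path once total_bytes reaches the
 * limit and callers fall back to flushing instead.
 */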
3041
3042 /*
3043  * Reserve metadata space for delalloc.  If the worst-case reservation does
 * not fit, this escalates in stages: allocate a new metadata chunk, flush
 * this inode's dirty pages, flush delalloc for the whole fs, and only then
 * return -ENOSPC.
3044  */
3045 int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3046                                         struct inode *inode, int num_items)
3047 {
3048         struct btrfs_fs_info *info = root->fs_info;
3049         struct btrfs_space_info *meta_sinfo;
3050         u64 num_bytes;
3051         u64 used;
3052         u64 alloc_target;
3053         int flushed = 0;
3054         int force_delalloc;
3055
3056         /* get the space info for where the metadata will live */
3057         alloc_target = btrfs_get_alloc_profile(root, 0);
3058         meta_sinfo = __find_space_info(info, alloc_target);
3059
3060         num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3061                                            num_items);
3062 again:
3063         spin_lock(&meta_sinfo->lock);
3064
3065         force_delalloc = meta_sinfo->force_delalloc;
3066
3067         if (unlikely(!meta_sinfo->bytes_root))
3068                 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3069
3070         if (!flushed)
3071                 meta_sinfo->bytes_delalloc += num_bytes;
3072
3073         used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3074                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3075                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3076                 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3077
3078         if (used > meta_sinfo->total_bytes) {
3079                 flushed++;
3080
3081                 if (flushed == 1) {
3082                         if (maybe_allocate_chunk(root, meta_sinfo))
3083                                 goto again;
3084                         flushed++;
3085                 } else {
3086                         spin_unlock(&meta_sinfo->lock);
3087                 }
3088
3089                 if (flushed == 2) {
3090                         filemap_flush(inode->i_mapping);
3091                         goto again;
3092                 } else if (flushed == 3) {
3093                         flush_delalloc(root, meta_sinfo);
3094                         goto again;
3095                 }
3096                 spin_lock(&meta_sinfo->lock);
3097                 meta_sinfo->bytes_delalloc -= num_bytes;
3098                 spin_unlock(&meta_sinfo->lock);
3099                 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3100                        BTRFS_I(inode)->outstanding_extents,
3101                        BTRFS_I(inode)->reserved_extents);
3102                 dump_space_info(meta_sinfo, 0, 0);
3103                 return -ENOSPC;
3104         }
3105
3106         BTRFS_I(inode)->reserved_extents++;
3107         check_force_delalloc(meta_sinfo);
3108         spin_unlock(&meta_sinfo->lock);
3109
3110         if (!flushed && force_delalloc)
3111                 filemap_flush(inode->i_mapping);
3112
3113         return 0;
3114 }
3115
3116 /*
3117  * unreserve the metadata space for num_items items.  This needs to
3118  * be paired with btrfs_reserve_metadata_space.
3119  *
3120  * NOTE: if you have the option, run this _AFTER_ you do a
3121  * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3122  * operations which will result in more used metadata, so we want to make sure we
3123  * can do that without issue.
3124  */
3125 int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3126 {
3127         struct btrfs_fs_info *info = root->fs_info;
3128         struct btrfs_space_info *meta_sinfo;
3129         u64 num_bytes;
3130         u64 alloc_target;
3131         bool bug = false;
3132
3133         /* get the space info for where the metadata will live */
3134         alloc_target = btrfs_get_alloc_profile(root, 0);
3135         meta_sinfo = __find_space_info(info, alloc_target);
3136
3137         num_bytes = calculate_bytes_needed(root, num_items);
3138
3139         spin_lock(&meta_sinfo->lock);
3140         if (meta_sinfo->bytes_may_use < num_bytes) {
3141                 bug = true;
3142                 meta_sinfo->bytes_may_use = 0;
3143         } else {
3144                 meta_sinfo->bytes_may_use -= num_bytes;
3145         }
3146         spin_unlock(&meta_sinfo->lock);
3147
3148         BUG_ON(bug);
3149
3150         return 0;
3151 }
3152
3153 /*
3154  * Reserve some metadata space for use.  We'll calculate the worst-case number
3155  * of bytes that would be needed to modify num_items items.  If we
3156  * have space, fantastic; if not, you get -ENOSPC.  Please call
3157  * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3158  * items you reserved, since whatever metadata you needed should have already
3159  * been allocated.
3160  *
3161  * This will commit the transaction to make more space if we don't have enough
3162  * metadata space.  The only time we don't do this is if we're reserving space
3163  * inside of a transaction, in which case we will just return -ENOSPC and it
3164  * is the caller's responsibility to handle it properly.
3165  */
3166 int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3167 {
3168         struct btrfs_fs_info *info = root->fs_info;
3169         struct btrfs_space_info *meta_sinfo;
3170         u64 num_bytes;
3171         u64 used;
3172         u64 alloc_target;
3173         int retries = 0;
3174
3175         /* get the space info for where the metadata will live */
3176         alloc_target = btrfs_get_alloc_profile(root, 0);
3177         meta_sinfo = __find_space_info(info, alloc_target);
3178
3179         num_bytes = calculate_bytes_needed(root, num_items);
3180 again:
3181         spin_lock(&meta_sinfo->lock);
3182
3183         if (unlikely(!meta_sinfo->bytes_root))
3184                 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3185
3186         if (!retries)
3187                 meta_sinfo->bytes_may_use += num_bytes;
3188
3189         used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3190                 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3191                 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3192                 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3193
3194         if (used > meta_sinfo->total_bytes) {
3195                 retries++;
3196                 if (retries == 1) {
3197                         if (maybe_allocate_chunk(root, meta_sinfo))
3198                                 goto again;
3199                         retries++;
3200                 } else {
3201                         spin_unlock(&meta_sinfo->lock);
3202                 }
3203
3204                 if (retries == 2) {
3205                         flush_delalloc(root, meta_sinfo);
3206                         goto again;
3207                 }
3208                 spin_lock(&meta_sinfo->lock);
3209                 meta_sinfo->bytes_may_use -= num_bytes;
3210                 spin_unlock(&meta_sinfo->lock);
3211
3212                 dump_space_info(meta_sinfo, 0, 0);
3213                 return -ENOSPC;
3214         }
3215
3216         check_force_delalloc(meta_sinfo);
3217         spin_unlock(&meta_sinfo->lock);
3218
3219         return 0;
3220 }
3221
3222 /*
3223  * This will check the space that the inode allocates from to make sure we have
3224  * enough space for bytes.
3225  */
3226 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3227                                 u64 bytes)
3228 {
3229         struct btrfs_space_info *data_sinfo;
3230         int ret = 0, committed = 0;
3231
3232         /* make sure bytes are sectorsize aligned */
3233         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3234
3235         data_sinfo = BTRFS_I(inode)->space_info;
3236         if (!data_sinfo)
3237                 goto alloc;
3238
3239 again:
3240         /* make sure we have enough space to handle the data first */
3241         spin_lock(&data_sinfo->lock);
3242         if (data_sinfo->total_bytes - data_sinfo->bytes_used -
3243             data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
3244             data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
3245             data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
3246                 struct btrfs_trans_handle *trans;
3247
3248                 /*
3249                  * if we don't have enough free bytes in this space then we need
3250                  * to alloc a new chunk.
3251                  */
3252                 if (!data_sinfo->full) {
3253                         u64 alloc_target;
3254
3255                         data_sinfo->force_alloc = 1;
3256                         spin_unlock(&data_sinfo->lock);
3257 alloc:
3258                         alloc_target = btrfs_get_alloc_profile(root, 1);
3259                         trans = btrfs_start_transaction(root, 1);
3260                         if (!trans)
3261                                 return -ENOMEM;
3262
3263                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3264                                              bytes + 2 * 1024 * 1024,
3265                                              alloc_target, 0);
3266                         btrfs_end_transaction(trans, root);
3267                         if (ret)
3268                                 return ret;
3269
3270                         if (!data_sinfo) {
3271                                 btrfs_set_inode_space_info(root, inode);
3272                                 data_sinfo = BTRFS_I(inode)->space_info;
3273                         }
3274                         goto again;
3275                 }
3276                 spin_unlock(&data_sinfo->lock);
3277
3278                 /* commit the current transaction and try again */
3279                 if (!committed && !root->fs_info->open_ioctl_trans) {
3280                         committed = 1;
3281                         trans = btrfs_join_transaction(root, 1);
3282                         if (!trans)
3283                                 return -ENOMEM;
3284                         ret = btrfs_commit_transaction(trans, root);
3285                         if (ret)
3286                                 return ret;
3287                         goto again;
3288                 }
3289
3290                 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
3291                        ", %llu bytes_used, %llu bytes_reserved, "
3292                        "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
3293                        "%llu total\n", (unsigned long long)bytes,
3294                        (unsigned long long)data_sinfo->bytes_delalloc,
3295                        (unsigned long long)data_sinfo->bytes_used,
3296                        (unsigned long long)data_sinfo->bytes_reserved,
3297                        (unsigned long long)data_sinfo->bytes_pinned,
3298                        (unsigned long long)data_sinfo->bytes_readonly,
3299                        (unsigned long long)data_sinfo->bytes_may_use,
3300                        (unsigned long long)data_sinfo->total_bytes);
3301                 return -ENOSPC;
3302         }
3303         data_sinfo->bytes_may_use += bytes;
3304         BTRFS_I(inode)->reserved_bytes += bytes;
3305         spin_unlock(&data_sinfo->lock);
3306
3307         return 0;
3308 }
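
/*
 * A minimal sketch of the expected calling pattern (do_the_write() is a
 * hypothetical helper, not part of this file):
 *
 *	ret = btrfs_check_data_free_space(root, inode, len);
 *	if (ret)
 *		return ret;
 *	ret = do_the_write(inode, len);
 *	if (ret)
 *		btrfs_free_reserved_data_space(root, inode, len);
 *
 * Every successful reservation must either be consumed by the delalloc
 * accounting below or handed back via btrfs_free_reserved_data_space().
 */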
3309
3310 /*
3311  * if there was an error for whatever reason after calling
3312  * btrfs_check_data_free_space, call this so we can cleanup the counters.
3313  * btrfs_check_data_free_space, call this so we can clean up the counters.
3314 void btrfs_free_reserved_data_space(struct btrfs_root *root,
3315                                     struct inode *inode, u64 bytes)
3316 {
3317         struct btrfs_space_info *data_sinfo;
3318
3319         /* make sure bytes are sectorsize aligned */
3320         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3321
3322         data_sinfo = BTRFS_I(inode)->space_info;
3323         spin_lock(&data_sinfo->lock);
3324         data_sinfo->bytes_may_use -= bytes;
3325         BTRFS_I(inode)->reserved_bytes -= bytes;
3326         spin_unlock(&data_sinfo->lock);
3327 }
3328
3329 /* called when we are adding a delalloc extent to the inode's io_tree */
3330 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3331                                   u64 bytes)
3332 {
3333         struct btrfs_space_info *data_sinfo;
3334
3335         /* get the space info for where this inode will be storing its data */
3336         data_sinfo = BTRFS_I(inode)->space_info;
3337
3338         /* make sure we have enough space to handle the data first */
3339         spin_lock(&data_sinfo->lock);
3340         data_sinfo->bytes_delalloc += bytes;
3341
3342         /*
3343          * we are adding a delalloc extent without calling
3344          * btrfs_check_data_free_space first.  This happens on a weird
3345          * writepage condition, but shouldn't hurt our accounting
3346          */
3347         if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3348                 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3349                 BTRFS_I(inode)->reserved_bytes = 0;
3350         } else {
3351                 data_sinfo->bytes_may_use -= bytes;
3352                 BTRFS_I(inode)->reserved_bytes -= bytes;
3353         }
3354
3355         spin_unlock(&data_sinfo->lock);
3356 }
3357
3358 /* called when we are clearing a delalloc extent from the inode's io_tree */
3359 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3360                               u64 bytes)
3361 {
3362         struct btrfs_space_info *info;
3363
3364         info = BTRFS_I(inode)->space_info;
3365
3366         spin_lock(&info->lock);
3367         info->bytes_delalloc -= bytes;
3368         spin_unlock(&info->lock);
3369 }
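
/*
 * Taken together, the three helpers above move a buffered write through the
 * data space_info counters: btrfs_check_data_free_space() charges
 * bytes_may_use, btrfs_delalloc_reserve_space() shifts that charge into
 * bytes_delalloc when the delalloc extent is recorded, and
 * btrfs_delalloc_free_space() drops it once the extent is cleared, while
 * actual on-disk usage is accounted separately by update_block_group()
 * below.
 */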
3370
3371 static void force_metadata_allocation(struct btrfs_fs_info *info)
3372 {
3373         struct list_head *head = &info->space_info;
3374         struct btrfs_space_info *found;
3375
3376         rcu_read_lock();
3377         list_for_each_entry_rcu(found, head, list) {
3378                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3379                         found->force_alloc = 1;
3380         }
3381         rcu_read_unlock();
3382 }
3383
3384 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3385                           struct btrfs_root *extent_root, u64 alloc_bytes,
3386                           u64 flags, int force)
3387 {
3388         struct btrfs_space_info *space_info;
3389         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3390         u64 thresh;
3391         int ret = 0;
3392
3393         mutex_lock(&fs_info->chunk_mutex);
3394
3395         flags = btrfs_reduce_alloc_profile(extent_root, flags);
3396
3397         space_info = __find_space_info(extent_root->fs_info, flags);
3398         if (!space_info) {
3399                 ret = update_space_info(extent_root->fs_info, flags,
3400                                         0, 0, &space_info);
3401                 BUG_ON(ret);
3402         }
3403         BUG_ON(!space_info);
3404
3405         spin_lock(&space_info->lock);
3406         if (space_info->force_alloc)
3407                 force = 1;
3408         if (space_info->full) {
3409                 spin_unlock(&space_info->lock);
3410                 goto out;
3411         }
3412
3413         thresh = space_info->total_bytes - space_info->bytes_readonly;
3414         thresh = div_factor(thresh, 8);
3415         if (!force &&
3416            (space_info->bytes_used + space_info->bytes_pinned +
3417             space_info->bytes_reserved + alloc_bytes) < thresh) {
3418                 spin_unlock(&space_info->lock);
3419                 goto out;
3420         }
3421         spin_unlock(&space_info->lock);
3422
3423         /*
3424          * if we're doing a data chunk, go ahead and make sure that
3425          * we keep a reasonable number of metadata chunks allocated in the
3426          * FS as well.
3427          */
3428         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3429                 fs_info->data_chunk_allocations++;
3430                 if (!(fs_info->data_chunk_allocations %
3431                       fs_info->metadata_ratio))
3432                         force_metadata_allocation(fs_info);
3433         }
3434
3435         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3436         spin_lock(&space_info->lock);
3437         if (ret)
3438                 space_info->full = 1;
3439         space_info->force_alloc = 0;
3440         spin_unlock(&space_info->lock);
3441 out:
3442         mutex_unlock(&extent_root->fs_info->chunk_mutex);
3443         return ret;
3444 }
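
/*
 * Illustrative threshold math for do_chunk_alloc(), with made-up sizes:
 * div_factor(thresh, 8) keeps 80% of the writable space, so with 10GB
 * writable in a space_info a non-forced allocation is skipped until
 * bytes_used + bytes_pinned + bytes_reserved + alloc_bytes reaches 8GB.
 */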
3445
3446 static int update_block_group(struct btrfs_trans_handle *trans,
3447                               struct btrfs_root *root,
3448                               u64 bytenr, u64 num_bytes, int alloc,
3449                               int mark_free)
3450 {
3451         struct btrfs_block_group_cache *cache;
3452         struct btrfs_fs_info *info = root->fs_info;
3453         u64 total = num_bytes;
3454         u64 old_val;
3455         u64 byte_in_group;
3456
3457         /* block accounting for super block */
3458         spin_lock(&info->delalloc_lock);
3459         old_val = btrfs_super_bytes_used(&info->super_copy);
3460         if (alloc)
3461                 old_val += num_bytes;
3462         else
3463                 old_val -= num_bytes;
3464         btrfs_set_super_bytes_used(&info->super_copy, old_val);
3465         spin_unlock(&info->delalloc_lock);
3466
3467         while (total) {
3468                 cache = btrfs_lookup_block_group(info, bytenr);
3469                 if (!cache)
3470                         return -1;
3471                 byte_in_group = bytenr - cache->key.objectid;
3472                 WARN_ON(byte_in_group > cache->key.offset);
3473
3474                 spin_lock(&cache->space_info->lock);
3475                 spin_lock(&cache->lock);
3476                 cache->dirty = 1;
3477                 old_val = btrfs_block_group_used(&cache->item);
3478                 num_bytes = min(total, cache->key.offset - byte_in_group);
3479                 if (alloc) {
3480                         old_val += num_bytes;
3481                         btrfs_set_block_group_used(&cache->item, old_val);
3482                         cache->reserved -= num_bytes;
3483                         cache->space_info->bytes_used += num_bytes;
3484                         cache->space_info->bytes_reserved -= num_bytes;
3485                         if (cache->ro)
3486                                 cache->space_info->bytes_readonly -= num_bytes;
3487                         spin_unlock(&cache->lock);
3488                         spin_unlock(&cache->space_info->lock);
3489                 } else {
3490                         old_val -= num_bytes;
3491                         cache->space_info->bytes_used -= num_bytes;
3492                         if (cache->ro)
3493                                 cache->space_info->bytes_readonly += num_bytes;
3494                         btrfs_set_block_group_used(&cache->item, old_val);
3495                         spin_unlock(&cache->lock);
3496                         spin_unlock(&cache->space_info->lock);
3497                         if (mark_free) {
3498                                 int ret;
3499
3500                                 ret = btrfs_discard_extent(root, bytenr,
3501                                                            num_bytes);
3502                                 WARN_ON(ret);
3503
3504                                 ret = btrfs_add_free_space(cache, bytenr,
3505                                                            num_bytes);
3506                                 WARN_ON(ret);
3507                         }
3508                 }
3509                 btrfs_put_block_group(cache);
3510                 total -= num_bytes;
3511                 bytenr += num_bytes;
3512         }
3513         return 0;
3514 }
3515
3516 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3517 {
3518         struct btrfs_block_group_cache *cache;
3519         u64 bytenr;
3520
3521         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
3522         if (!cache)
3523                 return 0;
3524
3525         bytenr = cache->key.objectid;
3526         btrfs_put_block_group(cache);
3527
3528         return bytenr;
3529 }
3530
3531 /*
3532  * this function must be called within a transaction
3533  */
3534 int btrfs_pin_extent(struct btrfs_root *root,
3535                      u64 bytenr, u64 num_bytes, int reserved)
3536 {
3537         struct btrfs_fs_info *fs_info = root->fs_info;
3538         struct btrfs_block_group_cache *cache;
3539
3540         cache = btrfs_lookup_block_group(fs_info, bytenr);
3541         BUG_ON(!cache);
3542
3543         spin_lock(&cache->space_info->lock);
3544         spin_lock(&cache->lock);
3545         cache->pinned += num_bytes;
3546         cache->space_info->bytes_pinned += num_bytes;
3547         if (reserved) {
3548                 cache->reserved -= num_bytes;
3549                 cache->space_info->bytes_reserved -= num_bytes;
3550         }
3551         spin_unlock(&cache->lock);
3552         spin_unlock(&cache->space_info->lock);
3553
3554         btrfs_put_block_group(cache);
3555
3556         set_extent_dirty(fs_info->pinned_extents,
3557                          bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3558         return 0;
3559 }
3560
3561 static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3562                                    u64 num_bytes, int reserve)
3563 {
3564         spin_lock(&cache->space_info->lock);
3565         spin_lock(&cache->lock);
3566         if (reserve) {
3567                 cache->reserved += num_bytes;
3568                 cache->space_info->bytes_reserved += num_bytes;
3569         } else {
3570                 cache->reserved -= num_bytes;
3571                 cache->space_info->bytes_reserved -= num_bytes;
3572         }
3573         spin_unlock(&cache->lock);
3574         spin_unlock(&cache->space_info->lock);
3575         return 0;
3576 }
3577
3578 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3579                                 struct btrfs_root *root)
3580 {
3581         struct btrfs_fs_info *fs_info = root->fs_info;
3582         struct btrfs_caching_control *next;
3583         struct btrfs_caching_control *caching_ctl;
3584         struct btrfs_block_group_cache *cache;
3585
3586         down_write(&fs_info->extent_commit_sem);
3587
3588         list_for_each_entry_safe(caching_ctl, next,
3589                                  &fs_info->caching_block_groups, list) {
3590                 cache = caching_ctl->block_group;
3591                 if (block_group_cache_done(cache)) {
3592                         cache->last_byte_to_unpin = (u64)-1;
3593                         list_del_init(&caching_ctl->list);
3594                         put_caching_control(caching_ctl);
3595                 } else {
3596                         cache->last_byte_to_unpin = caching_ctl->progress;
3597                 }
3598         }
3599
3600         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3601                 fs_info->pinned_extents = &fs_info->freed_extents[1];
3602         else
3603                 fs_info->pinned_extents = &fs_info->freed_extents[0];
3604
3605         up_write(&fs_info->extent_commit_sem);
3606         return 0;
3607 }
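
/*
 * The flip between freed_extents[0] and freed_extents[1] above acts as a
 * double buffer, keeping extents pinned by the committing transaction apart
 * from pins recorded by the next one: btrfs_finish_extent_commit() below
 * walks whichever tree is *not* currently pinned_extents and unpins it.
 */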
3608
3609 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3610 {
3611         struct btrfs_fs_info *fs_info = root->fs_info;
3612         struct btrfs_block_group_cache *cache = NULL;
3613         u64 len;
3614
3615         while (start <= end) {
3616                 if (!cache ||
3617                     start >= cache->key.objectid + cache->key.offset) {
3618                         if (cache)
3619                                 btrfs_put_block_group(cache);
3620                         cache = btrfs_lookup_block_group(fs_info, start);
3621                         BUG_ON(!cache);
3622                 }
3623
3624                 len = cache->key.objectid + cache->key.offset - start;
3625                 len = min(len, end + 1 - start);
3626
3627                 if (start < cache->last_byte_to_unpin) {
3628                         len = min(len, cache->last_byte_to_unpin - start);
3629                         btrfs_add_free_space(cache, start, len);
3630                 }
3631
3632                 spin_lock(&cache->space_info->lock);
3633                 spin_lock(&cache->lock);
3634                 cache->pinned -= len;
3635                 cache->space_info->bytes_pinned -= len;
3636                 spin_unlock(&cache->lock);
3637                 spin_unlock(&cache->space_info->lock);
3638
3639                 start += len;
3640         }
3641
3642         if (cache)
3643                 btrfs_put_block_group(cache);
3644         return 0;
3645 }
3646
3647 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3648                                struct btrfs_root *root)
3649 {
3650         struct btrfs_fs_info *fs_info = root->fs_info;
3651         struct extent_io_tree *unpin;
3652         u64 start;
3653         u64 end;
3654         int ret;
3655
3656         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3657                 unpin = &fs_info->freed_extents[1];
3658         else
3659                 unpin = &fs_info->freed_extents[0];
3660
3661         while (1) {
3662                 ret = find_first_extent_bit(unpin, 0, &start, &end,
3663                                             EXTENT_DIRTY);
3664                 if (ret)
3665                         break;
3666
3667                 ret = btrfs_discard_extent(root, start, end + 1 - start);
3668
3669                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3670                 unpin_extent_range(root, start, end);
3671                 cond_resched();
3672         }
3673
3674         return ret;
3675 }
3676
3677 static int pin_down_bytes(struct btrfs_trans_handle *trans,
3678                           struct btrfs_root *root,
3679                           struct btrfs_path *path,
3680                           u64 bytenr, u64 num_bytes,
3681                           int is_data, int reserved,
3682                           struct extent_buffer **must_clean)
3683 {
3684         int err = 0;
3685         struct extent_buffer *buf;
3686
3687         if (is_data)
3688                 goto pinit;
3689
3690         /*
3691          * discard is sloooow, and so triggering discards on
3692          * individual btree blocks isn't a good plan.  Just
3693          * pin everything in discard mode.
3694          */
3695         if (btrfs_test_opt(root, DISCARD))
3696                 goto pinit;
3697
3698         buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3699         if (!buf)
3700                 goto pinit;
3701
3702         /* we can reuse a block if it hasn't been written
3703          * and it is from this transaction.  We can't
3704          * reuse anything from the tree log root because
3705          * it has tiny sub-transactions.
3706          */
3707         if (btrfs_buffer_uptodate(buf, 0) &&
3708             btrfs_try_tree_lock(buf)) {
3709                 u64 header_owner = btrfs_header_owner(buf);
3710                 u64 header_transid = btrfs_header_generation(buf);
3711                 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3712                     header_transid == trans->transid &&
3713                     !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3714                         *must_clean = buf;
3715                         return 1;
3716                 }
3717                 btrfs_tree_unlock(buf);
3718         }
3719         free_extent_buffer(buf);
3720 pinit:
3721         if (path)
3722                 btrfs_set_path_blocking(path);
3723         /* unlocks the pinned mutex */
3724         btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3725
3726         BUG_ON(err < 0);
3727         return 0;
3728 }
3729
3730 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3731                                 struct btrfs_root *root,
3732                                 u64 bytenr, u64 num_bytes, u64 parent,
3733                                 u64 root_objectid, u64 owner_objectid,
3734                                 u64 owner_offset, int refs_to_drop,
3735                                 struct btrfs_delayed_extent_op *extent_op)
3736 {
3737         struct btrfs_key key;
3738         struct btrfs_path *path;
3739         struct btrfs_fs_info *info = root->fs_info;
3740         struct btrfs_root *extent_root = info->extent_root;
3741         struct extent_buffer *leaf;
3742         struct btrfs_extent_item *ei;
3743         struct btrfs_extent_inline_ref *iref;
3744         int ret;
3745         int is_data;
3746         int extent_slot = 0;
3747         int found_extent = 0;
3748         int num_to_del = 1;
3749         u32 item_size;
3750         u64 refs;
3751
3752         path = btrfs_alloc_path();
3753         if (!path)
3754                 return -ENOMEM;
3755
3756         path->reada = 1;
3757         path->leave_spinning = 1;
3758
3759         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3760         BUG_ON(!is_data && refs_to_drop != 1);
3761
3762         ret = lookup_extent_backref(trans, extent_root, path, &iref,
3763                                     bytenr, num_bytes, parent,
3764                                     root_objectid, owner_objectid,
3765                                     owner_offset);
3766         if (ret == 0) {
3767                 extent_slot = path->slots[0];
3768                 while (extent_slot >= 0) {
3769                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3770                                               extent_slot);
3771                         if (key.objectid != bytenr)
3772                                 break;
3773                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3774                             key.offset == num_bytes) {
3775                                 found_extent = 1;
3776                                 break;
3777                         }
3778                         if (path->slots[0] - extent_slot > 5)
3779                                 break;
3780                         extent_slot--;
3781                 }
3782 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3783                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3784                 if (found_extent && item_size < sizeof(*ei))
3785                         found_extent = 0;
3786 #endif
3787                 if (!found_extent) {
3788                         BUG_ON(iref);
3789                         ret = remove_extent_backref(trans, extent_root, path,
3790                                                     NULL, refs_to_drop,
3791                                                     is_data);
3792                         BUG_ON(ret);
3793                         btrfs_release_path(extent_root, path);
3794                         path->leave_spinning = 1;
3795
3796                         key.objectid = bytenr;
3797                         key.type = BTRFS_EXTENT_ITEM_KEY;
3798                         key.offset = num_bytes;
3799
3800                         ret = btrfs_search_slot(trans, extent_root,
3801                                                 &key, path, -1, 1);
3802                         if (ret) {
3803                                 printk(KERN_ERR "umm, got %d back from search"
3804                                        ", was looking for %llu\n", ret,
3805                                        (unsigned long long)bytenr);
3806                                 btrfs_print_leaf(extent_root, path->nodes[0]);
3807                         }
3808                         BUG_ON(ret);
3809                         extent_slot = path->slots[0];
3810                 }
3811         } else {
3812                 btrfs_print_leaf(extent_root, path->nodes[0]);
3813                 WARN_ON(1);
3814                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
3815                        "parent %llu root %llu  owner %llu offset %llu\n",
3816                        (unsigned long long)bytenr,
3817                        (unsigned long long)parent,
3818                        (unsigned long long)root_objectid,
3819                        (unsigned long long)owner_objectid,
3820                        (unsigned long long)owner_offset);
3821         }
3822
3823         leaf = path->nodes[0];
3824         item_size = btrfs_item_size_nr(leaf, extent_slot);
3825 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3826         if (item_size < sizeof(*ei)) {
3827                 BUG_ON(found_extent || extent_slot != path->slots[0]);
3828                 ret = convert_extent_item_v0(trans, extent_root, path,
3829                                              owner_objectid, 0);
3830                 BUG_ON(ret < 0);
3831
3832                 btrfs_release_path(extent_root, path);
3833                 path->leave_spinning = 1;
3834
3835                 key.objectid = bytenr;
3836                 key.type = BTRFS_EXTENT_ITEM_KEY;
3837                 key.offset = num_bytes;
3838
3839                 ret = btrfs_search_slot(trans, extent_root, &key, path,
3840                                         -1, 1);
3841                 if (ret) {
3842                         printk(KERN_ERR "umm, got %d back from search"
3843                                ", was looking for %llu\n", ret,
3844                                (unsigned long long)bytenr);
3845                         btrfs_print_leaf(extent_root, path->nodes[0]);
3846                 }
3847                 BUG_ON(ret);
3848                 extent_slot = path->slots[0];
3849                 leaf = path->nodes[0];
3850                 item_size = btrfs_item_size_nr(leaf, extent_slot);
3851         }
3852 #endif
3853         BUG_ON(item_size < sizeof(*ei));
3854         ei = btrfs_item_ptr(leaf, extent_slot,
3855                             struct btrfs_extent_item);
3856         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3857                 struct btrfs_tree_block_info *bi;
3858                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3859                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3860                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3861         }
3862
3863         refs = btrfs_extent_refs(leaf, ei);
3864         BUG_ON(refs < refs_to_drop);
3865         refs -= refs_to_drop;
3866
3867         if (refs > 0) {
3868                 if (extent_op)
3869                         __run_delayed_extent_op(extent_op, leaf, ei);
3870                 /*
3871                  * In the case of inline back ref, reference count will
3872                  * be updated by remove_extent_backref
3873                  */
3874                 if (iref) {
3875                         BUG_ON(!found_extent);
3876                 } else {
3877                         btrfs_set_extent_refs(leaf, ei, refs);
3878                         btrfs_mark_buffer_dirty(leaf);
3879                 }
3880                 if (found_extent) {
3881                         ret = remove_extent_backref(trans, extent_root, path,
3882                                                     iref, refs_to_drop,
3883                                                     is_data);
3884                         BUG_ON(ret);
3885                 }
3886         } else {
3887                 int mark_free = 0;
3888                 struct extent_buffer *must_clean = NULL;
3889
3890                 if (found_extent) {
3891                         BUG_ON(is_data && refs_to_drop !=
3892                                extent_data_ref_count(root, path, iref));
3893                         if (iref) {
3894                                 BUG_ON(path->slots[0] != extent_slot);
3895                         } else {
3896                                 BUG_ON(path->slots[0] != extent_slot + 1);
3897                                 path->slots[0] = extent_slot;
3898                                 num_to_del = 2;
3899                         }
3900                 }
3901
3902                 ret = pin_down_bytes(trans, root, path, bytenr,
3903                                      num_bytes, is_data, 0, &must_clean);
3904                 if (ret > 0)
3905                         mark_free = 1;
3906                 BUG_ON(ret < 0);
3907                 /*
3908                  * it is going to be very rare for someone to be waiting
3909                  * on the block we're freeing.  del_items might need to
3910                  * schedule, so rather than get fancy, just force it
3911                  * to blocking here
3912                  */
3913                 if (must_clean)
3914                         btrfs_set_lock_blocking(must_clean);
3915
3916                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3917                                       num_to_del);
3918                 BUG_ON(ret);
3919                 btrfs_release_path(extent_root, path);
3920
3921                 if (must_clean) {
3922                         clean_tree_block(NULL, root, must_clean);
3923                         btrfs_tree_unlock(must_clean);
3924                         free_extent_buffer(must_clean);
3925                 }
3926
3927                 if (is_data) {
3928                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3929                         BUG_ON(ret);
3930                 } else {
3931                         invalidate_mapping_pages(info->btree_inode->i_mapping,
3932                              bytenr >> PAGE_CACHE_SHIFT,
3933                              (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3934                 }
3935
3936                 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
3937                                          mark_free);
3938                 BUG_ON(ret);
3939         }
3940         btrfs_free_path(path);
3941         return ret;
3942 }
3943
3944 /*
3945  * when we free an extent, it is possible (and likely) that we free the last
3946  * delayed ref for that extent as well.  This searches the delayed ref tree for
3947  * a given extent, and if there are no other delayed refs to be processed, it
3948  * removes it from the tree.
3949  */
3950 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3951                                       struct btrfs_root *root, u64 bytenr)
3952 {
3953         struct btrfs_delayed_ref_head *head;
3954         struct btrfs_delayed_ref_root *delayed_refs;
3955         struct btrfs_delayed_ref_node *ref;
3956         struct rb_node *node;
3957         int ret;
3958
3959         delayed_refs = &trans->transaction->delayed_refs;
3960         spin_lock(&delayed_refs->lock);
3961         head = btrfs_find_delayed_ref_head(trans, bytenr);
3962         if (!head)
3963                 goto out;
3964
3965         node = rb_prev(&head->node.rb_node);
3966         if (!node)
3967                 goto out;
3968
3969         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3970
3971         /* there are still entries for this ref, we can't drop it */
3972         if (ref->bytenr == bytenr)
3973                 goto out;
3974
3975         if (head->extent_op) {
3976                 if (!head->must_insert_reserved)
3977                         goto out;
3978                 kfree(head->extent_op);
3979                 head->extent_op = NULL;
3980         }
3981
3982         /*
3983          * waiting for the lock here would deadlock.  If someone else has it
3984  * locked, they are already in the process of dropping it anyway
3985          */
3986         if (!mutex_trylock(&head->mutex))
3987                 goto out;
3988
3989         /*
3990          * at this point we have a head with no other entries.  Go
3991          * ahead and process it.
3992          */
3993         head->node.in_tree = 0;
3994         rb_erase(&head->node.rb_node, &delayed_refs->root);
3995
3996         delayed_refs->num_entries--;
3997
3998         /*
3999          * we don't take a ref on the node because we're removing it from the
4000          * tree, so we just steal the ref the tree was holding.
4001          */
4002         delayed_refs->num_heads--;
4003         if (list_empty(&head->cluster))
4004                 delayed_refs->num_heads_ready--;
4005
4006         list_del_init(&head->cluster);
4007         spin_unlock(&delayed_refs->lock);
4008
4009         ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
4010                                   &head->node, head->extent_op,
4011                                   head->must_insert_reserved);
4012         BUG_ON(ret);
4013         btrfs_put_delayed_ref(&head->node);
4014         return 0;
4015 out:
4016         spin_unlock(&delayed_refs->lock);
4017         return 0;
4018 }
4019
4020 int btrfs_free_extent(struct btrfs_trans_handle *trans,
4021                       struct btrfs_root *root,
4022                       u64 bytenr, u64 num_bytes, u64 parent,
4023                       u64 root_objectid, u64 owner, u64 offset)
4024 {
4025         int ret;
4026
4027         /*
4028          * tree log blocks never actually go into the extent allocation
4029          * tree, just update pinning info and exit early.
4030          */
4031         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4032                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4033                 /* unlocks the pinned mutex */
4034                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
4035                 ret = 0;
4036         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4037                 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4038                                         parent, root_objectid, (int)owner,
4039                                         BTRFS_DROP_DELAYED_REF, NULL);
4040                 BUG_ON(ret);
4041                 ret = check_ref_cleanup(trans, root, bytenr);
4042                 BUG_ON(ret);
4043         } else {
4044                 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4045                                         parent, root_objectid, owner,
4046                                         offset, BTRFS_DROP_DELAYED_REF, NULL);
4047                 BUG_ON(ret);
4048         }
4049         return ret;
4050 }
4051
4052 int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4053                           struct btrfs_root *root,
4054                           u64 bytenr, u32 blocksize,
4055                           u64 parent, u64 root_objectid, int level)
4056 {
4057         u64 used;
4058         spin_lock(&root->node_lock);
4059         used = btrfs_root_used(&root->root_item) - blocksize;
4060         btrfs_set_root_used(&root->root_item, used);
4061         spin_unlock(&root->node_lock);
4062
4063         return btrfs_free_extent(trans, root, bytenr, blocksize,
4064                                  parent, root_objectid, level, 0);
4065 }
4066
4067 static u64 stripe_align(struct btrfs_root *root, u64 val)
4068 {
4069         u64 mask = ((u64)root->stripesize - 1);
4070         u64 ret = (val + mask) & ~mask;
4071         return ret;
4072 }
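
/*
 * A quick illustration of stripe_align() with made-up numbers: for a 64K
 * stripesize the mask is 0xffff, so val = 100000 rounds up to
 * (100000 + 65535) & ~65535 = 131072, the next 64K boundary.
 */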
4073
4074 /*
4075  * when we wait for progress in the block group caching, it's because
4076  * our allocation attempt failed at least once.  So, we must sleep
4077  * and let some progress happen before we try again.
4078  *
4079  * This function will sleep at least once waiting for new free space to
4080  * show up, and then it will check the block group free space numbers
4081  * for our min num_bytes.  Another option is to have it go ahead
4082  * and look in the rbtree for a free extent of a given size, but this
4083  * is a good start.
4084  */
4085 static noinline int
4086 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4087                                 u64 num_bytes)
4088 {
4089         struct btrfs_caching_control *caching_ctl;
4090         DEFINE_WAIT(wait);
4091
4092         caching_ctl = get_caching_control(cache);
4093         if (!caching_ctl)
4094                 return 0;
4095
4096         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4097                    (cache->free_space >= num_bytes));
4098
4099         put_caching_control(caching_ctl);
4100         return 0;
4101 }
4102
4103 static noinline int
4104 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4105 {
4106         struct btrfs_caching_control *caching_ctl;
4107         DEFINE_WAIT(wait);
4108
4109         caching_ctl = get_caching_control(cache);
4110         if (!caching_ctl)
4111                 return 0;
4112
4113         wait_event(caching_ctl->wait, block_group_cache_done(cache));
4114
4115         put_caching_control(caching_ctl);
4116         return 0;
4117 }
4118
4119 enum btrfs_loop_type {
4120         LOOP_FIND_IDEAL = 0,
4121         LOOP_CACHING_NOWAIT = 1,
4122         LOOP_CACHING_WAIT = 2,
4123         LOOP_ALLOC_CHUNK = 3,
4124         LOOP_NO_EMPTY_SIZE = 4,
4125 };
4126
4127 /*
4128  * walks the btree of allocated extents and finds a hole of a given size.
4129  * The key ins is changed to record the hole:
4130  * ins->objectid == block start
4131  * ins->type == BTRFS_EXTENT_ITEM_KEY
4132  * ins->offset == number of bytes
4133  * Any available blocks before search_start are skipped.
4134  */
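     /*
      * For example (illustrative values only): a successful 4K
      * allocation placed at byte 12582912 comes back with
      *   ins->objectid == 12582912
      *   ins->type     == BTRFS_EXTENT_ITEM_KEY
      *   ins->offset   == 4096
      */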
4135 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4136                                      struct btrfs_root *orig_root,
4137                                      u64 num_bytes, u64 empty_size,
4138                                      u64 search_start, u64 search_end,
4139                                      u64 hint_byte, struct btrfs_key *ins,
4140                                      u64 exclude_start, u64 exclude_nr,
4141                                      int data)
4142 {
4143         int ret = 0;
4144         struct btrfs_root *root = orig_root->fs_info->extent_root;
4145         struct btrfs_free_cluster *last_ptr = NULL;
4146         struct btrfs_block_group_cache *block_group = NULL;
4147         int empty_cluster = 2 * 1024 * 1024;
4148         int allowed_chunk_alloc = 0;
4149         int done_chunk_alloc = 0;
4150         struct btrfs_space_info *space_info;
4151         int last_ptr_loop = 0;
4152         int loop = 0;
4153         bool found_uncached_bg = false;
4154         bool failed_cluster_refill = false;
4155         bool failed_alloc = false;
4156         u64 ideal_cache_percent = 0;
4157         u64 ideal_cache_offset = 0;
4158
4159         WARN_ON(num_bytes < root->sectorsize);
4160         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4161         ins->objectid = 0;
4162         ins->offset = 0;
4163
4164         space_info = __find_space_info(root->fs_info, data);
4165
4166         if (orig_root->ref_cows || empty_size)
4167                 allowed_chunk_alloc = 1;
4168
4169         if (data & BTRFS_BLOCK_GROUP_METADATA) {
4170                 last_ptr = &root->fs_info->meta_alloc_cluster;
4171                 if (!btrfs_test_opt(root, SSD))
4172                         empty_cluster = 64 * 1024;
4173         }
4174
4175         if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
4176                 last_ptr = &root->fs_info->data_alloc_cluster;
4177         }
4178
4179         if (last_ptr) {
4180                 spin_lock(&last_ptr->lock);
4181                 if (last_ptr->block_group)
4182                         hint_byte = last_ptr->window_start;
4183                 spin_unlock(&last_ptr->lock);
4184         }
4185
4186         search_start = max(search_start, first_logical_byte(root, 0));
4187         search_start = max(search_start, hint_byte);
4188
4189         if (!last_ptr)
4190                 empty_cluster = 0;
4191
4192         if (search_start == hint_byte) {
4193 ideal_cache:
4194                 block_group = btrfs_lookup_block_group(root->fs_info,
4195                                                        search_start);
4196                 /*
4197                  * we don't want to use the block group if it doesn't match our
4198                  * allocation bits, or if it's not cached.
4199                  *
4200                  * However if we are re-searching with an ideal block group
4201                  * picked out then we don't care that the block group is cached.
4202                  */
4203                 if (block_group && block_group_bits(block_group, data) &&
4204                     (block_group->cached != BTRFS_CACHE_NO ||
4205                      search_start == ideal_cache_offset)) {
4206                         down_read(&space_info->groups_sem);
4207                         if (list_empty(&block_group->list) ||
4208                             block_group->ro) {
4209                                 /*
4210                                  * someone is removing this block group,
4211                                  * we can't jump into the have_block_group
4212                                  * target because our list pointers are not
4213                                  * valid
4214                                  */
4215                                 btrfs_put_block_group(block_group);
4216                                 up_read(&space_info->groups_sem);
4217                         } else {
4218                                 goto have_block_group;
4219                         }
4220                 } else if (block_group) {
4221                         btrfs_put_block_group(block_group);
4222                 }
4223         }
4224 search:
4225         down_read(&space_info->groups_sem);
4226         list_for_each_entry(block_group, &space_info->block_groups, list) {
4227                 u64 offset;
4228                 int cached;
4229
4230                 atomic_inc(&block_group->count);
4231                 search_start = block_group->key.objectid;
4232
4233 have_block_group:
4234                 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4235                         u64 free_percent;
4236
4237                         free_percent = btrfs_block_group_used(&block_group->item);
4238                         free_percent *= 100;
4239                         free_percent = div64_u64(free_percent,
4240                                                  block_group->key.offset);
4241                         free_percent = 100 - free_percent;
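                             /*
                              * e.g. a 1GB block group with 256MB used
                              * works out to free_percent = 100 - 25 = 75
                              * (illustrative numbers).
                              */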
4242                         if (free_percent > ideal_cache_percent &&
4243                             likely(!block_group->ro)) {
4244                                 ideal_cache_offset = block_group->key.objectid;
4245                                 ideal_cache_percent = free_percent;
4246                         }
4247
4248                         /*
4249                          * We only want to start kthread caching if we are at
4250                          * the point where we will wait for caching to make
4251                          * progress, or if our ideal search is over and we've
4252                          * found somebody to start caching.
4253                          */
4254                         if (loop > LOOP_CACHING_NOWAIT ||
4255                             (loop > LOOP_FIND_IDEAL &&
4256                              atomic_read(&space_info->caching_threads) < 2)) {
4257                                 ret = cache_block_group(block_group);
4258                                 BUG_ON(ret);
4259                         }
4260                         found_uncached_bg = true;
4261
4262                         /*
4263                          * If we are only searching cached block groups
4264                          * on this pass (LOOP_FIND_IDEAL), try the next
4265                          * block group.
4265                          */
4266                         if (loop == LOOP_FIND_IDEAL)
4267                                 goto loop;
4268                 }
4269
4270                 cached = block_group_cache_done(block_group);
4271                 if (unlikely(!cached))
4272                         found_uncached_bg = true;
4273
4274                 if (unlikely(block_group->ro))
4275                         goto loop;
4276
4277                 /*
4278                  * OK, we want to try to use the cluster allocator, so
4279                  * let's look there, unless we are on LOOP_NO_EMPTY_SIZE.
4280                  * By that point we will have tried the cluster allocator
4281                  * plenty of times and found nothing, so we are likely far
4282                  * too fragmented for the clustering code to find anything.
4283                  * Just skip it and let the allocator find whatever block
4284                  * it can find.
4285                  */
4286                 if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
4287                         /*
4288                          * the refill lock keeps out other
4289                          * people trying to start a new cluster
4290                          */
4291                         spin_lock(&last_ptr->refill_lock);
4292                         if (last_ptr->block_group &&
4293                             (last_ptr->block_group->ro ||
4294                             !block_group_bits(last_ptr->block_group, data))) {
4295                                 offset = 0;
4296                                 goto refill_cluster;
4297                         }
4298
4299                         offset = btrfs_alloc_from_cluster(block_group, last_ptr,
4300                                                  num_bytes, search_start);
4301                         if (offset) {
4302                                 /* we have a block, we're done */
4303                                 spin_unlock(&last_ptr->refill_lock);
4304                                 goto checks;
4305                         }
4306
4307                         spin_lock(&last_ptr->lock);
4308                         /*
4309                          * whoops, this cluster doesn't actually point to
4310                          * this block group.  Get a ref on the block
4311                          * group it does point to and try again
4312                          */
4313                         if (!last_ptr_loop && last_ptr->block_group &&
4314                             last_ptr->block_group != block_group) {
4315
4316                                 btrfs_put_block_group(block_group);
4317                                 block_group = last_ptr->block_group;
4318                                 atomic_inc(&block_group->count);
4319                                 spin_unlock(&last_ptr->lock);
4320                                 spin_unlock(&last_ptr->refill_lock);
4321
4322                                 last_ptr_loop = 1;
4323                                 search_start = block_group->key.objectid;
4324                                 /*
4325                                  * we know this block group is properly
4326                                  * in the list because
4327                                  * btrfs_remove_block_group drops the
4328                                  * cluster before it removes the block
4329                                  * group from the list
4330                                  */
4331                                 goto have_block_group;
4332                         }
4333                         spin_unlock(&last_ptr->lock);
4334 refill_cluster:
4335                         /*
4336                          * this cluster didn't work out, free it and
4337                          * start over
4338                          */
4339                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4340
4341                         last_ptr_loop = 0;
4342
4343                         /* allocate a cluster in this block group */
4344                         ret = btrfs_find_space_cluster(trans, root,
4345                                                block_group, last_ptr,
4346                                                offset, num_bytes,
4347                                                empty_cluster + empty_size);
4348                         if (ret == 0) {
4349                                 /*
4350                                  * now pull our allocation out of this
4351                                  * cluster
4352                                  */
4353                                 offset = btrfs_alloc_from_cluster(block_group,
4354                                                   last_ptr, num_bytes,
4355                                                   search_start);
4356                                 if (offset) {
4357                                         /* we found one, proceed */
4358                                         spin_unlock(&last_ptr->refill_lock);
4359                                         goto checks;
4360                                 }
4361                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
4362                                    && !failed_cluster_refill) {
4363                                 spin_unlock(&last_ptr->refill_lock);
4364
4365                                 failed_cluster_refill = true;
4366                                 wait_block_group_cache_progress(block_group,
4367                                        num_bytes + empty_cluster + empty_size);
4368                                 goto have_block_group;
4369                         }
4370
4371                         /*
4372                          * at this point we either didn't find a cluster
4373                          * or we weren't able to allocate a block from our
4374                          * cluster.  Free the cluster we've been trying
4375                          * to use, and go to the next block group
4376                          */
4377                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
4378                         spin_unlock(&last_ptr->refill_lock);
4379                         goto loop;
4380                 }
4381
4382                 offset = btrfs_find_space_for_alloc(block_group, search_start,
4383                                                     num_bytes, empty_size);
4384                 /*
4385                  * If we didn't find a chunk, and we haven't failed on this
4386                  * block group before, and this block group is in the middle of
4387                  * caching and we are ok with waiting, then go ahead and wait
4388                  * for progress to be made, and set failed_alloc to true.
4389                  *
4390                  * If failed_alloc is true then we've already waited on this
4391                  * block group once and should move on to the next block group.
4392                  */
4393                 if (!offset && !failed_alloc && !cached &&
4394                     loop > LOOP_CACHING_NOWAIT) {
4395                         wait_block_group_cache_progress(block_group,
4396                                                 num_bytes + empty_size);
4397                         failed_alloc = true;
4398                         goto have_block_group;
4399                 } else if (!offset) {
4400                         goto loop;
4401                 }
4402 checks:
4403                 search_start = stripe_align(root, offset);
4404                 /* move on to the next group */
4405                 if (search_start + num_bytes >= search_end) {
4406                         btrfs_add_free_space(block_group, offset, num_bytes);
4407                         goto loop;
4408                 }
4409
4410                 /* move on to the next group */
4411                 if (search_start + num_bytes >
4412                     block_group->key.objectid + block_group->key.offset) {
4413                         btrfs_add_free_space(block_group, offset, num_bytes);
4414                         goto loop;
4415                 }
4416
4417                 if (exclude_nr > 0 &&
4418                     (search_start + num_bytes > exclude_start &&
4419                      search_start < exclude_start + exclude_nr)) {
4420                         search_start = exclude_start + exclude_nr;
4421
4422                         btrfs_add_free_space(block_group, offset, num_bytes);
4423                         /*
4424                          * if search_start is still in this block group
4425                          * then we just re-search this block group
4426                          */
4427                         if (search_start >= block_group->key.objectid &&
4428                             search_start < (block_group->key.objectid +
4429                                             block_group->key.offset))
4430                                 goto have_block_group;
4431                         goto loop;
4432                 }
4433
4434                 ins->objectid = search_start;
4435                 ins->offset = num_bytes;
4436
4437                 if (offset < search_start)
4438                         btrfs_add_free_space(block_group, offset,
4439                                              search_start - offset);
4440                 BUG_ON(offset > search_start);
4441
4442                 update_reserved_extents(block_group, num_bytes, 1);
4443
4444                 /* we are all good, let's return */
4445                 break;
4446 loop:
4447                 failed_cluster_refill = false;
4448                 failed_alloc = false;
4449                 btrfs_put_block_group(block_group);
4450         }
4451         up_read(&space_info->groups_sem);
4452
4453         /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
4454          *                      for them to make caching progress.  Also
4455          *                      determine the best possible bg to cache
4456          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4457          *                      caching kthreads as we move along
4458          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4459          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4460          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4461          *                      again
4462          */
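             /*
              * A rough illustration of the escalation: FIND_IDEAL
              * records the emptiest candidate to cache, CACHING_NOWAIT
              * kicks off caching kthreads, CACHING_WAIT sleeps for
              * caching progress, ALLOC_CHUNK forces a new chunk, and
              * NO_EMPTY_SIZE retries with empty_size/empty_cluster
              * zeroed before we finally give up with -ENOSPC.
              */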
4463         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4464             (found_uncached_bg || empty_size || empty_cluster ||
4465              allowed_chunk_alloc)) {
4466                 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4467                         found_uncached_bg = false;
4468                         loop++;
4469                         if (!ideal_cache_percent &&
4470                             atomic_read(&space_info->caching_threads))
4471                                 goto search;
4472
4473                         /*
4474                          * One of the following two things has happened so far:
4475                          *
4476                          * 1) We found an ideal block group for caching that
4477                          * is mostly full and will cache quickly, so we might
4478                          * as well wait for it.
4479                          *
4480                          * 2) We searched only cached block groups and didn't find
4481                          * anything, and we didn't start any caching kthreads
4482                          * either, so chances are we will loop through and
4483                          * start a couple caching kthreads, and then come back
4484                          * around and just wait for them.  This will be slower
4485                          * because we will have 2 caching kthreads reading at
4486                          * the same time when we could have just started one
4487                          * and waited for it to get far enough to give us an
4488                          * allocation, so go ahead and go to the wait caching
4489                          * loop.
4490                          */
4491                         loop = LOOP_CACHING_WAIT;
4492                         search_start = ideal_cache_offset;
4493                         ideal_cache_percent = 0;
4494                         goto ideal_cache;
4495                 } else if (loop == LOOP_FIND_IDEAL) {
4496                         /*
4497                          * Didn't find an uncached bg, wait on anything we find
4498                          * next.
4499                          */
4500                         loop = LOOP_CACHING_WAIT;
4501                         goto search;
4502                 }
4503
4504                 if (loop < LOOP_CACHING_WAIT) {
4505                         loop++;
4506                         goto search;
4507                 }
4508
4509                 if (loop == LOOP_ALLOC_CHUNK) {
4510                         empty_size = 0;
4511                         empty_cluster = 0;
4512                 }
4513
4514                 if (allowed_chunk_alloc) {
4515                         ret = do_chunk_alloc(trans, root, num_bytes +
4516                                              2 * 1024 * 1024, data, 1);
4517                         allowed_chunk_alloc = 0;
4518                         done_chunk_alloc = 1;
4519                 } else if (!done_chunk_alloc) {
4520                         space_info->force_alloc = 1;
4521                 }
4522
4523                 if (loop < LOOP_NO_EMPTY_SIZE) {
4524                         loop++;
4525                         goto search;
4526                 }
4527                 ret = -ENOSPC;
4528         } else if (!ins->objectid) {
4529                 ret = -ENOSPC;
4530         }
4531
4532         /* we found what we needed */
4533         if (ins->objectid) {
4534                 if (!(data & BTRFS_BLOCK_GROUP_DATA))
4535                         trans->block_group = block_group->key.objectid;
4536
4537                 btrfs_put_block_group(block_group);
4538                 ret = 0;
4539         }
4540
4541         return ret;
4542 }
4543
4544 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4545                             int dump_block_groups)
4546 {
4547         struct btrfs_block_group_cache *cache;
4548
4549         spin_lock(&info->lock);
4550         printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4551                (unsigned long long)(info->total_bytes - info->bytes_used -
4552                                     info->bytes_pinned - info->bytes_reserved -
4553                                     info->bytes_super),
4554                (info->full) ? "" : "not ");
4555         printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4556                " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4557                "\n",
4558                (unsigned long long)info->total_bytes,
4559                (unsigned long long)info->bytes_pinned,
4560                (unsigned long long)info->bytes_delalloc,
4561                (unsigned long long)info->bytes_may_use,
4562                (unsigned long long)info->bytes_used,
4563                (unsigned long long)info->bytes_root,
4564                (unsigned long long)info->bytes_super,
4565                (unsigned long long)info->bytes_reserved);
4566         spin_unlock(&info->lock);
4567
4568         if (!dump_block_groups)
4569                 return;
4570
4571         down_read(&info->groups_sem);
4572         list_for_each_entry(cache, &info->block_groups, list) {
4573                 spin_lock(&cache->lock);
4574                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4575                        "%llu pinned %llu reserved\n",
4576                        (unsigned long long)cache->key.objectid,
4577                        (unsigned long long)cache->key.offset,
4578                        (unsigned long long)btrfs_block_group_used(&cache->item),
4579                        (unsigned long long)cache->pinned,
4580                        (unsigned long long)cache->reserved);
4581                 btrfs_dump_free_space(cache, bytes);
4582                 spin_unlock(&cache->lock);
4583         }
4584         up_read(&info->groups_sem);
4585 }
4586
4587 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4588                          struct btrfs_root *root,
4589                          u64 num_bytes, u64 min_alloc_size,
4590                          u64 empty_size, u64 hint_byte,
4591                          u64 search_end, struct btrfs_key *ins,
4592                          u64 data)
4593 {
4594         int ret;
4595         u64 search_start = 0;
4596         struct btrfs_fs_info *info = root->fs_info;
4597
4598         data = btrfs_get_alloc_profile(root, data);
4599 again:
4600         /*
4601          * the only place that sets empty_size is btrfs_realloc_node, which
4602          * is not called recursively on allocations
4603          */
4604         if (empty_size || root->ref_cows) {
4605                 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
4606                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4607                                      2 * 1024 * 1024,
4608                                      BTRFS_BLOCK_GROUP_METADATA |
4609                                      (info->metadata_alloc_profile &
4610                                       info->avail_metadata_alloc_bits), 0);
4611                 }
4612                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4613                                      num_bytes + 2 * 1024 * 1024, data, 0);
4614         }
4615
4616         WARN_ON(num_bytes < root->sectorsize);
4617         ret = find_free_extent(trans, root, num_bytes, empty_size,
4618                                search_start, search_end, hint_byte, ins,
4619                                trans->alloc_exclude_start,
4620                                trans->alloc_exclude_nr, data);
4621
4622         if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4623                 num_bytes = num_bytes >> 1;
4624                 num_bytes = num_bytes & ~(root->sectorsize - 1);
4625                 num_bytes = max(num_bytes, min_alloc_size);
4626                 do_chunk_alloc(trans, root->fs_info->extent_root,
4627                                num_bytes, data, 1);
4628                 goto again;
4629         }
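         /*
          * Illustrative retry sequence: with min_alloc_size == 256K and
          * an initial num_bytes == 1M, repeated -ENOSPC shrinks the
          * request 1M -> 512K -> 256K (rounded down to a sector
          * boundary each time); once num_bytes is no longer greater
          * than min_alloc_size we fall through and report the failure
          * below.
          */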
4630         if (ret == -ENOSPC) {
4631                 struct btrfs_space_info *sinfo;
4632
4633                 sinfo = __find_space_info(root->fs_info, data);
4634                 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4635                        "wanted %llu\n", (unsigned long long)data,
4636                        (unsigned long long)num_bytes);
4637                 dump_space_info(sinfo, num_bytes, 1);
4638         }
4639
4640         return ret;
4641 }
4642
4643 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4644 {
4645         struct btrfs_block_group_cache *cache;
4646         int ret = 0;
4647
4648         cache = btrfs_lookup_block_group(root->fs_info, start);
4649         if (!cache) {
4650                 printk(KERN_ERR "Unable to find block group for %llu\n",
4651                        (unsigned long long)start);
4652                 return -ENOSPC;
4653         }
4654
4655         ret = btrfs_discard_extent(root, start, len);
4656
4657         btrfs_add_free_space(cache, start, len);
4658         update_reserved_extents(cache, len, 0);
4659         btrfs_put_block_group(cache);
4660
4661         return ret;
4662 }
4663
4664 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4665                                       struct btrfs_root *root,
4666                                       u64 parent, u64 root_objectid,
4667                                       u64 flags, u64 owner, u64 offset,
4668                                       struct btrfs_key *ins, int ref_mod)
4669 {
4670         int ret;
4671         struct btrfs_fs_info *fs_info = root->fs_info;
4672         struct btrfs_extent_item *extent_item;
4673         struct btrfs_extent_inline_ref *iref;
4674         struct btrfs_path *path;
4675         struct extent_buffer *leaf;
4676         int type;
4677         u32 size;
4678
4679         if (parent > 0)
4680                 type = BTRFS_SHARED_DATA_REF_KEY;
4681         else
4682                 type = BTRFS_EXTENT_DATA_REF_KEY;
4683
4684         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
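         /*
          * The item body built below is laid out as a
          * struct btrfs_extent_item followed by one inline ref: for a
          * shared ref the inline ref offset holds the parent and a
          * btrfs_shared_data_ref count follows, otherwise a full
          * btrfs_extent_data_ref (root/objectid/offset/count) overlays
          * the inline ref's offset field.
          */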
4685
4686         path = btrfs_alloc_path();
4687         BUG_ON(!path);
4688
4689         path->leave_spinning = 1;
4690         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4691                                       ins, size);
4692         BUG_ON(ret);
4693
4694         leaf = path->nodes[0];
4695         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4696                                      struct btrfs_extent_item);
4697         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4698         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4699         btrfs_set_extent_flags(leaf, extent_item,
4700                                flags | BTRFS_EXTENT_FLAG_DATA);
4701
4702         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4703         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4704         if (parent > 0) {
4705                 struct btrfs_shared_data_ref *ref;
4706                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4707                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4708                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4709         } else {
4710                 struct btrfs_extent_data_ref *ref;
4711                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4712                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4713                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4714                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4715                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4716         }
4717
4718         btrfs_mark_buffer_dirty(path->nodes[0]);
4719         btrfs_free_path(path);
4720
4721         ret = update_block_group(trans, root, ins->objectid, ins->offset,
4722                                  1, 0);
4723         if (ret) {
4724                 printk(KERN_ERR "btrfs update block group failed for %llu "
4725                        "%llu\n", (unsigned long long)ins->objectid,
4726                        (unsigned long long)ins->offset);
4727                 BUG();
4728         }
4729         return ret;
4730 }
4731
4732 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4733                                      struct btrfs_root *root,
4734                                      u64 parent, u64 root_objectid,
4735                                      u64 flags, struct btrfs_disk_key *key,
4736                                      int level, struct btrfs_key *ins)
4737 {
4738         int ret;
4739         struct btrfs_fs_info *fs_info = root->fs_info;
4740         struct btrfs_extent_item *extent_item;
4741         struct btrfs_tree_block_info *block_info;
4742         struct btrfs_extent_inline_ref *iref;
4743         struct btrfs_path *path;
4744         struct extent_buffer *leaf;
4745         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
4746
4747         path = btrfs_alloc_path();
4748         BUG_ON(!path);
4749
4750         path->leave_spinning = 1;
4751         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4752                                       ins, size);
4753         BUG_ON(ret);
4754
4755         leaf = path->nodes[0];
4756         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4757                                      struct btrfs_extent_item);
4758         btrfs_set_extent_refs(leaf, extent_item, 1);
4759         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4760         btrfs_set_extent_flags(leaf, extent_item,
4761                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4762         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4763
4764         btrfs_set_tree_block_key(leaf, block_info, key);
4765         btrfs_set_tree_block_level(leaf, block_info, level);
4766
4767         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4768         if (parent > 0) {
4769                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4770                 btrfs_set_extent_inline_ref_type(leaf, iref,
4771                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4772                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4773         } else {
4774                 btrfs_set_extent_inline_ref_type(leaf, iref,
4775                                                  BTRFS_TREE_BLOCK_REF_KEY);
4776                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
4777         }
4778
4779         btrfs_mark_buffer_dirty(leaf);
4780         btrfs_free_path(path);
4781
4782         ret = update_block_group(trans, root, ins->objectid, ins->offset,
4783                                  1, 0);
4784         if (ret) {
4785                 printk(KERN_ERR "btrfs update block group failed for %llu "
4786                        "%llu\n", (unsigned long long)ins->objectid,
4787                        (unsigned long long)ins->offset);
4788                 BUG();
4789         }
4790         return ret;
4791 }
4792
4793 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4794                                      struct btrfs_root *root,
4795                                      u64 root_objectid, u64 owner,
4796                                      u64 offset, struct btrfs_key *ins)
4797 {
4798         int ret;
4799
4800         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
4801
4802         ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
4803                                          0, root_objectid, owner, offset,
4804                                          BTRFS_ADD_DELAYED_EXTENT, NULL);
4805         return ret;
4806 }
4807
4808 /*
4809  * this is used by the tree logging recovery code.  It records that
4810  * an extent has been allocated and makes sure to clear the free
4811  * space cache bits as well
4812  */
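     /*
      * The straddle case handled below, illustratively: if caching
      * progress sits at byte P inside [start, start + num_bytes), the
      * portion below P is removed from the free space cache directly,
      * and the portion from P upward is marked excluded so the caching
      * thread skips it.
      */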
4813 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4814                                    struct btrfs_root *root,
4815                                    u64 root_objectid, u64 owner, u64 offset,
4816                                    struct btrfs_key *ins)
4817 {
4818         int ret;
4819         struct btrfs_block_group_cache *block_group;
4820         struct btrfs_caching_control *caching_ctl;
4821         u64 start = ins->objectid;
4822         u64 num_bytes = ins->offset;
4823
4824         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4825         cache_block_group(block_group);
4826         caching_ctl = get_caching_control(block_group);
4827
4828         if (!caching_ctl) {
4829                 BUG_ON(!block_group_cache_done(block_group));
4830                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
4831                 BUG_ON(ret);
4832         } else {
4833                 mutex_lock(&caching_ctl->mutex);
4834
4835                 if (start >= caching_ctl->progress) {
4836                         ret = add_excluded_extent(root, start, num_bytes);
4837                         BUG_ON(ret);
4838                 } else if (start + num_bytes <= caching_ctl->progress) {
4839                         ret = btrfs_remove_free_space(block_group,
4840                                                       start, num_bytes);
4841                         BUG_ON(ret);
4842                 } else {
4843                         num_bytes = caching_ctl->progress - start;
4844                         ret = btrfs_remove_free_space(block_group,
4845                                                       start, num_bytes);
4846                         BUG_ON(ret);
4847
4848                         start = caching_ctl->progress;
4849                         num_bytes = ins->objectid + ins->offset -
4850                                     caching_ctl->progress;
4851                         ret = add_excluded_extent(root, start, num_bytes);
4852                         BUG_ON(ret);
4853                 }
4854
4855                 mutex_unlock(&caching_ctl->mutex);
4856                 put_caching_control(caching_ctl);
4857         }
4858
4859         update_reserved_extents(block_group, ins->offset, 1);
4860         btrfs_put_block_group(block_group);
4861         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4862                                          0, owner, offset, ins, 1);
4863         return ret;
4864 }
4865
4866 /*
4867  * finds a free extent and does all the dirty work required for allocation
4868  * returns the key for the extent through ins, and a tree buffer for
4869  * the first block of the extent through buf.
4870  *
4871  * returns 0 if everything worked, non-zero otherwise.
4872  */
4873 static int alloc_tree_block(struct btrfs_trans_handle *trans,
4874                             struct btrfs_root *root,
4875                             u64 num_bytes, u64 parent, u64 root_objectid,
4876                             struct btrfs_disk_key *key, int level,
4877                             u64 empty_size, u64 hint_byte, u64 search_end,
4878                             struct btrfs_key *ins)
4879 {
4880         int ret;
4881         u64 flags = 0;
4882
4883         ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4884                                    empty_size, hint_byte, search_end,
4885                                    ins, 0);
4886         if (ret)
4887                 return ret;
4888
4889         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4890                 if (parent == 0)
4891                         parent = ins->objectid;
4892                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4893         } else
4894                 BUG_ON(parent > 0);
4895
4896         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4897                 struct btrfs_delayed_extent_op *extent_op;
4898                 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4899                 BUG_ON(!extent_op);
4900                 if (key)
4901                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4902                 else
4903                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4904                 extent_op->flags_to_set = flags;
4905                 extent_op->update_key = 1;
4906                 extent_op->update_flags = 1;
4907                 extent_op->is_data = 0;
4908
4909                 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4910                                         ins->offset, parent, root_objectid,
4911                                         level, BTRFS_ADD_DELAYED_EXTENT,
4912                                         extent_op);
4913                 BUG_ON(ret);
4914         }
4915
4916         if (root_objectid == root->root_key.objectid) {
4917                 u64 used;
4918                 spin_lock(&root->node_lock);
4919                 used = btrfs_root_used(&root->root_item) + num_bytes;
4920                 btrfs_set_root_used(&root->root_item, used);
4921                 spin_unlock(&root->node_lock);
4922         }
4923         return ret;
4924 }
4925
4926 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4927                                             struct btrfs_root *root,
4928                                             u64 bytenr, u32 blocksize,
4929                                             int level)
4930 {
4931         struct extent_buffer *buf;
4932
4933         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
4934         if (!buf)
4935                 return ERR_PTR(-ENOMEM);
4936         btrfs_set_header_generation(buf, trans->transid);
4937         btrfs_set_buffer_lockdep_class(buf, level);
4938         btrfs_tree_lock(buf);
4939         clean_tree_block(trans, root, buf);
4940
4941         btrfs_set_lock_blocking(buf);
4942         btrfs_set_buffer_uptodate(buf);
4943
4944         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4945                 /*
4946                  * we allow two log transactions at a time, so use a
4947                  * different EXTENT bit to differentiate their dirty pages.
4948                  */
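                 /*
                  * e.g. (illustrative transids) log_transid 8 marks its
                  * pages with the EXTENT_DIRTY bit while log_transid 9
                  * uses the EXTENT_NEW bit, so two in-flight log
                  * commits never see each other's pages.
                  */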
4949                 if (root->log_transid % 2 == 0)
4950                         set_extent_dirty(&root->dirty_log_pages, buf->start,
4951                                         buf->start + buf->len - 1, GFP_NOFS);
4952                 else
4953                         set_extent_new(&root->dirty_log_pages, buf->start,
4954                                         buf->start + buf->len - 1, GFP_NOFS);
4955         } else {
4956                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4957                          buf->start + buf->len - 1, GFP_NOFS);
4958         }
4959         trans->blocks_used++;
4960         /* this returns a buffer locked for blocking */
4961         return buf;
4962 }
4963
4964 /*
4965  * helper function to allocate a block for a given tree
4966  * returns the tree buffer or NULL.
4967  */
4968 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4969                                         struct btrfs_root *root, u32 blocksize,
4970                                         u64 parent, u64 root_objectid,
4971                                         struct btrfs_disk_key *key, int level,
4972                                         u64 hint, u64 empty_size)
4973 {
4974         struct btrfs_key ins;
4975         int ret;
4976         struct extent_buffer *buf;
4977
4978         ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
4979                                key, level, empty_size, hint, (u64)-1, &ins);
4980         if (ret) {
4981                 BUG_ON(ret > 0);
4982                 return ERR_PTR(ret);
4983         }
4984
4985         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4986                                     blocksize, level);
4987         return buf;
4988 }
4989
4990 struct walk_control {
4991         u64 refs[BTRFS_MAX_LEVEL];
4992         u64 flags[BTRFS_MAX_LEVEL];
4993         struct btrfs_key update_progress;
4994         int stage;
4995         int level;
4996         int shared_level;
4997         int update_ref;
4998         int keep_locks;
4999         int reada_slot;
5000         int reada_count;
5001 };
5002
5003 #define DROP_REFERENCE  1
5004 #define UPDATE_BACKREF  2
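
     /*
      * Rough lifecycle of the two stages when dropping a snapshotted
      * subvolume (a sketch of the flow below, not extra semantics):
      * the walk starts in DROP_REFERENCE; when do_walk_down() meets a
      * shared block whose subtree still needs back ref updates (and
      * wc->update_ref is set), it switches to UPDATE_BACKREF for that
      * subtree, and walk_up_proc() flips back to DROP_REFERENCE once
      * the shared level has been processed.
      */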
5005
5006 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5007                                      struct btrfs_root *root,
5008                                      struct walk_control *wc,
5009                                      struct btrfs_path *path)
5010 {
5011         u64 bytenr;
5012         u64 generation;
5013         u64 refs;
5014         u64 flags;
5015         u64 last = 0;
5016         u32 nritems;
5017         u32 blocksize;
5018         struct btrfs_key key;
5019         struct extent_buffer *eb;
5020         int ret;
5021         int slot;
5022         int nread = 0;
5023
5024         if (path->slots[wc->level] < wc->reada_slot) {
5025                 wc->reada_count = wc->reada_count * 2 / 3;
5026                 wc->reada_count = max(wc->reada_count, 2);
5027         } else {
5028                 wc->reada_count = wc->reada_count * 3 / 2;
5029                 wc->reada_count = min_t(int, wc->reada_count,
5030                                         BTRFS_NODEPTRS_PER_BLOCK(root));
5031         }
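         /*
          * e.g. starting from reada_count == 2, consecutive in-order
          * passes grow it 2 -> 3 -> 4 (times 3/2, integer math), capped
          * at the per-block node pointer count; an out-of-order pass
          * shrinks it by a third, floored at 2 (illustrative values).
          */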
5032
5033         eb = path->nodes[wc->level];
5034         nritems = btrfs_header_nritems(eb);
5035         blocksize = btrfs_level_size(root, wc->level - 1);
5036
5037         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5038                 if (nread >= wc->reada_count)
5039                         break;
5040
5041                 cond_resched();
5042                 bytenr = btrfs_node_blockptr(eb, slot);
5043                 generation = btrfs_node_ptr_generation(eb, slot);
5044
5045                 if (slot == path->slots[wc->level])
5046                         goto reada;
5047
5048                 if (wc->stage == UPDATE_BACKREF &&
5049                     generation <= root->root_key.offset)
5050                         continue;
5051
5052                 /* We don't lock the tree block, it's OK to be racy here */
5053                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5054                                                &refs, &flags);
5055                 BUG_ON(ret);
5056                 BUG_ON(refs == 0);
5057
5058                 if (wc->stage == DROP_REFERENCE) {
5059                         if (refs == 1)
5060                                 goto reada;
5061
5062                         if (wc->level == 1 &&
5063                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5064                                 continue;
5065                         if (!wc->update_ref ||
5066                             generation <= root->root_key.offset)
5067                                 continue;
5068                         btrfs_node_key_to_cpu(eb, &key, slot);
5069                         ret = btrfs_comp_cpu_keys(&key,
5070                                                   &wc->update_progress);
5071                         if (ret < 0)
5072                                 continue;
5073                 } else {
5074                         if (wc->level == 1 &&
5075                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5076                                 continue;
5077                 }
5078 reada:
5079                 ret = readahead_tree_block(root, bytenr, blocksize,
5080                                            generation);
5081                 if (ret)
5082                         break;
5083                 last = bytenr + blocksize;
5084                 nread++;
5085         }
5086         wc->reada_slot = slot;
5087 }
5088
5089 /*
5090  * helper to process a tree block while walking down the tree.
5091  *
5092  * when wc->stage == UPDATE_BACKREF, this function updates
5093  * back refs for pointers in the block.
5094  *
5095  * NOTE: return value 1 means we should stop walking down.
5096  */
5097 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5098                                    struct btrfs_root *root,
5099                                    struct btrfs_path *path,
5100                                    struct walk_control *wc, int lookup_info)
5101 {
5102         int level = wc->level;
5103         struct extent_buffer *eb = path->nodes[level];
5104         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5105         int ret;
5106
5107         if (wc->stage == UPDATE_BACKREF &&
5108             btrfs_header_owner(eb) != root->root_key.objectid)
5109                 return 1;
5110
5111         /*
5112          * when the reference count of a tree block is 1, it won't increase
5113          * again.  Once the full backref flag is set, we never clear it.
5114          */
5115         if (lookup_info &&
5116             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5117              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5118                 BUG_ON(!path->locks[level]);
5119                 ret = btrfs_lookup_extent_info(trans, root,
5120                                                eb->start, eb->len,
5121                                                &wc->refs[level],
5122                                                &wc->flags[level]);
5123                 BUG_ON(ret);
5124                 BUG_ON(wc->refs[level] == 0);
5125         }
5126
5127         if (wc->stage == DROP_REFERENCE) {
5128                 if (wc->refs[level] > 1)
5129                         return 1;
5130
5131                 if (path->locks[level] && !wc->keep_locks) {
5132                         btrfs_tree_unlock(eb);
5133                         path->locks[level] = 0;
5134                 }
5135                 return 0;
5136         }
5137
5138         /* wc->stage == UPDATE_BACKREF */
5139         if (!(wc->flags[level] & flag)) {
5140                 BUG_ON(!path->locks[level]);
5141                 ret = btrfs_inc_ref(trans, root, eb, 1);
5142                 BUG_ON(ret);
5143                 ret = btrfs_dec_ref(trans, root, eb, 0);
5144                 BUG_ON(ret);
5145                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5146                                                   eb->len, flag, 0);
5147                 BUG_ON(ret);
5148                 wc->flags[level] |= flag;
5149         }
5150
5151         /*
5152          * the block is shared by multiple trees, so it's not good to
5153          * keep the tree lock
5154          */
5155         if (path->locks[level] && level > 0) {
5156                 btrfs_tree_unlock(eb);
5157                 path->locks[level] = 0;
5158         }
5159         return 0;
5160 }
5161
5162 /*
5163  * helper to process a tree block pointer.
5164  *
5165  * when wc->stage == DROP_REFERENCE, this function checks
5166  * reference count of the block pointed to. if the block
5167  * is shared and we need to update back refs for the subtree
5168  * rooted at the block, this function changes wc->stage to
5169  * UPDATE_BACKREF. if the block is shared and there is no
5170  * need to update back refs, this function drops the reference
5171  * to the block.
5172  *
5173  * NOTE: return value 1 means we should stop walking down.
5174  */
5175 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5176                                  struct btrfs_root *root,
5177                                  struct btrfs_path *path,
5178                                  struct walk_control *wc, int *lookup_info)
5179 {
5180         u64 bytenr;
5181         u64 generation;
5182         u64 parent;
5183         u32 blocksize;
5184         struct btrfs_key key;
5185         struct extent_buffer *next;
5186         int level = wc->level;
5187         int reada = 0;
5188         int ret = 0;
5189
5190         generation = btrfs_node_ptr_generation(path->nodes[level],
5191                                                path->slots[level]);
5192         /*
5193          * if the lower level block was created before the snapshot
5194          * was created, we know there is no need to update back refs
5195          * for the subtree
5196          */
5197         if (wc->stage == UPDATE_BACKREF &&
5198             generation <= root->root_key.offset) {
5199                 *lookup_info = 1;
5200                 return 1;
5201         }
5202
5203         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5204         blocksize = btrfs_level_size(root, level - 1);
5205
5206         next = btrfs_find_tree_block(root, bytenr, blocksize);
5207         if (!next) {
5208                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5209                 reada = 1;
5210         }
5211         btrfs_tree_lock(next);
5212         btrfs_set_lock_blocking(next);
5213
5214         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5215                                        &wc->refs[level - 1],
5216                                        &wc->flags[level - 1]);
5217         BUG_ON(ret);
5218         BUG_ON(wc->refs[level - 1] == 0);
5219         *lookup_info = 0;
5220
5221         if (wc->stage == DROP_REFERENCE) {
5222                 if (wc->refs[level - 1] > 1) {
5223                         if (level == 1 &&
5224                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5225                                 goto skip;
5226
5227                         if (!wc->update_ref ||
5228                             generation <= root->root_key.offset)
5229                                 goto skip;
5230
5231                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5232                                               path->slots[level]);
5233                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5234                         if (ret < 0)
5235                                 goto skip;
5236
5237                         wc->stage = UPDATE_BACKREF;
5238                         wc->shared_level = level - 1;
5239                 }
5240         } else {
5241                 if (level == 1 &&
5242                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5243                         goto skip;
5244         }
5245
5246         if (!btrfs_buffer_uptodate(next, generation)) {
5247                 btrfs_tree_unlock(next);
5248                 free_extent_buffer(next);
5249                 next = NULL;
5250                 *lookup_info = 1;
5251         }
5252
5253         if (!next) {
5254                 if (reada && level == 1)
5255                         reada_walk_down(trans, root, wc, path);
5256                 next = read_tree_block(root, bytenr, blocksize, generation);
5257                 btrfs_tree_lock(next);
5258                 btrfs_set_lock_blocking(next);
5259         }
5260
5261         level--;
5262         BUG_ON(level != btrfs_header_level(next));
5263         path->nodes[level] = next;
5264         path->slots[level] = 0;
5265         path->locks[level] = 1;
5266         wc->level = level;
5267         if (wc->level == 1)
5268                 wc->reada_slot = 0;
5269         return 0;
5270 skip:
5271         wc->refs[level - 1] = 0;
5272         wc->flags[level - 1] = 0;
5273         if (wc->stage == DROP_REFERENCE) {
5274                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5275                         parent = path->nodes[level]->start;
5276                 } else {
5277                         BUG_ON(root->root_key.objectid !=
5278                                btrfs_header_owner(path->nodes[level]));
5279                         parent = 0;
5280                 }
5281
5282                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5283                                         root->root_key.objectid, level - 1, 0);
5284                 BUG_ON(ret);
5285         }
5286         btrfs_tree_unlock(next);
5287         free_extent_buffer(next);
5288         *lookup_info = 1;
5289         return 1;
5290 }
5291
5292 /*
5293  * helper to process a tree block while walking up the tree.
5294  *
5295  * when wc->stage == DROP_REFERENCE, this function drops
5296  * reference count on the block.
5297  *
5298  * when wc->stage == UPDATE_BACKREF, this function changes
5299  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5300  * to UPDATE_BACKREF previously while processing the block.
5301  *
5302  * NOTE: return value 1 means we should stop walking up.
5303  */
5304 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5305                                  struct btrfs_root *root,
5306                                  struct btrfs_path *path,
5307                                  struct walk_control *wc)
5308 {
5309         int ret = 0;
5310         int level = wc->level;
5311         struct extent_buffer *eb = path->nodes[level];
5312         u64 parent = 0;
5313
5314         if (wc->stage == UPDATE_BACKREF) {
5315                 BUG_ON(wc->shared_level < level);
5316                 if (level < wc->shared_level)
5317                         goto out;
5318
5319                 ret = find_next_key(path, level + 1, &wc->update_progress);
5320                 if (ret > 0)
5321                         wc->update_ref = 0;
5322
5323                 wc->stage = DROP_REFERENCE;
5324                 wc->shared_level = -1;
5325                 path->slots[level] = 0;
5326
5327                 /*
5328                  * check reference count again if the block isn't locked.
5329                  * we should start walking down the tree again if reference
5330                  * count is one.
5331                  */
5332                 if (!path->locks[level]) {
5333                         BUG_ON(level == 0);
5334                         btrfs_tree_lock(eb);
5335                         btrfs_set_lock_blocking(eb);
5336                         path->locks[level] = 1;
5337
5338                         ret = btrfs_lookup_extent_info(trans, root,
5339                                                        eb->start, eb->len,
5340                                                        &wc->refs[level],
5341                                                        &wc->flags[level]);
5342                         BUG_ON(ret);
5343                         BUG_ON(wc->refs[level] == 0);
5344                         if (wc->refs[level] == 1) {
5345                                 btrfs_tree_unlock(eb);
5346                                 path->locks[level] = 0;
5347                                 return 1;
5348                         }
5349                 }
5350         }
5351
5352         /* wc->stage == DROP_REFERENCE */
5353         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5354
5355         if (wc->refs[level] == 1) {
5356                 if (level == 0) {
5357                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5358                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5359                         else
5360                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5361                         BUG_ON(ret);
5362                 }
5363                 /* make the block-locked assertion in clean_tree_block happy */
5364                 if (!path->locks[level] &&
5365                     btrfs_header_generation(eb) == trans->transid) {
5366                         btrfs_tree_lock(eb);
5367                         btrfs_set_lock_blocking(eb);
5368                         path->locks[level] = 1;
5369                 }
5370                 clean_tree_block(trans, root, eb);
5371         }
5372
5373         if (eb == root->node) {
5374                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5375                         parent = eb->start;
5376                 else
5377                         BUG_ON(root->root_key.objectid !=
5378                                btrfs_header_owner(eb));
5379         } else {
5380                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5381                         parent = path->nodes[level + 1]->start;
5382                 else
5383                         BUG_ON(root->root_key.objectid !=
5384                                btrfs_header_owner(path->nodes[level + 1]));
5385         }
5386
5387         ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
5388                                 root->root_key.objectid, level, 0);
5389         BUG_ON(ret);
5390 out:
5391         wc->refs[level] = 0;
5392         wc->flags[level] = 0;
5393         return ret;
5394 }
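/*
 * A minimal sketch of how walk_up_tree() below consumes the return
 * value of walk_up_proc():
 *
 *	ret = walk_up_proc(trans, root, path, wc);
 *	if (ret > 0)
 *		return 0;	// stop climbing, resume walking down here
 *
 * the only ret == 1 case is the UPDATE_BACKREF re-check above finding
 * a reference count of one, at which point wc->stage has already been
 * switched back to DROP_REFERENCE and the subtree can be dropped.
 */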
5395
5396 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5397                                    struct btrfs_root *root,
5398                                    struct btrfs_path *path,
5399                                    struct walk_control *wc)
5400 {
5401         int level = wc->level;
5402         int lookup_info = 1;
5403         int ret;
5404
5405         while (level >= 0) {
5406                 if (path->slots[level] >=
5407                     btrfs_header_nritems(path->nodes[level]))
5408                         break;
5409
5410                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5411                 if (ret > 0)
5412                         break;
5413
5414                 if (level == 0)
5415                         break;
5416
5417                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5418                 if (ret > 0) {
5419                         path->slots[level]++;
5420                         continue;
5421                 }
5422                 level = wc->level;
5423         }
5424         return 0;
5425 }
5426
5427 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5428                                  struct btrfs_root *root,
5429                                  struct btrfs_path *path,
5430                                  struct walk_control *wc, int max_level)
5431 {
5432         int level = wc->level;
5433         int ret;
5434
5435         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5436         while (level < max_level && path->nodes[level]) {
5437                 wc->level = level;
5438                 if (path->slots[level] + 1 <
5439                     btrfs_header_nritems(path->nodes[level])) {
5440                         path->slots[level]++;
5441                         return 0;
5442                 } else {
5443                         ret = walk_up_proc(trans, root, path, wc);
5444                         if (ret > 0)
5445                                 return 0;
5446
5447                         if (path->locks[level]) {
5448                                 btrfs_tree_unlock(path->nodes[level]);
5449                                 path->locks[level] = 0;
5450                         }
5451                         free_extent_buffer(path->nodes[level]);
5452                         path->nodes[level] = NULL;
5453                         level++;
5454                 }
5455         }
5456         return 1;
5457 }
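/*
 * A sketch of how the two walkers cooperate in btrfs_drop_snapshot()
 * below:
 *
 *	while (1) {
 *		walk_down_tree(trans, root, path, wc);
 *		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
 *		if (ret > 0)
 *			break;	// climbed past the root, tree fully dropped
 *	}
 *
 * a return of 0 from walk_up_tree() means it advanced to the next slot
 * at some level and the loop descends again from there.
 */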
5458
5459 /*
5460  * drop a subvolume tree.
5461  *
5462  * this function traverses the tree freeing any blocks that are only
5463  * referenced by the tree.
5464  *
5465  * when a shared tree block is found, this function decreases its
5466  * reference count by one. if update_ref is true, this function
5467  * also makes sure backrefs for the shared block and all lower level
5468  * blocks are properly updated.
5469  */
5470 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5471 {
5472         struct btrfs_path *path;
5473         struct btrfs_trans_handle *trans;
5474         struct btrfs_root *tree_root = root->fs_info->tree_root;
5475         struct btrfs_root_item *root_item = &root->root_item;
5476         struct walk_control *wc;
5477         struct btrfs_key key;
5478         int err = 0;
5479         int ret;
5480         int level;
5481
5482         path = btrfs_alloc_path();
5483         BUG_ON(!path);
5484
5485         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5486         BUG_ON(!wc);
5487
5488         trans = btrfs_start_transaction(tree_root, 1);
5489
5490         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5491                 level = btrfs_header_level(root->node);
5492                 path->nodes[level] = btrfs_lock_root_node(root);
5493                 btrfs_set_lock_blocking(path->nodes[level]);
5494                 path->slots[level] = 0;
5495                 path->locks[level] = 1;
5496                 memset(&wc->update_progress, 0,
5497                        sizeof(wc->update_progress));
5498         } else {
5499                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5500                 memcpy(&wc->update_progress, &key,
5501                        sizeof(wc->update_progress));
5502
5503                 level = root_item->drop_level;
5504                 BUG_ON(level == 0);
5505                 path->lowest_level = level;
5506                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5507                 path->lowest_level = 0;
5508                 if (ret < 0) {
5509                         err = ret;
5510                         goto out;
5511                 }
5512                 WARN_ON(ret > 0);
5513
5514                 /*
5515                  * unlock our path; this is safe because only this
5516                  * function is allowed to delete this snapshot
5517                  */
5518                 btrfs_unlock_up_safe(path, 0);
5519
5520                 level = btrfs_header_level(root->node);
5521                 while (1) {
5522                         btrfs_tree_lock(path->nodes[level]);
5523                         btrfs_set_lock_blocking(path->nodes[level]);
5524
5525                         ret = btrfs_lookup_extent_info(trans, root,
5526                                                 path->nodes[level]->start,
5527                                                 path->nodes[level]->len,
5528                                                 &wc->refs[level],
5529                                                 &wc->flags[level]);
5530                         BUG_ON(ret);
5531                         BUG_ON(wc->refs[level] == 0);
5532
5533                         if (level == root_item->drop_level)
5534                                 break;
5535
5536                         btrfs_tree_unlock(path->nodes[level]);
5537                         WARN_ON(wc->refs[level] != 1);
5538                         level--;
5539                 }
5540         }
5541
5542         wc->level = level;
5543         wc->shared_level = -1;
5544         wc->stage = DROP_REFERENCE;
5545         wc->update_ref = update_ref;
5546         wc->keep_locks = 0;
5547         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5548
5549         while (1) {
5550                 ret = walk_down_tree(trans, root, path, wc);
5551                 if (ret < 0) {
5552                         err = ret;
5553                         break;
5554                 }
5555
5556                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5557                 if (ret < 0) {
5558                         err = ret;
5559                         break;
5560                 }
5561
5562                 if (ret > 0) {
5563                         BUG_ON(wc->stage != DROP_REFERENCE);
5564                         break;
5565                 }
5566
5567                 if (wc->stage == DROP_REFERENCE) {
5568                         level = wc->level;
5569                         btrfs_node_key(path->nodes[level],
5570                                        &root_item->drop_progress,
5571                                        path->slots[level]);
5572                         root_item->drop_level = level;
5573                 }
5574
5575                 BUG_ON(wc->level == 0);
5576                 if (trans->transaction->in_commit ||
5577                     trans->transaction->delayed_refs.flushing) {
5578                         ret = btrfs_update_root(trans, tree_root,
5579                                                 &root->root_key,
5580                                                 root_item);
5581                         BUG_ON(ret);
5582
5583                         btrfs_end_transaction(trans, tree_root);
5584                         trans = btrfs_start_transaction(tree_root, 1);
5585                 } else {
5586                         unsigned long update;
5587                         update = trans->delayed_ref_updates;
5588                         trans->delayed_ref_updates = 0;
5589                         if (update)
5590                                 btrfs_run_delayed_refs(trans, tree_root,
5591                                                        update);
5592                 }
5593         }
5594         btrfs_release_path(root, path);
5595         BUG_ON(err);
5596
5597         ret = btrfs_del_root(trans, tree_root, &root->root_key);
5598         BUG_ON(ret);
5599
5600         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5601                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
5602                                            NULL, NULL);
5603                 BUG_ON(ret < 0);
5604                 if (ret > 0) {
5605                         ret = btrfs_del_orphan_item(trans, tree_root,
5606                                                     root->root_key.objectid);
5607                         BUG_ON(ret);
5608                 }
5609         }
5610
5611         if (root->in_radix) {
5612                 btrfs_free_fs_root(tree_root->fs_info, root);
5613         } else {
5614                 free_extent_buffer(root->node);
5615                 free_extent_buffer(root->commit_root);
5616                 kfree(root);
5617         }
5618 out:
5619         btrfs_end_transaction(trans, tree_root);
5620         kfree(wc);
5621         btrfs_free_path(path);
5622         return err;
5623 }
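/*
 * Usage sketch, assuming a hypothetical caller variable 'dead_root'
 * for a snapshot that has already been unlinked:
 *
 *	err = btrfs_drop_snapshot(dead_root, 0);
 *
 * update_ref == 0 is enough when shared blocks need no backref
 * conversion; passing 1 additionally rewrites backrefs for the shared
 * block and everything below it, per the comment above the function.
 */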
5624
5625 /*
5626  * drop subtree rooted at tree block 'node'.
5627  *
5628  * NOTE: this function will unlock and release tree block 'node'
5629  */
5630 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5631                         struct btrfs_root *root,
5632                         struct extent_buffer *node,
5633                         struct extent_buffer *parent)
5634 {
5635         struct btrfs_path *path;
5636         struct walk_control *wc;
5637         int level;
5638         int parent_level;
5639         int ret = 0;
5640         int wret;
5641
5642         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5643
5644         path = btrfs_alloc_path();
5645         BUG_ON(!path);
5646
5647         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5648         BUG_ON(!wc);
5649
5650         btrfs_assert_tree_locked(parent);
5651         parent_level = btrfs_header_level(parent);
5652         extent_buffer_get(parent);
5653         path->nodes[parent_level] = parent;
5654         path->slots[parent_level] = btrfs_header_nritems(parent);
5655
5656         btrfs_assert_tree_locked(node);
5657         level = btrfs_header_level(node);
5658         path->nodes[level] = node;
5659         path->slots[level] = 0;
5660         path->locks[level] = 1;
5661
5662         wc->refs[parent_level] = 1;
5663         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5664         wc->level = level;
5665         wc->shared_level = -1;
5666         wc->stage = DROP_REFERENCE;
5667         wc->update_ref = 0;
5668         wc->keep_locks = 1;
5669         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
5670
5671         while (1) {
5672                 wret = walk_down_tree(trans, root, path, wc);
5673                 if (wret < 0) {
5674                         ret = wret;
5675                         break;
5676                 }
5677
5678                 wret = walk_up_tree(trans, root, path, wc, parent_level);
5679                 if (wret < 0)
5680                         ret = wret;
5681                 if (wret != 0)
5682                         break;
5683         }
5684
5685         kfree(wc);
5686         btrfs_free_path(path);
5687         return ret;
5688 }
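/*
 * Note on the setup above: unlike btrfs_drop_snapshot(), this helper
 * runs only the DROP_REFERENCE stage (update_ref == 0), treats 'parent'
 * as carrying a full backref, and keeps locks while walking
 * (keep_locks == 1). Because walk_up_tree() is bounded by parent_level,
 *
 *	walk_up_tree(trans, root, path, wc, parent_level);
 *
 * only the subtree rooted at 'node' is freed and 'parent' itself is
 * left in place.
 */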
5689
5690 #if 0
5691 static unsigned long calc_ra(unsigned long start, unsigned long last,
5692                              unsigned long nr)
5693 {
5694         return min(last, start + nr - 1);
5695 }
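/*
 * Worked example: with start == 10, last == 100 and a readahead window
 * of nr == 32 pages, calc_ra() returns min(100, 10 + 32 - 1) == 41,
 * i.e. the window is clamped so it never reads past 'last'.
 */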
5696
5697 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
5698                                          u64 len)
5699 {
5700         u64 page_start;
5701         u64 page_end;
5702         unsigned long first_index;
5703         unsigned long last_index;
5704         unsigned long i;
5705         struct page *page;
5706         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5707         struct file_ra_state *ra;
5708         struct btrfs_ordered_extent *ordered;
5709         unsigned int total_read = 0;
5710         unsigned int total_dirty = 0;
5711         int ret = 0;
5712
5713         ra = kzalloc(sizeof(*ra), GFP_NOFS);
             if (!ra)
                     return -ENOMEM;
5714
5715         mutex_lock(&inode->i_mutex);
5716         first_index = start >> PAGE_CACHE_SHIFT;
5717         last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
5718
5719         /* make sure the dirty trick played by the caller works */
5720         ret = invalidate_inode_pages2_range(inode->i_mapping,
5721                                             first_index, last_index);
5722         if (ret)
5723                 goto out_unlock;
5724
5725         file_ra_state_init(ra, inode->i_mapping);
5726
5727         for (i = first_index ; i <= last_index; i++) {
5728                 if (total_read % ra->ra_pages == 0) {
5729                         btrfs_force_ra(inode->i_mapping, ra, NULL, i,
5730                                        calc_ra(i, last_index, ra->ra_pages));
5731                 }
5732                 total_read++;
5733 again:
5734                 BUG_ON(((u64)i << PAGE_CACHE_SHIFT) >
5735                        i_size_read(inode));
5736                 page = grab_cache_page(inode->i_mapping, i);
5737                 if (!page) {
5738                         ret = -ENOMEM;
5739                         goto out_unlock;
5740                 }
5741                 if (!PageUptodate(page)) {
5742                         btrfs_readpage(NULL, page);
5743                         lock_page(page);
5744                         if (!PageUptodate(page)) {
5745                                 unlock_page(page);
5746                                 page_cache_release(page);
5747                                 ret = -EIO;
5748                                 goto out_unlock;
5749                         }
5750                 }
5751                 wait_on_page_writeback(page);
5752
5753                 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
5754                 page_end = page_start + PAGE_CACHE_SIZE - 1;
5755                 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
5756
5757                 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5758                 if (ordered) {
5759                         unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5760                         unlock_page(page);
5761                         page_cache_release(page);
5762                         btrfs_start_ordered_extent(inode, ordered, 1);
5763                         btrfs_put_ordered_extent(ordered);
5764                         goto again;
5765                 }
5766                 set_page_extent_mapped(page);
5767
5768                 if (i == first_index)
5769                         set_extent_bits(io_tree, page_start, page_end,
5770                                         EXTENT_BOUNDARY, GFP_NOFS);
5771                 btrfs_set_extent_delalloc(inode, page_start, page_end);
5772
5773                 set_page_dirty(page);
5774                 total_dirty++;
5775
5776                 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5777                 unlock_page(page);
5778                 page_cache_release(page);
5779         }
5780
5781 out_unlock:
5782         kfree(ra);
5783         mutex_unlock(&inode->i_mutex);
5784         balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
5785         return ret;
5786 }
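/*
 * The only visible user is relocate_data_extent() below: it first pins
 * an extent map at the old disk location so btrfs_readpage() pulls the
 * data in from there, then calls
 *
 *	relocate_inode_pages(reloc_inode, start, extent_key->offset);
 *
 * which re-dirties each page, so ordinary writeback copies the data
 * out to the relocation inode's newly allocated space.
 */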
5787
5788 static noinline int relocate_data_extent(struct inode *reloc_inode,
5789                                          struct btrfs_key *extent_key,
5790                                          u64 offset)
5791 {
5792         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
5793         struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
5794         struct extent_map *em;
5795         u64 start = extent_key->objectid - offset;
5796         u64 end = start + extent_key->offset - 1;
5797
5798         em = alloc_extent_map(GFP_NOFS);
5799         BUG_ON(!em || IS_ERR(em));
5800
5801         em->start = start;
5802         em->len = extent_key->offset;
5803         em->block_len = extent_key->offset;
5804         em->block_start = extent_key->objectid;
5805         em->bdev = root->fs_info->fs_devices->latest_bdev;
5806         set_bit(EXTENT_FLAG_PINNED, &em->flags);
5807
5808         /* set up the extent map to cheat btrfs_readpage */
5809         lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5810         while (1) {
5811                 int ret;
5812                 write_lock(&em_tree->lock);
5813                 ret = add_extent_mapping(em_tree, em);
5814                 write_unlock(&em_tree->lock);
5815                 if (ret != -EEXIST) {
5816                         free_extent_map(em);
5817                         break;
5818                 }
5819                 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
5820         }
5821         unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
5822
5823         return relocate_inode_pages(reloc_inode, start, extent_key->offset);
5824 }
5825
5826 struct btrfs_ref_path {
5827         u64 extent_start;
5828         u64 nodes[BTRFS_MAX_LEVEL];
5829         u64 root_objectid;
5830         u64 root_generation;
5831         u64 owner_objectid;
5832         u32 num_refs;
5833         int lowest_level;
5834         int current_level;
5835         int shared_level;
5836
5837         struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
5838         u64 new_nodes[BTRFS_MAX_LEVEL];
5839 };
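/*
 * Rough shape of a filled-in ref path for a data extent referenced
 * through a three-level tree (a sketch; levels below lowest_level are
 * unused):
 *
 *	extent_start = bytenr of the data extent (lowest_level == -1)
 *	nodes[0]     = bytenr of the leaf holding the file extent item
 *	nodes[1..2]  = bytenrs of the interior nodes above the leaf
 *	root_objectid / root_generation describe the tree root reached
 *	at the top of the walk
 */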
5840
5841 struct disk_extent {
5842         u64 ram_bytes;
5843         u64 disk_bytenr;
5844         u64 disk_num_bytes;
5845         u64 offset;
5846         u64 num_bytes;
5847         u8 compression;
5848         u8 encryption;
5849         u16 other_encoding;
5850 };
5851
5852 static int is_cowonly_root(u64 root_objectid)
5853 {
5854         if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
5855             root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
5856             root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
5857             root_objectid == BTRFS_DEV_TREE_OBJECTID ||
5858             root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5859             root_objectid == BTRFS_CSUM_TREE_OBJECTID)
5860                 return 1;
5861         return 0;
5862 }
5863
5864 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
5865                                     struct btrfs_root *extent_root,
5866                                     struct btrfs_ref_path *ref_path,
5867                                     int first_time)
5868 {
5869         struct extent_buffer *leaf;
5870         struct btrfs_path *path;
5871         struct btrfs_extent_ref *ref;
5872         struct btrfs_key key;
5873         struct btrfs_key found_key;
5874         u64 bytenr;
5875         u32 nritems;
5876         int level;
5877         int ret = 1;
5878
5879         path = btrfs_alloc_path();
5880         if (!path)
5881                 return -ENOMEM;
5882
5883         if (first_time) {
5884                 ref_path->lowest_level = -1;
5885                 ref_path->current_level = -1;
5886                 ref_path->shared_level = -1;
5887                 goto walk_up;
5888         }
5889 walk_down:
5890         level = ref_path->current_level - 1;
5891         while (level >= -1) {
5892                 u64 parent;
5893                 if (level < ref_path->lowest_level)
5894                         break;
5895
5896                 if (level >= 0)
5897                         bytenr = ref_path->nodes[level];
5898                 else
5899                         bytenr = ref_path->extent_start;
5900                 BUG_ON(bytenr == 0);
5901
5902                 parent = ref_path->nodes[level + 1];
5903                 ref_path->nodes[level + 1] = 0;
5904                 ref_path->current_level = level;
5905                 BUG_ON(parent == 0);
5906
5907                 key.objectid = bytenr;
5908                 key.offset = parent + 1;
5909                 key.type = BTRFS_EXTENT_REF_KEY;
5910
5911                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5912                 if (ret < 0)
5913                         goto out;
5914                 BUG_ON(ret == 0);
5915
5916                 leaf = path->nodes[0];
5917                 nritems = btrfs_header_nritems(leaf);
5918                 if (path->slots[0] >= nritems) {
5919                         ret = btrfs_next_leaf(extent_root, path);
5920                         if (ret < 0)
5921                                 goto out;
5922                         if (ret > 0)
5923                                 goto next;
5924                         leaf = path->nodes[0];
5925                 }
5926
5927                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5928                 if (found_key.objectid == bytenr &&
5929                     found_key.type == BTRFS_EXTENT_REF_KEY) {
5930                         if (level < ref_path->shared_level)
5931                                 ref_path->shared_level = level;
5932                         goto found;
5933                 }
5934 next:
5935                 level--;
5936                 btrfs_release_path(extent_root, path);
5937                 cond_resched();
5938         }
5939         /* reached lowest level */
5940         ret = 1;
5941         goto out;
5942 walk_up:
5943         level = ref_path->current_level;
5944         while (level < BTRFS_MAX_LEVEL - 1) {
5945                 u64 ref_objectid;
5946
5947                 if (level >= 0)
5948                         bytenr = ref_path->nodes[level];
5949                 else
5950                         bytenr = ref_path->extent_start;
5951
5952                 BUG_ON(bytenr == 0);
5953
5954                 key.objectid = bytenr;
5955                 key.offset = 0;
5956                 key.type = BTRFS_EXTENT_REF_KEY;
5957
5958                 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
5959                 if (ret < 0)
5960                         goto out;
5961
5962                 leaf = path->nodes[0];
5963                 nritems = btrfs_header_nritems(leaf);
5964                 if (path->slots[0] >= nritems) {
5965                         ret = btrfs_next_leaf(extent_root, path);
5966                         if (ret < 0)
5967                                 goto out;
5968                         if (ret > 0) {
5969                                 /* the extent was freed by someone */
5970                                 if (ref_path->lowest_level == level)
5971                                         goto out;
5972                                 btrfs_release_path(extent_root, path);
5973                                 goto walk_down;
5974                         }
5975                         leaf = path->nodes[0];
5976                 }
5977
5978                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5979                 if (found_key.objectid != bytenr ||
5980                                 found_key.type != BTRFS_EXTENT_REF_KEY) {
5981                         /* the extent was freed by someone */
5982                         if (ref_path->lowest_level == level) {
5983                                 ret = 1;
5984                                 goto out;
5985                         }
5986                         btrfs_release_path(extent_root, path);
5987                         goto walk_down;
5988                 }
5989 found:
5990                 ref = btrfs_item_ptr(leaf, path->slots[0],
5991                                 struct btrfs_extent_ref);
5992                 ref_objectid = btrfs_ref_objectid(leaf, ref);
5993                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5994                         if (first_time) {
5995                                 level = (int)ref_objectid;
5996                                 BUG_ON(level >= BTRFS_MAX_LEVEL);
5997                                 ref_path->lowest_level = level;
5998                                 ref_path->current_level = level;
5999                                 ref_path->nodes[level] = bytenr;
6000                         } else {
6001                                 WARN_ON(ref_objectid != level);
6002                         }
6003                 } else {
6004                         WARN_ON(level != -1);
6005                 }
6006                 first_time = 0;
6007
6008                 if (ref_path->lowest_level == level) {
6009                         ref_path->owner_objectid = ref_objectid;
6010                         ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6011                 }
6012
6013                 /*
6014                  * the block is a tree root or the block isn't in a
6015                  * reference counted tree.
6016                  */
6017                 if (found_key.objectid == found_key.offset ||
6018                     is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6019                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6020                         ref_path->root_generation =
6021                                 btrfs_ref_generation(leaf, ref);
6022                         if (level < 0) {
6023                                 /* special reference from the tree log */
6024                                 ref_path->nodes[0] = found_key.offset;
6025                                 ref_path->current_level = 0;
6026                         }
6027                         ret = 0;
6028                         goto out;
6029                 }
6030
6031                 level++;
6032                 BUG_ON(ref_path->nodes[level] != 0);
6033                 ref_path->nodes[level] = found_key.offset;
6034                 ref_path->current_level = level;
6035
6036                 /*
6037                  * the reference was created in the running transaction,
6038                  * no need to continue walking up.
6039                  */
6040                 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6041                         ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6042                         ref_path->root_generation =
6043                                 btrfs_ref_generation(leaf, ref);
6044                         ret = 0;
6045                         goto out;
6046                 }
6047
6048                 btrfs_release_path(extent_root, path);
6049                 cond_resched();
6050         }
6051         /* reached max tree level, but no tree root found. */
6052         BUG();
6053 out:
6054         btrfs_free_path(path);
6055         return ret;
6056 }
6057
6058 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6059                                 struct btrfs_root *extent_root,
6060                                 struct btrfs_ref_path *ref_path,
6061                                 u64 extent_start)
6062 {
6063         memset(ref_path, 0, sizeof(*ref_path));
6064         ref_path->extent_start = extent_start;
6065
6066         return __next_ref_path(trans, extent_root, ref_path, 1);
6067 }
6068
6069 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6070                                struct btrfs_root *extent_root,
6071                                struct btrfs_ref_path *ref_path)
6072 {
6073         return __next_ref_path(trans, extent_root, ref_path, 0);
6074 }
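/*
 * Usage sketch (no caller is visible in this part of the file): the
 * pair forms an iterator over all paths referencing an extent:
 *
 *	ret = btrfs_first_ref_path(trans, extent_root, ref_path, bytenr);
 *	while (ret == 0) {
 *		// ref_path now describes one referencing root
 *		ret = btrfs_next_ref_path(trans, extent_root, ref_path);
 *	}
 *	// ret == 1: no more references, ret < 0: error
 */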
6075
6076 static noinline int get_new_locations(struct inode *reloc_inode,
6077                                       struct btrfs_key *extent_key,
6078                                       u64 offset, int no_fragment,
6079                                       struct disk_extent **extents,
6080                                       int *nr_extents)
6081 {
6082         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6083         struct btrfs_path *path;
6084         struct btrfs_file_extent_item *fi;
6085         struct extent_buffer *leaf;
6086         struct disk_extent *exts = *extents;
6087         struct btrfs_key found_key;
6088         u64 cur_pos;
6089         u64 last_byte;
6090         u32 nritems;
6091         int nr = 0;
6092         int max = *nr_extents;
6093         int ret;
6094
6095         WARN_ON(!no_fragment && *extents);
6096         if (!exts) {
6097                 max = 1;
6098                 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6099                 if (!exts)
6100                         return -ENOMEM;
6101         }
6102
6103         path = btrfs_alloc_path();
6104         BUG_ON(!path);
6105
6106         cur_pos = extent_key->objectid - offset;
6107         last_byte = extent_key->objectid + extent_key->offset;
6108         ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6109                                        cur_pos, 0);
6110         if (ret < 0)
6111                 goto out;
6112         if (ret > 0) {
6113                 ret = -ENOENT;
6114                 goto out;
6115         }
6116
6117         while (1) {
6118                 leaf = path->nodes[0];
6119                 nritems = btrfs_header_nritems(leaf);
6120                 if (path->slots[0] >= nritems) {
6121                         ret = btrfs_next_leaf(root, path);
6122                         if (ret < 0)
6123                                 goto out;
6124                         if (ret > 0)
6125                                 break;
6126                         leaf = path->nodes[0];
6127                 }
6128
6129                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6130                 if (found_key.offset != cur_pos ||
6131                     found_key.type != BTRFS_EXTENT_DATA_KEY ||
6132                     found_key.objectid != reloc_inode->i_ino)
6133                         break;
6134
6135                 fi = btrfs_item_ptr(leaf, path->slots[0],
6136                                     struct btrfs_file_extent_item);
6137                 if (btrfs_file_extent_type(leaf, fi) !=
6138                     BTRFS_FILE_EXTENT_REG ||
6139                     btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6140                         break;
6141
6142                 if (nr == max) {
6143                         struct disk_extent *old = exts;
6144                         max *= 2;
6145                         exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
                             if (!exts) {
                                     if (old != *extents)
                                             kfree(old);
                                     ret = -ENOMEM;
                                     goto out;
                             }
6146                         memcpy(exts, old, sizeof(*exts) * nr);
6147                         if (old != *extents)
6148                                 kfree(old);
6149                 }
6150
6151                 exts[nr].disk_bytenr =
6152                         btrfs_file_extent_disk_bytenr(leaf, fi);
6153                 exts[nr].disk_num_bytes =
6154                         btrfs_file_extent_disk_num_bytes(leaf, fi);
6155                 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6156                 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6157                 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6158                 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6159                 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6160                 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6161                                                                            fi);
6162                 BUG_ON(exts[nr].offset > 0);
6163                 BUG_ON(exts[nr].compression || exts[nr].encryption);
6164                 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6165
6166                 cur_pos += exts[nr].num_bytes;
6167                 nr++;
6168
6169                 if (cur_pos + offset >= last_byte)
6170                         break;
6171
6172                 if (no_fragment) {
6173                         ret = 1;
6174                         goto out;
6175                 }
6176                 path->slots[0]++;
6177         }
6178
6179         BUG_ON(cur_pos + offset > last_byte);
6180         if (cur_pos + offset < last_byte) {
6181                 ret = -ENOENT;
6182                 goto out;
6183         }
6184         ret = 0;
6185 out:
6186         btrfs_free_path(path);
6187         if (ret) {
6188                 if (exts != *extents)
6189                         kfree(exts);
6190         } else {
6191                 *extents = exts;
6192                 *nr_extents = nr;
6193         }
6194         return ret;
6195 }
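/*
 * Calling pattern, as used by replace_extents_in_leaf() later in this
 * file:
 *
 *	nr_extent = 1;
 *	ret = get_new_locations(reloc_inode, &extent_key,
 *				group->key.objectid, 1,
 *				&new_extent, &nr_extent);
 *
 * with no_fragment == 1 a return value of 1 means the relocated copy
 * is split across more than one extent and the caller skips the
 * in-place pointer update.
 */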
6196
6197 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6198                                         struct btrfs_root *root,
6199                                         struct btrfs_path *path,
6200                                         struct btrfs_key *extent_key,
6201                                         struct btrfs_key *leaf_key,
6202                                         struct btrfs_ref_path *ref_path,
6203                                         struct disk_extent *new_extents,
6204                                         int nr_extents)
6205 {
6206         struct extent_buffer *leaf;
6207         struct btrfs_file_extent_item *fi;
6208         struct inode *inode = NULL;
6209         struct btrfs_key key;
6210         u64 lock_start = 0;
6211         u64 lock_end = 0;
6212         u64 num_bytes;
6213         u64 ext_offset;
6214         u64 search_end = (u64)-1;
6215         u32 nritems;
6216         int nr_scanned = 0;
6217         int extent_locked = 0;
6218         int extent_type;
6219         int ret;
6220
6221         memcpy(&key, leaf_key, sizeof(key));
6222         if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6223                 if (key.objectid < ref_path->owner_objectid ||
6224                     (key.objectid == ref_path->owner_objectid &&
6225                      key.type < BTRFS_EXTENT_DATA_KEY)) {
6226                         key.objectid = ref_path->owner_objectid;
6227                         key.type = BTRFS_EXTENT_DATA_KEY;
6228                         key.offset = 0;
6229                 }
6230         }
6231
6232         while (1) {
6233                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6234                 if (ret < 0)
6235                         goto out;
6236
6237                 leaf = path->nodes[0];
6238                 nritems = btrfs_header_nritems(leaf);
6239 next:
6240                 if (extent_locked && ret > 0) {
6241                         /*
6242                          * the file extent item was modified by someone
6243                          * before the extent got locked.
6244                          */
6245                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6246                                       lock_end, GFP_NOFS);
6247                         extent_locked = 0;
6248                 }
6249
6250                 if (path->slots[0] >= nritems) {
6251                         if (++nr_scanned > 2)
6252                                 break;
6253
6254                         BUG_ON(extent_locked);
6255                         ret = btrfs_next_leaf(root, path);
6256                         if (ret < 0)
6257                                 goto out;
6258                         if (ret > 0)
6259                                 break;
6260                         leaf = path->nodes[0];
6261                         nritems = btrfs_header_nritems(leaf);
6262                 }
6263
6264                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6265
6266                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6267                         if ((key.objectid > ref_path->owner_objectid) ||
6268                             (key.objectid == ref_path->owner_objectid &&
6269                              key.type > BTRFS_EXTENT_DATA_KEY) ||
6270                             key.offset >= search_end)
6271                                 break;
6272                 }
6273
6274                 if (inode && key.objectid != inode->i_ino) {
6275                         BUG_ON(extent_locked);
6276                         btrfs_release_path(root, path);
6277                         mutex_unlock(&inode->i_mutex);
6278                         iput(inode);
6279                         inode = NULL;
6280                         continue;
6281                 }
6282
6283                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6284                         path->slots[0]++;
6285                         ret = 1;
6286                         goto next;
6287                 }
6288                 fi = btrfs_item_ptr(leaf, path->slots[0],
6289                                     struct btrfs_file_extent_item);
6290                 extent_type = btrfs_file_extent_type(leaf, fi);
6291                 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6292                      extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
6293                     (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6294                      extent_key->objectid)) {
6295                         path->slots[0]++;
6296                         ret = 1;
6297                         goto next;
6298                 }
6299
6300                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6301                 ext_offset = btrfs_file_extent_offset(leaf, fi);
6302
6303                 if (search_end == (u64)-1) {
6304                         search_end = key.offset - ext_offset +
6305                                 btrfs_file_extent_ram_bytes(leaf, fi);
6306                 }
6307
6308                 if (!extent_locked) {
6309                         lock_start = key.offset;
6310                         lock_end = lock_start + num_bytes - 1;
6311                 } else {
6312                         if (lock_start > key.offset ||
6313                             lock_end + 1 < key.offset + num_bytes) {
6314                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6315                                               lock_start, lock_end, GFP_NOFS);
6316                                 extent_locked = 0;
6317                         }
6318                 }
6319
6320                 if (!inode) {
6321                         btrfs_release_path(root, path);
6322
6323                         inode = btrfs_iget_locked(root->fs_info->sb,
6324                                                   key.objectid, root);
6325                         if (inode->i_state & I_NEW) {
6326                                 BTRFS_I(inode)->root = root;
6327                                 BTRFS_I(inode)->location.objectid =
6328                                         key.objectid;
6329                                 BTRFS_I(inode)->location.type =
6330                                         BTRFS_INODE_ITEM_KEY;
6331                                 BTRFS_I(inode)->location.offset = 0;
6332                                 btrfs_read_locked_inode(inode);
6333                                 unlock_new_inode(inode);
6334                         }
6335                         /*
6336                          * some code calls btrfs_commit_transaction while
6337                          * holding the i_mutex, so we can't use mutex_lock
6338                          * here.
6339                          */
6340                         if (is_bad_inode(inode) ||
6341                             !mutex_trylock(&inode->i_mutex)) {
6342                                 iput(inode);
6343                                 inode = NULL;
6344                                 key.offset = (u64)-1;
6345                                 goto skip;
6346                         }
6347                 }
6348
6349                 if (!extent_locked) {
6350                         struct btrfs_ordered_extent *ordered;
6351
6352                         btrfs_release_path(root, path);
6353
6354                         lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6355                                     lock_end, GFP_NOFS);
6356                         ordered = btrfs_lookup_first_ordered_extent(inode,
6357                                                                     lock_end);
6358                         if (ordered &&
6359                             ordered->file_offset <= lock_end &&
6360                             ordered->file_offset + ordered->len > lock_start) {
6361                                 unlock_extent(&BTRFS_I(inode)->io_tree,
6362                                               lock_start, lock_end, GFP_NOFS);
6363                                 btrfs_start_ordered_extent(inode, ordered, 1);
6364                                 btrfs_put_ordered_extent(ordered);
6365                                 key.offset += num_bytes;
6366                                 goto skip;
6367                         }
6368                         if (ordered)
6369                                 btrfs_put_ordered_extent(ordered);
6370
6371                         extent_locked = 1;
6372                         continue;
6373                 }
6374
6375                 if (nr_extents == 1) {
6376                         /* update extent pointer in place */
6377                         btrfs_set_file_extent_disk_bytenr(leaf, fi,
6378                                                 new_extents[0].disk_bytenr);
6379                         btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6380                                                 new_extents[0].disk_num_bytes);
6381                         btrfs_mark_buffer_dirty(leaf);
6382
6383                         btrfs_drop_extent_cache(inode, key.offset,
6384                                                 key.offset + num_bytes - 1, 0);
6385
6386                         ret = btrfs_inc_extent_ref(trans, root,
6387                                                 new_extents[0].disk_bytenr,
6388                                                 new_extents[0].disk_num_bytes,
6389                                                 leaf->start,
6390                                                 root->root_key.objectid,
6391                                                 trans->transid,
6392                                                 key.objectid);
6393                         BUG_ON(ret);
6394
6395                         ret = btrfs_free_extent(trans, root,
6396                                                 extent_key->objectid,
6397                                                 extent_key->offset,
6398                                                 leaf->start,
6399                                                 btrfs_header_owner(leaf),
6400                                                 btrfs_header_generation(leaf),
6401                                                 key.objectid, 0);
6402                         BUG_ON(ret);
6403
6404                         btrfs_release_path(root, path);
6405                         key.offset += num_bytes;
6406                 } else {
6407                         BUG_ON(1);
6408 #if 0
6409                         u64 alloc_hint;
6410                         u64 extent_len;
6411                         int i;
6412                         /*
6413                          * drop the old extent pointer first, then insert the
6414                          * new pointers one by one
6415                          */
6416                         btrfs_release_path(root, path);
6417                         ret = btrfs_drop_extents(trans, root, inode, key.offset,
6418                                                  key.offset + num_bytes,
6419                                                  key.offset, &alloc_hint);
6420                         BUG_ON(ret);
6421
6422                         for (i = 0; i < nr_extents; i++) {
6423                                 if (ext_offset >= new_extents[i].num_bytes) {
6424                                         ext_offset -= new_extents[i].num_bytes;
6425                                         continue;
6426                                 }
6427                                 extent_len = min(new_extents[i].num_bytes -
6428                                                  ext_offset, num_bytes);
6429
6430                                 ret = btrfs_insert_empty_item(trans, root,
6431                                                               path, &key,
6432                                                               sizeof(*fi));
6433                                 BUG_ON(ret);
6434
6435                                 leaf = path->nodes[0];
6436                                 fi = btrfs_item_ptr(leaf, path->slots[0],
6437                                                 struct btrfs_file_extent_item);
6438                                 btrfs_set_file_extent_generation(leaf, fi,
6439                                                         trans->transid);
6440                                 btrfs_set_file_extent_type(leaf, fi,
6441                                                         BTRFS_FILE_EXTENT_REG);
6442                                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6443                                                 new_extents[i].disk_bytenr);
6444                                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6445                                                 new_extents[i].disk_num_bytes);
6446                                 btrfs_set_file_extent_ram_bytes(leaf, fi,
6447                                                 new_extents[i].ram_bytes);
6448
6449                                 btrfs_set_file_extent_compression(leaf, fi,
6450                                                 new_extents[i].compression);
6451                                 btrfs_set_file_extent_encryption(leaf, fi,
6452                                                 new_extents[i].encryption);
6453                                 btrfs_set_file_extent_other_encoding(leaf, fi,
6454                                                 new_extents[i].other_encoding);
6455
6456                                 btrfs_set_file_extent_num_bytes(leaf, fi,
6457                                                         extent_len);
6458                                 ext_offset += new_extents[i].offset;
6459                                 btrfs_set_file_extent_offset(leaf, fi,
6460                                                         ext_offset);
6461                                 btrfs_mark_buffer_dirty(leaf);
6462
6463                                 btrfs_drop_extent_cache(inode, key.offset,
6464                                                 key.offset + extent_len - 1, 0);
6465
6466                                 ret = btrfs_inc_extent_ref(trans, root,
6467                                                 new_extents[i].disk_bytenr,
6468                                                 new_extents[i].disk_num_bytes,
6469                                                 leaf->start,
6470                                                 root->root_key.objectid,
6471                                                 trans->transid, key.objectid);
6472                                 BUG_ON(ret);
6473                                 btrfs_release_path(root, path);
6474
6475                                 inode_add_bytes(inode, extent_len);
6476
6477                                 ext_offset = 0;
6478                                 num_bytes -= extent_len;
6479                                 key.offset += extent_len;
6480
6481                                 if (num_bytes == 0)
6482                                         break;
6483                         }
6484                         BUG_ON(i >= nr_extents);
6485 #endif
6486                 }
6487
6488                 if (extent_locked) {
6489                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6490                                       lock_end, GFP_NOFS);
6491                         extent_locked = 0;
6492                 }
6493 skip:
6494                 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
6495                     key.offset >= search_end)
6496                         break;
6497
6498                 cond_resched();
6499         }
6500         ret = 0;
6501 out:
6502         btrfs_release_path(root, path);
6503         if (inode) {
6504                 mutex_unlock(&inode->i_mutex);
6505                 if (extent_locked) {
6506                         unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6507                                       lock_end, GFP_NOFS);
6508                 }
6509                 iput(inode);
6510         }
6511         return ret;
6512 }
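/*
 * Summary of the loop above: for each file extent item still pointing
 * at extent_key->objectid it roughly does
 *
 *	1. lock the file range covering the item and wait out any
 *	   overlapping ordered extent;
 *	2. rewrite disk_bytenr/disk_num_bytes in place to the single
 *	   new extent (the multi-extent branch is compiled out);
 *	3. btrfs_inc_extent_ref() on the new extent, btrfs_free_extent()
 *	   on the old one.
 */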
6513
6514 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6515                                struct btrfs_root *root,
6516                                struct extent_buffer *buf, u64 orig_start)
6517 {
6518         int level;
6519         int ret;
6520
6521         BUG_ON(btrfs_header_generation(buf) != trans->transid);
6522         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6523
6524         level = btrfs_header_level(buf);
6525         if (level == 0) {
6526                 struct btrfs_leaf_ref *ref;
6527                 struct btrfs_leaf_ref *orig_ref;
6528
6529                 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6530                 if (!orig_ref)
6531                         return -ENOENT;
6532
6533                 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6534                 if (!ref) {
6535                         btrfs_free_leaf_ref(root, orig_ref);
6536                         return -ENOMEM;
6537                 }
6538
6539                 ref->nritems = orig_ref->nritems;
6540                 memcpy(ref->extents, orig_ref->extents,
6541                         sizeof(ref->extents[0]) * ref->nritems);
6542
6543                 btrfs_free_leaf_ref(root, orig_ref);
6544
6545                 ref->root_gen = trans->transid;
6546                 ref->bytenr = buf->start;
6547                 ref->owner = btrfs_header_owner(buf);
6548                 ref->generation = btrfs_header_generation(buf);
6549
6550                 ret = btrfs_add_leaf_ref(root, ref, 0);
6551                 WARN_ON(ret);
6552                 btrfs_free_leaf_ref(root, ref);
6553         }
6554         return 0;
6555 }
6556
6557 static noinline int invalidate_extent_cache(struct btrfs_root *root,
6558                                         struct extent_buffer *leaf,
6559                                         struct btrfs_block_group_cache *group,
6560                                         struct btrfs_root *target_root)
6561 {
6562         struct btrfs_key key;
6563         struct inode *inode = NULL;
6564         struct btrfs_file_extent_item *fi;
6565         u64 num_bytes;
6566         u64 skip_objectid = 0;
6567         u32 nritems;
6568         u32 i;
6569
6570         nritems = btrfs_header_nritems(leaf);
6571         for (i = 0; i < nritems; i++) {
6572                 btrfs_item_key_to_cpu(leaf, &key, i);
6573                 if (key.objectid == skip_objectid ||
6574                     key.type != BTRFS_EXTENT_DATA_KEY)
6575                         continue;
6576                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6577                 if (btrfs_file_extent_type(leaf, fi) ==
6578                     BTRFS_FILE_EXTENT_INLINE)
6579                         continue;
6580                 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6581                         continue;
6582                 if (!inode || inode->i_ino != key.objectid) {
6583                         iput(inode);
6584                         inode = btrfs_ilookup(target_root->fs_info->sb,
6585                                               key.objectid, target_root, 1);
6586                 }
6587                 if (!inode) {
6588                         skip_objectid = key.objectid;
6589                         continue;
6590                 }
6591                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6592
6593                 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
6594                             key.offset + num_bytes - 1, GFP_NOFS);
6595                 btrfs_drop_extent_cache(inode, key.offset,
6596                                         key.offset + num_bytes - 1, 1);
6597                 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
6598                               key.offset + num_bytes - 1, GFP_NOFS);
6599                 cond_resched();
6600         }
6601         iput(inode);
6602         return 0;
6603 }
6604
6605 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
6606                                         struct btrfs_root *root,
6607                                         struct extent_buffer *leaf,
6608                                         struct btrfs_block_group_cache *group,
6609                                         struct inode *reloc_inode)
6610 {
6611         struct btrfs_key key;
6612         struct btrfs_key extent_key;
6613         struct btrfs_file_extent_item *fi;
6614         struct btrfs_leaf_ref *ref;
6615         struct disk_extent *new_extent;
6616         u64 bytenr;
6617         u64 num_bytes;
6618         u32 nritems;
6619         u32 i;
6620         int ext_index;
6621         int nr_extent;
6622         int ret;
6623
6624         new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
6625         BUG_ON(!new_extent);
6626
6627         ref = btrfs_lookup_leaf_ref(root, leaf->start);
6628         BUG_ON(!ref);
6629
6630         ext_index = -1;
6631         nritems = btrfs_header_nritems(leaf);
6632         for (i = 0; i < nritems; i++) {
6633                 btrfs_item_key_to_cpu(leaf, &key, i);
6634                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
6635                         continue;
6636                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6637                 if (btrfs_file_extent_type(leaf, fi) ==
6638                     BTRFS_FILE_EXTENT_INLINE)
6639                         continue;
6640                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6641                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6642                 if (bytenr == 0)
6643                         continue;
6644
6645                 ext_index++;
6646                 if (bytenr >= group->key.objectid + group->key.offset ||
6647                     bytenr + num_bytes <= group->key.objectid)
6648                         continue;
6649
6650                 extent_key.objectid = bytenr;
6651                 extent_key.offset = num_bytes;
6652                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
6653                 nr_extent = 1;
6654                 ret = get_new_locations(reloc_inode, &extent_key,
6655                                         group->key.objectid, 1,
6656                                         &new_extent, &nr_extent);
6657                 if (ret > 0)
6658                         continue;
6659                 BUG_ON(ret < 0);
6660
6661                 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
6662                 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
6663                 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
6664                 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
6665
6666                 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6667                                                 new_extent->disk_bytenr);
6668                 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6669                                                 new_extent->disk_num_bytes);
6670                 btrfs_mark_buffer_dirty(leaf);
6671
6672                 ret = btrfs_inc_extent_ref(trans, root,
6673                                         new_extent->disk_bytenr,
6674                                         new_extent->disk_num_bytes,
6675                                         leaf->start,
6676                                         root->root_key.objectid,
6677                                         trans->transid, key.objectid);
6678                 BUG_ON(ret);
6679
6680                 ret = btrfs_free_extent(trans, root,
6681                                         bytenr, num_bytes, leaf->start,
6682                                         btrfs_header_owner(leaf),
6683                                         btrfs_header_generation(leaf),
6684                                         key.objectid, 0);
6685                 BUG_ON(ret);
6686                 cond_resched();
6687         }
6688         kfree(new_extent);
6689         BUG_ON(ext_index + 1 != ref->nritems);
6690         btrfs_free_leaf_ref(root, ref);
6691         return 0;
6692 }
6693
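     /*
      * detach the reloc tree from the fs root and queue it on the dead
      * reloc root list.  the root item is updated to record the current
      * root node so the tree can be dropped later.
      */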
6694 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
6695                           struct btrfs_root *root)
6696 {
6697         struct btrfs_root *reloc_root;
6698         int ret;
6699
6700         if (root->reloc_root) {
6701                 reloc_root = root->reloc_root;
6702                 root->reloc_root = NULL;
6703                 list_add(&reloc_root->dead_list,
6704                          &root->fs_info->dead_reloc_roots);
6705
6706                 btrfs_set_root_bytenr(&reloc_root->root_item,
6707                                       reloc_root->node->start);
6708                 btrfs_set_root_level(&reloc_root->root_item,
6709                                      btrfs_header_level(reloc_root->node));
6710                 memset(&reloc_root->root_item.drop_progress, 0,
6711                         sizeof(struct btrfs_disk_key));
6712                 reloc_root->root_item.drop_level = 0;
6713
6714                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
6715                                         &reloc_root->root_key,
6716                                         &reloc_root->root_item);
6717                 BUG_ON(ret);
6718         }
6719         return 0;
6720 }
6721
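     /*
      * drop every root on the dead reloc root list.  btrfs_drop_snapshot
      * returns -EAGAIN when it needs the transaction restarted, so a
      * single root may take several transactions to fully go away.
      */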
6722 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
6723 {
6724         struct btrfs_trans_handle *trans;
6725         struct btrfs_root *reloc_root;
6726         struct btrfs_root *prev_root = NULL;
6727         struct list_head dead_roots;
6728         int ret;
6729         unsigned long nr;
6730
6731         INIT_LIST_HEAD(&dead_roots);
6732         list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
6733
6734         while (!list_empty(&dead_roots)) {
6735                 reloc_root = list_entry(dead_roots.prev,
6736                                         struct btrfs_root, dead_list);
6737                 list_del_init(&reloc_root->dead_list);
6738
6739                 BUG_ON(reloc_root->commit_root != NULL);
6740                 while (1) {
6741                         trans = btrfs_join_transaction(root, 1);
6742                         BUG_ON(!trans);
6743
6744                         mutex_lock(&root->fs_info->drop_mutex);
6745                         ret = btrfs_drop_snapshot(trans, reloc_root);
6746                         if (ret != -EAGAIN)
6747                                 break;
6748                         mutex_unlock(&root->fs_info->drop_mutex);
6749
6750                         nr = trans->blocks_used;
6751                         ret = btrfs_end_transaction(trans, root);
6752                         BUG_ON(ret);
6753                         btrfs_btree_balance_dirty(root, nr);
6754                 }
6755
6756                 free_extent_buffer(reloc_root->node);
6757
6758                 ret = btrfs_del_root(trans, root->fs_info->tree_root,
6759                                      &reloc_root->root_key);
6760                 BUG_ON(ret);
6761                 mutex_unlock(&root->fs_info->drop_mutex);
6762
6763                 nr = trans->blocks_used;
6764                 ret = btrfs_end_transaction(trans, root);
6765                 BUG_ON(ret);
6766                 btrfs_btree_balance_dirty(root, nr);
6767
6768                 kfree(prev_root);
6769                 prev_root = reloc_root;
6770         }
6771         if (prev_root) {
6772                 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
6773                 kfree(prev_root);
6774         }
6775         return 0;
6776 }
6777
6778 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
6779 {
6780         list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
6781         return 0;
6782 }
6783
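     /*
      * called during mount.  if any dead reloc roots are found, commit a
      * transaction so they get cleaned up, then run orphan cleanup on
      * the data relocation tree.
      */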
6784 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
6785 {
6786         struct btrfs_root *reloc_root;
6787         struct btrfs_trans_handle *trans;
6788         struct btrfs_key location;
6789         int found;
6790         int ret;
6791
6792         mutex_lock(&root->fs_info->tree_reloc_mutex);
6793         ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
6794         BUG_ON(ret);
6795         found = !list_empty(&root->fs_info->dead_reloc_roots);
6796         mutex_unlock(&root->fs_info->tree_reloc_mutex);
6797
6798         if (found) {
6799                 trans = btrfs_start_transaction(root, 1);
6800                 BUG_ON(!trans);
6801                 ret = btrfs_commit_transaction(trans, root);
6802                 BUG_ON(ret);
6803         }
6804
6805         location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6806         location.offset = (u64)-1;
6807         location.type = BTRFS_ROOT_ITEM_KEY;
6808
6809         reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
6810         BUG_ON(!reloc_root);
6811         btrfs_orphan_cleanup(reloc_root);
6812         return 0;
6813 }
6814
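     /*
      * create a reloc tree for the given fs root if it doesn't already
      * have one.  the reloc tree is a COW snapshot of root->commit_root,
      * inserted under BTRFS_TREE_RELOC_OBJECTID with a ref count of zero.
      */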
6815 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
6816                                     struct btrfs_root *root)
6817 {
6818         struct btrfs_root *reloc_root;
6819         struct extent_buffer *eb;
6820         struct btrfs_root_item *root_item;
6821         struct btrfs_key root_key;
6822         int ret;
6823
6824         BUG_ON(!root->ref_cows);
6825         if (root->reloc_root)
6826                 return 0;
6827
6828         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
6829         BUG_ON(!root_item);
6830
6831         ret = btrfs_copy_root(trans, root, root->commit_root,
6832                               &eb, BTRFS_TREE_RELOC_OBJECTID);
6833         BUG_ON(ret);
6834
6835         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
6836         root_key.offset = root->root_key.objectid;
6837         root_key.type = BTRFS_ROOT_ITEM_KEY;
6838
6839         memcpy(root_item, &root->root_item, sizeof(*root_item));
6840         btrfs_set_root_refs(root_item, 0);
6841         btrfs_set_root_bytenr(root_item, eb->start);
6842         btrfs_set_root_level(root_item, btrfs_header_level(eb));
6843         btrfs_set_root_generation(root_item, trans->transid);
6844
6845         btrfs_tree_unlock(eb);
6846         free_extent_buffer(eb);
6847
6848         ret = btrfs_insert_root(trans, root->fs_info->tree_root,
6849                                 &root_key, root_item);
6850         BUG_ON(ret);
6851         kfree(root_item);
6852
6853         reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
6854                                                  &root_key);
6855         BUG_ON(!reloc_root);
6856         reloc_root->last_trans = trans->transid;
6857         reloc_root->commit_root = NULL;
6858         reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
6859
6860         root->reloc_root = reloc_root;
6861         return 0;
6862 }
6863
6864 /*
6865  * Core function of space balance.
6866  *
6867  * The idea is to use reloc trees to relocate tree blocks in reference
6868  * counted roots. There is one reloc tree for each subvol, and all
6869  * reloc trees share the same root key objectid. Reloc trees are
6870  * snapshots of the latest committed roots of subvols (root->commit_root).
6871  *
6872  * To relocate a tree block referenced by a subvol, there are two steps.
6873  * COW the block through the subvol's reloc tree, then update the block
6874  * pointer in the subvol to point to the new block. Since all reloc
6875  * trees share the same root key objectid, special handling for tree
6876  * blocks owned by them is easy. Once a tree block has been COWed in
6877  * one reloc tree, we can use the resulting new block directly when the
6878  * same block needs to be COWed again through another reloc tree. In
6879  * this way, relocated tree blocks are shared between reloc trees, so
6880  * they are also shared between subvols.
6881  */
6882 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
6883                                       struct btrfs_root *root,
6884                                       struct btrfs_path *path,
6885                                       struct btrfs_key *first_key,
6886                                       struct btrfs_ref_path *ref_path,
6887                                       struct btrfs_block_group_cache *group,
6888                                       struct inode *reloc_inode)
6889 {
6890         struct btrfs_root *reloc_root;
6891         struct extent_buffer *eb = NULL;
6892         struct btrfs_key *keys;
6893         u64 *nodes;
6894         int level;
6895         int shared_level;
6896         int lowest_level = 0;
6897         int ret;
6898
6899         if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
6900                 lowest_level = ref_path->owner_objectid;
6901
6902         if (!root->ref_cows) {
6903                 path->lowest_level = lowest_level;
6904                 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
6905                 BUG_ON(ret < 0);
6906                 path->lowest_level = 0;
6907                 btrfs_release_path(root, path);
6908                 return 0;
6909         }
6910
6911         mutex_lock(&root->fs_info->tree_reloc_mutex);
6912         ret = init_reloc_tree(trans, root);
6913         BUG_ON(ret);
6914         reloc_root = root->reloc_root;
6915
6916         shared_level = ref_path->shared_level;
6917         ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
6918
6919         keys = ref_path->node_keys;
6920         nodes = ref_path->new_nodes;
6921         memset(&keys[shared_level + 1], 0,
6922                sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
6923         memset(&nodes[shared_level + 1], 0,
6924                sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
6925
6926         if (nodes[lowest_level] == 0) {
6927                 path->lowest_level = lowest_level;
6928                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6929                                         0, 1);
6930                 BUG_ON(ret);
6931                 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
6932                         eb = path->nodes[level];
6933                         if (!eb || eb == reloc_root->node)
6934                                 break;
6935                         nodes[level] = eb->start;
6936                         if (level == 0)
6937                                 btrfs_item_key_to_cpu(eb, &keys[level], 0);
6938                         else
6939                                 btrfs_node_key_to_cpu(eb, &keys[level], 0);
6940                 }
6941                 if (nodes[0] &&
6942                     ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6943                         eb = path->nodes[0];
6944                         ret = replace_extents_in_leaf(trans, reloc_root, eb,
6945                                                       group, reloc_inode);
6946                         BUG_ON(ret);
6947                 }
6948                 btrfs_release_path(reloc_root, path);
6949         } else {
6950                 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
6951                                        lowest_level);
6952                 BUG_ON(ret);
6953         }
6954
6955         /*
6956          * replace tree blocks in the fs tree with tree blocks in
6957          * the reloc tree.
6958          */
6959         ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
6960         BUG_ON(ret < 0);
6961
6962         if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
6963                 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
6964                                         0, 0);
6965                 BUG_ON(ret);
6966                 extent_buffer_get(path->nodes[0]);
6967                 eb = path->nodes[0];
6968                 btrfs_release_path(reloc_root, path);
6969                 ret = invalidate_extent_cache(reloc_root, eb, group, root);
6970                 BUG_ON(ret);
6971                 free_extent_buffer(eb);
6972         }
6973
6974         mutex_unlock(&root->fs_info->tree_reloc_mutex);
6975         path->lowest_level = 0;
6976         return 0;
6977 }
6978
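     /*
      * relocate a path that ends in a tree block.  this is just
      * relocate_one_path without any data extent replacement.
      */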
6979 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
6980                                         struct btrfs_root *root,
6981                                         struct btrfs_path *path,
6982                                         struct btrfs_key *first_key,
6983                                         struct btrfs_ref_path *ref_path)
6984 {
6985         int ret;
6986
6987         ret = relocate_one_path(trans, root, path, first_key,
6988                                 ref_path, NULL, NULL);
6989         BUG_ON(ret);
6990
6991         return 0;
6992 }
6993
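     /*
      * an extent item at bytenr 0 is bogus, so instead of relocating it
      * we simply delete it from the extent tree.
      */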
6994 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
6995                                     struct btrfs_root *extent_root,
6996                                     struct btrfs_path *path,
6997                                     struct btrfs_key *extent_key)
6998 {
6999         int ret;
7000
7001         ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
7002         if (ret)
7003                 goto out;
7004         ret = btrfs_del_item(trans, extent_root, path);
7005 out:
7006         btrfs_release_path(extent_root, path);
7007         return ret;
7008 }
7009
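     /*
      * read the root that a reference path starts from.  cow-only roots
      * are looked up with offset 0, everything else with offset -1,
      * meaning the latest root.
      */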
7010 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7011                                                 struct btrfs_ref_path *ref_path)
7012 {
7013         struct btrfs_key root_key;
7014
7015         root_key.objectid = ref_path->root_objectid;
7016         root_key.type = BTRFS_ROOT_ITEM_KEY;
7017         if (is_cowonly_root(ref_path->root_objectid))
7018                 root_key.offset = 0;
7019         else
7020                 root_key.offset = (u64)-1;
7021
7022         return btrfs_read_fs_root_no_name(fs_info, &root_key);
7023 }
7024
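     /*
      * relocate one extent by walking each of its reference paths.  on
      * pass 0 data extents are copied into the reloc inode; on later
      * passes references are updated through the reloc trees, falling
      * back to rewriting the extent pointers directly.
      */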
7025 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7026                                         struct btrfs_path *path,
7027                                         struct btrfs_key *extent_key,
7028                                         struct btrfs_block_group_cache *group,
7029                                         struct inode *reloc_inode, int pass)
7030 {
7031         struct btrfs_trans_handle *trans;
7032         struct btrfs_root *found_root;
7033         struct btrfs_ref_path *ref_path = NULL;
7034         struct disk_extent *new_extents = NULL;
7035         int nr_extents = 0;
7036         int loops;
7037         int ret;
7038         int level;
7039         struct btrfs_key first_key;
7040         u64 prev_block = 0;
7041
7043         trans = btrfs_start_transaction(extent_root, 1);
7044         BUG_ON(!trans);
7045
7046         if (extent_key->objectid == 0) {
7047                 ret = del_extent_zero(trans, extent_root, path, extent_key);
7048                 goto out;
7049         }
7050
7051         ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7052         if (!ref_path) {
7053                 ret = -ENOMEM;
7054                 goto out;
7055         }
7056
7057         for (loops = 0; ; loops++) {
7058                 if (loops == 0) {
7059                         ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7060                                                    extent_key->objectid);
7061                 } else {
7062                         ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7063                 }
7064                 if (ret < 0)
7065                         goto out;
7066                 if (ret > 0)
7067                         break;
7068
7069                 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7070                     ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7071                         continue;
7072
7073                 found_root = read_ref_root(extent_root->fs_info, ref_path);
7074                 BUG_ON(!found_root);
7075                 /*
7076                  * for a reference counted tree, only process reference
7077                  * paths rooted at the latest committed root.
7078                  */
7079                 if (found_root->ref_cows &&
7080                     ref_path->root_generation != found_root->root_key.offset)
7081                         continue;
7082
7083                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7084                         if (pass == 0) {
7085                                 /*
7086                                  * copy data extents to new locations
7087                                  */
7088                                 u64 group_start = group->key.objectid;
7089                                 ret = relocate_data_extent(reloc_inode,
7090                                                            extent_key,
7091                                                            group_start);
7092                                 if (ret < 0)
7093                                         goto out;
7094                                 break;
7095                         }
7096                         level = 0;
7097                 } else {
7098                         level = ref_path->owner_objectid;
7099                 }
7100
7101                 if (prev_block != ref_path->nodes[level]) {
7102                         struct extent_buffer *eb;
7103                         u64 block_start = ref_path->nodes[level];
7104                         u64 block_size = btrfs_level_size(found_root, level);
7105
7106                         eb = read_tree_block(found_root, block_start,
7107                                              block_size, 0);
7108                         btrfs_tree_lock(eb);
7109                         BUG_ON(level != btrfs_header_level(eb));
7110
7111                         if (level == 0)
7112                                 btrfs_item_key_to_cpu(eb, &first_key, 0);
7113                         else
7114                                 btrfs_node_key_to_cpu(eb, &first_key, 0);
7115
7116                         btrfs_tree_unlock(eb);
7117                         free_extent_buffer(eb);
7118                         prev_block = block_start;
7119                 }
7120
7121                 mutex_lock(&extent_root->fs_info->trans_mutex);
7122                 btrfs_record_root_in_trans(found_root);
7123                 mutex_unlock(&extent_root->fs_info->trans_mutex);
7124                 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7125                         /*
7126                          * try to update data extent references while
7127                          * keeping metadata shared between snapshots.
7128                          */
7129                         if (pass == 1) {
7130                                 ret = relocate_one_path(trans, found_root,
7131                                                 path, &first_key, ref_path,
7132                                                 group, reloc_inode);
7133                                 if (ret < 0)
7134                                         goto out;
7135                                 continue;
7136                         }
7137                         /*
7138                          * use the fallback method to process the remaining
7139                          * references.
7140                          */
7141                         if (!new_extents) {
7142                                 u64 group_start = group->key.objectid;
7143                                 new_extents = kmalloc(sizeof(*new_extents),
7144                                                       GFP_NOFS);
                                     if (!new_extents) {
                                             ret = -ENOMEM;
                                             goto out;
                                     }
7145                                 nr_extents = 1;
7146                                 ret = get_new_locations(reloc_inode,
7147                                                         extent_key,
7148                                                         group_start, 1,
7149                                                         &new_extents,
7150                                                         &nr_extents);
7151                                 if (ret)
7152                                         goto out;
7153                         }
7154                         ret = replace_one_extent(trans, found_root,
7155                                                 path, extent_key,
7156                                                 &first_key, ref_path,
7157                                                 new_extents, nr_extents);
7158                 } else {
7159                         ret = relocate_tree_block(trans, found_root, path,
7160                                                   &first_key, ref_path);
7161                 }
7162                 if (ret < 0)
7163                         goto out;
7164         }
7165         ret = 0;
7166 out:
7167         btrfs_end_transaction(trans, extent_root);
7168         kfree(new_extents);
7169         kfree(ref_path);
7170         return ret;
7171 }
7172 #endif
7173
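     /*
      * figure out the allocation profile to use after a balance: with a
      * single rw device, raid0 degrades to single and mirroring to dup;
      * with multiple devices, dup becomes raid1 and single becomes raid0.
      */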
7174 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7175 {
7176         u64 num_devices;
7177         u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7178                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7179
7180         num_devices = root->fs_info->fs_devices->rw_devices;
7181         if (num_devices == 1) {
7182                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7183                 stripped = flags & ~stripped;
7184
7185                 /* turn raid0 into single device chunks */
7186                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7187                         return stripped;
7188
7189                 /* turn mirroring into duplication */
7190                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7191                              BTRFS_BLOCK_GROUP_RAID10))
7192                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7193                 return flags;
7194         } else {
7195                 /* they already had raid on here, just return */
7196                 if (flags & stripped)
7197                         return flags;
7198
7199                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7200                 stripped = flags & ~stripped;
7201
7202                 /* switch duplicated blocks with raid1 */
7203                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7204                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7205
7206                 /* turn single device chunks into raid0 */
7207                 return stripped | BTRFS_BLOCK_GROUP_RAID0;
7208         }
7209         return flags;
7210 }
7211
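     /*
      * make sure a chunk with the right profile is available before
      * shrinking a block group, so its used bytes have somewhere to be
      * relocated to.
      */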
7212 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
7213                      struct btrfs_block_group_cache *shrink_block_group,
7214                      int force)
7215 {
7216         struct btrfs_trans_handle *trans;
7217         u64 new_alloc_flags;
7218         u64 calc;
7219
7220         spin_lock(&shrink_block_group->lock);
7221         if (btrfs_block_group_used(&shrink_block_group->item) +
7222             shrink_block_group->reserved > 0) {
7223                 spin_unlock(&shrink_block_group->lock);
7224
7225                 trans = btrfs_start_transaction(root, 1);
7226                 spin_lock(&shrink_block_group->lock);
7227
7228                 new_alloc_flags = update_block_group_flags(root,
7229                                                    shrink_block_group->flags);
7230                 if (new_alloc_flags != shrink_block_group->flags) {
7231                         calc =
7232                              btrfs_block_group_used(&shrink_block_group->item);
7233                 } else {
7234                         calc = shrink_block_group->key.offset;
7235                 }
7236                 spin_unlock(&shrink_block_group->lock);
7237
7238                 do_chunk_alloc(trans, root->fs_info->extent_root,
7239                                calc + 2 * 1024 * 1024, new_alloc_flags, force);
7240
7241                 btrfs_end_transaction(trans, root);
7242         } else
7243                 spin_unlock(&shrink_block_group->lock);
7244         return 0;
7245 }
7246
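     /*
      * get a block group ready for relocation: allocate a replacement
      * chunk and mark the group read only.
      */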
7248 int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
7249                                          struct btrfs_block_group_cache *group)
7251 {
7252         __alloc_chunk_for_shrink(root, group, 1);
7253         set_block_group_readonly(group);
7254         return 0;
7255 }
7256
7257 /*
7258  * checks to see if it's even possible to relocate this block group.
7259  *
7260  * @return - -1 if it's not a good idea to relocate this block group, 0 if
7261  * it's ok to go ahead and try.
7262  */
7263 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7264 {
7265         struct btrfs_block_group_cache *block_group;
7266         struct btrfs_space_info *space_info;
7267         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7268         struct btrfs_device *device;
7269         int full = 0;
7270         int ret = 0;
7271
7272         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7273
7274         /* odd, couldn't find the block group, leave it alone */
7275         if (!block_group)
7276                 return -1;
7277
7278         /* no bytes used, we're good */
7279         if (!btrfs_block_group_used(&block_group->item))
7280                 goto out;
7281
7282         space_info = block_group->space_info;
7283         spin_lock(&space_info->lock);
7284
7285         full = space_info->full;
7286
7287         /*
7288          * if this is the last block group we have in this space, we can't
7289          * relocate it unless we're able to allocate a new chunk below.
7290          *
7291          * Otherwise, we need to make sure we have room in the space to handle
7292          * all of the extents from this block group.  If we can, we're good
7293          */
7294         if ((space_info->total_bytes != block_group->key.offset) &&
7295            (space_info->bytes_used + space_info->bytes_reserved +
7296             space_info->bytes_pinned + space_info->bytes_readonly +
7297             btrfs_block_group_used(&block_group->item) <
7298             space_info->total_bytes)) {
7299                 spin_unlock(&space_info->lock);
7300                 goto out;
7301         }
7302         spin_unlock(&space_info->lock);
7303
7304         /*
7305          * ok we don't have enough space, but maybe we have free space on our
7306          * devices to allocate new chunks for relocation, so loop through our
7307          * alloc devices and guess if we have enough space.  However, if we
7308          * were marked as full, then we know there aren't enough chunks, and we
7309          * can just return.
7310          */
7311         ret = -1;
7312         if (full)
7313                 goto out;
7314
7315         mutex_lock(&root->fs_info->chunk_mutex);
7316         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7317                 u64 min_free = btrfs_block_group_used(&block_group->item);
7318                 u64 dev_offset, max_avail;
7319
7320                 /*
7321                  * check to make sure we can actually find a chunk with enough
7322                  * space to fit our block group in.
7323                  */
7324                 if (device->total_bytes > device->bytes_used + min_free) {
7325                         ret = find_free_dev_extent(NULL, device, min_free,
7326                                                    &dev_offset, &max_avail);
7327                         if (!ret)
7328                                 break;
7329                         ret = -1;
7330                 }
7331         }
7332         mutex_unlock(&root->fs_info->chunk_mutex);
7333 out:
7334         btrfs_put_block_group(block_group);
7335         return ret;
7336 }
7337
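     /*
      * find the first block group item with objectid >= key->objectid.
      * returns 0 and leaves the path pointing at it, or -ENOENT if
      * there are none left.
      */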
7338 static int find_first_block_group(struct btrfs_root *root,
7339                 struct btrfs_path *path, struct btrfs_key *key)
7340 {
7341         int ret = 0;
7342         struct btrfs_key found_key;
7343         struct extent_buffer *leaf;
7344         int slot;
7345
7346         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7347         if (ret < 0)
7348                 goto out;
7349
7350         while (1) {
7351                 slot = path->slots[0];
7352                 leaf = path->nodes[0];
7353                 if (slot >= btrfs_header_nritems(leaf)) {
7354                         ret = btrfs_next_leaf(root, path);
7355                         if (ret == 0)
7356                                 continue;
7357                         if (ret < 0)
7358                                 goto out;
7359                         break;
7360                 }
7361                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7362
7363                 if (found_key.objectid >= key->objectid &&
7364                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7365                         ret = 0;
7366                         goto out;
7367                 }
7368                 path->slots[0]++;
7369         }
7370         ret = -ENOENT;
7371 out:
7372         return ret;
7373 }
7374
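     /*
      * tear down all in-memory block group and space info structures.
      * only called in the final stages of unmount.
      */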
7375 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7376 {
7377         struct btrfs_block_group_cache *block_group;
7378         struct btrfs_space_info *space_info;
7379         struct btrfs_caching_control *caching_ctl;
7380         struct rb_node *n;
7381
7382         down_write(&info->extent_commit_sem);
7383         while (!list_empty(&info->caching_block_groups)) {
7384                 caching_ctl = list_entry(info->caching_block_groups.next,
7385                                          struct btrfs_caching_control, list);
7386                 list_del(&caching_ctl->list);
7387                 put_caching_control(caching_ctl);
7388         }
7389         up_write(&info->extent_commit_sem);
7390
7391         spin_lock(&info->block_group_cache_lock);
7392         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7393                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7394                                        cache_node);
7395                 rb_erase(&block_group->cache_node,
7396                          &info->block_group_cache_tree);
7397                 spin_unlock(&info->block_group_cache_lock);
7398
7399                 down_write(&block_group->space_info->groups_sem);
7400                 list_del(&block_group->list);
7401                 up_write(&block_group->space_info->groups_sem);
7402
7403                 if (block_group->cached == BTRFS_CACHE_STARTED)
7404                         wait_block_group_cache_done(block_group);
7405
7406                 btrfs_remove_free_space_cache(block_group);
7407
7408                 WARN_ON(atomic_read(&block_group->count) != 1);
7409                 kfree(block_group);
7410
7411                 spin_lock(&info->block_group_cache_lock);
7412         }
7413         spin_unlock(&info->block_group_cache_lock);
7414
7415         /* now that all the block groups are freed, go through and
7416          * free all the space_info structs.  This is only called during
7417          * the final stages of unmount, and so we know nobody is
7418          * using them.  We call synchronize_rcu() once before we start,
7419          * just to be on the safe side.
7420          */
7421         synchronize_rcu();
7422
7423         while (!list_empty(&info->space_info)) {
7424                 space_info = list_entry(info->space_info.next,
7425                                         struct btrfs_space_info,
7426                                         list);
7427
7428                 list_del(&space_info->list);
7429                 kfree(space_info);
7430         }
7431         return 0;
7432 }
7433
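     /*
      * load every block group item from the extent tree at mount time
      * and build the in-memory block group cache.
      */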
7434 int btrfs_read_block_groups(struct btrfs_root *root)
7435 {
7436         struct btrfs_path *path;
7437         int ret;
7438         struct btrfs_block_group_cache *cache;
7439         struct btrfs_fs_info *info = root->fs_info;
7440         struct btrfs_space_info *space_info;
7441         struct btrfs_key key;
7442         struct btrfs_key found_key;
7443         struct extent_buffer *leaf;
7444
7445         root = info->extent_root;
7446         key.objectid = 0;
7447         key.offset = 0;
7448         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7449         path = btrfs_alloc_path();
7450         if (!path)
7451                 return -ENOMEM;
7452
7453         while (1) {
7454                 ret = find_first_block_group(root, path, &key);
7455                 if (ret > 0) {
7456                         ret = 0;
7457                         goto error;
7458                 }
7459                 if (ret != 0)
7460                         goto error;
7461
7462                 leaf = path->nodes[0];
7463                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7464                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7465                 if (!cache) {
7466                         ret = -ENOMEM;
7467                         goto error;
7468                 }
7469
7470                 atomic_set(&cache->count, 1);
7471                 spin_lock_init(&cache->lock);
7472                 spin_lock_init(&cache->tree_lock);
7473                 cache->fs_info = info;
7474                 INIT_LIST_HEAD(&cache->list);
7475                 INIT_LIST_HEAD(&cache->cluster_list);
7476
7477                 /*
7478                  * we only want to have 32k of ram per block group for keeping
7479                  * track of free space, and if we pass 1/2 of that we want to
7480                  * start converting things over to using bitmaps
7481                  */
7482                 cache->extents_thresh = ((1024 * 32) / 2) /
7483                         sizeof(struct btrfs_free_space);
7484
7485                 read_extent_buffer(leaf, &cache->item,
7486                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7487                                    sizeof(cache->item));
7488                 memcpy(&cache->key, &found_key, sizeof(found_key));
7489
7490                 key.objectid = found_key.objectid + found_key.offset;
7491                 btrfs_release_path(root, path);
7492                 cache->flags = btrfs_block_group_flags(&cache->item);
7493                 cache->sectorsize = root->sectorsize;
7494
7495                 /*
7496                  * check for two cases, either we are full, and therefore
7497                  * don't need to bother with the caching work since we won't
7498                  * find any space, or we are empty, and we can just add all
7499                  * the space in and be done with it.  This saves us a lot of
7500                  * time, particularly in the full case.
7501                  */
7502                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7503                         exclude_super_stripes(root, cache);
7504                         cache->last_byte_to_unpin = (u64)-1;
7505                         cache->cached = BTRFS_CACHE_FINISHED;
7506                         free_excluded_extents(root, cache);
7507                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7508                         exclude_super_stripes(root, cache);
7509                         cache->last_byte_to_unpin = (u64)-1;
7510                         cache->cached = BTRFS_CACHE_FINISHED;
7511                         add_new_free_space(cache, root->fs_info,
7512                                            found_key.objectid,
7513                                            found_key.objectid +
7514                                            found_key.offset);
7515                         free_excluded_extents(root, cache);
7516                 }
7517
7518                 ret = update_space_info(info, cache->flags, found_key.offset,
7519                                         btrfs_block_group_used(&cache->item),
7520                                         &space_info);
7521                 BUG_ON(ret);
7522                 cache->space_info = space_info;
7523                 spin_lock(&cache->space_info->lock);
7524                 cache->space_info->bytes_super += cache->bytes_super;
7525                 spin_unlock(&cache->space_info->lock);
7526
7527                 down_write(&space_info->groups_sem);
7528                 list_add_tail(&cache->list, &space_info->block_groups);
7529                 up_write(&space_info->groups_sem);
7530
7531                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7532                 BUG_ON(ret);
7533
7534                 set_avail_alloc_bits(root->fs_info, cache->flags);
7535                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7536                         set_block_group_readonly(cache);
7537         }
7538         ret = 0;
7539 error:
7540         btrfs_free_path(path);
7541         return ret;
7542 }
7543
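     /*
      * create the block group for a newly allocated chunk and insert its
      * item into the extent tree.
      */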
7544 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7545                            struct btrfs_root *root, u64 bytes_used,
7546                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7547                            u64 size)
7548 {
7549         int ret;
7550         struct btrfs_root *extent_root;
7551         struct btrfs_block_group_cache *cache;
7552
7553         extent_root = root->fs_info->extent_root;
7554
7555         root->fs_info->last_trans_log_full_commit = trans->transid;
7556
7557         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7558         if (!cache)
7559                 return -ENOMEM;
7560
7561         cache->key.objectid = chunk_offset;
7562         cache->key.offset = size;
7563         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7564         cache->sectorsize = root->sectorsize;
7565
7566         /*
7567          * we only want to have 32k of ram per block group for keeping track
7568          * of free space, and if we pass 1/2 of that we want to start
7569          * converting things over to using bitmaps
7570          */
7571         cache->extents_thresh = ((1024 * 32) / 2) /
7572                 sizeof(struct btrfs_free_space);
7573         atomic_set(&cache->count, 1);
7574         spin_lock_init(&cache->lock);
7575         spin_lock_init(&cache->tree_lock);
7576         INIT_LIST_HEAD(&cache->list);
7577         INIT_LIST_HEAD(&cache->cluster_list);
7578
7579         btrfs_set_block_group_used(&cache->item, bytes_used);
7580         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7581         cache->flags = type;
7582         btrfs_set_block_group_flags(&cache->item, type);
7583
7584         cache->last_byte_to_unpin = (u64)-1;
7585         cache->cached = BTRFS_CACHE_FINISHED;
7586         exclude_super_stripes(root, cache);
7587
7588         add_new_free_space(cache, root->fs_info, chunk_offset,
7589                            chunk_offset + size);
7590
7591         free_excluded_extents(root, cache);
7592
7593         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7594                                 &cache->space_info);
7595         BUG_ON(ret);
7596
7597         spin_lock(&cache->space_info->lock);
7598         cache->space_info->bytes_super += cache->bytes_super;
7599         spin_unlock(&cache->space_info->lock);
7600
7601         down_write(&cache->space_info->groups_sem);
7602         list_add_tail(&cache->list, &cache->space_info->block_groups);
7603         up_write(&cache->space_info->groups_sem);
7604
7605         ret = btrfs_add_block_group_cache(root->fs_info, cache);
7606         BUG_ON(ret);
7607
7608         ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7609                                 sizeof(cache->item));
7610         BUG_ON(ret);
7611
7612         set_avail_alloc_bits(extent_root->fs_info, type);
7613
7614         return 0;
7615 }
7616
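     /*
      * remove a (now empty) read-only block group: pull it out of any
      * allocation clusters and its space info, then delete its item
      * from the extent tree.
      */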
7617 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7618                              struct btrfs_root *root, u64 group_start)
7619 {
7620         struct btrfs_path *path;
7621         struct btrfs_block_group_cache *block_group;
7622         struct btrfs_free_cluster *cluster;
7623         struct btrfs_key key;
7624         int ret;
7625
7626         root = root->fs_info->extent_root;
7627
7628         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7629         BUG_ON(!block_group);
7630         BUG_ON(!block_group->ro);
7631
7632         memcpy(&key, &block_group->key, sizeof(key));
7633
7634         /* make sure this block group isn't part of an allocation cluster */
7635         cluster = &root->fs_info->data_alloc_cluster;
7636         spin_lock(&cluster->refill_lock);
7637         btrfs_return_cluster_to_free_space(block_group, cluster);
7638         spin_unlock(&cluster->refill_lock);
7639
7640         /*
7641          * make sure this block group isn't part of a metadata
7642          * allocation cluster
7643          */
7644         cluster = &root->fs_info->meta_alloc_cluster;
7645         spin_lock(&cluster->refill_lock);
7646         btrfs_return_cluster_to_free_space(block_group, cluster);
7647         spin_unlock(&cluster->refill_lock);
7648
7649         path = btrfs_alloc_path();
7650         BUG_ON(!path);
7651
7652         spin_lock(&root->fs_info->block_group_cache_lock);
7653         rb_erase(&block_group->cache_node,
7654                  &root->fs_info->block_group_cache_tree);
7655         spin_unlock(&root->fs_info->block_group_cache_lock);
7656
7657         down_write(&block_group->space_info->groups_sem);
7658         /*
7659          * we must use list_del_init so people can check to see if they
7660          * are still on the list after taking the semaphore
7661          */
7662         list_del_init(&block_group->list);
7663         up_write(&block_group->space_info->groups_sem);
7664
7665         if (block_group->cached == BTRFS_CACHE_STARTED)
7666                 wait_block_group_cache_done(block_group);
7667
7668         btrfs_remove_free_space_cache(block_group);
7669
7670         spin_lock(&block_group->space_info->lock);
7671         block_group->space_info->total_bytes -= block_group->key.offset;
7672         block_group->space_info->bytes_readonly -= block_group->key.offset;
7673         spin_unlock(&block_group->space_info->lock);
7674
7675         btrfs_clear_space_info_full(root->fs_info);
7676
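             /*
              * drop two references: one for the lookup above, and one for
              * the block group cache tree we removed it from.
              */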
7677         btrfs_put_block_group(block_group);
7678         btrfs_put_block_group(block_group);
7679
7680         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7681         if (ret > 0)
7682                 ret = -EIO;
7683         if (ret < 0)
7684                 goto out;
7685
7686         ret = btrfs_del_item(trans, root, path);
7687 out:
7688         btrfs_free_path(path);
7689         return ret;
7690 }