becf3396d533deb6fe7a0536a662bc0554bb9cc2
[platform/kernel/linux-rpi.git] / fs / btrfs / relocation.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/pagemap.h>
8 #include <linux/writeback.h>
9 #include <linux/blkdev.h>
10 #include <linux/rbtree.h>
11 #include <linux/slab.h>
12 #include <linux/error-injection.h>
13 #include "ctree.h"
14 #include "disk-io.h"
15 #include "transaction.h"
16 #include "volumes.h"
17 #include "locking.h"
18 #include "btrfs_inode.h"
19 #include "async-thread.h"
20 #include "free-space-cache.h"
21 #include "qgroup.h"
22 #include "print-tree.h"
23 #include "delalloc-space.h"
24 #include "block-group.h"
25 #include "backref.h"
26 #include "misc.h"
27 #include "subpage.h"
28
29 /*
30  * Relocation overview
31  *
32  * [What does relocation do]
33  *
34  * The objective of relocation is to relocate all extents of the target block
35  * group to other block groups.
36  * This is utilized by resize (shrink only), profile converting, compacting
37  * space, or balance routine to spread chunks over devices.
38  *
39  *              Before          |               After
40  * ------------------------------------------------------------------
41  *  BG A: 10 data extents       | BG A: deleted
42  *  BG B:  2 data extents       | BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 extent             | BG C:  3 data extents (1 old + 2 relocated)
44  *
45  * [How does relocation work]
46  *
47  * 1.   Mark the target block group read-only
48  *      New extents won't be allocated from the target block group.
49  *
50  * 2.1  Record each extent in the target block group
51  *      To build a proper map of extents to be relocated.
52  *
53  * 2.2  Build data reloc tree and reloc trees
54  *      Data reloc tree will contain an inode, recording all newly relocated
55  *      data extents.
56  *      There will be only one data reloc tree for one data block group.
57  *
58  *      Reloc tree will be a special snapshot of its source tree, containing
59  *      relocated tree blocks.
60  *      Each tree referring to a tree block in target block group will get its
61  *      reloc tree built.
62  *
63  * 2.3  Swap source tree with its corresponding reloc tree
64  *      Each involved tree only refers to new extents after swap.
65  *
66  * 3.   Cleanup reloc trees and data reloc tree.
67  *      As old extents in the target block group are still referenced by reloc
68  *      trees, we need to clean them up before really freeing the target block
69  *      group.
70  *
71  * The main complexity is in steps 2.2 and 2.3.
72  *
73  * The entry point of relocation is relocate_block_group() function.
74  */
75
/*
 * Number of tree nodes reserved for relocation metadata.
 * NOTE(review): the consumer of this reservation is outside this chunk;
 * confirm against the reserve/refill logic before relying on the meaning.
 */
#define RELOCATION_RESERVED_NODES	256
/*
 * Maps the address (bytenr) of a tree root to its reloc tree.
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	/* The struct btrfs_root this mapping refers to (see __add_reloc_root) */
	void *data;
};
87
/* rbtree of mapping_nodes, protected by its own spinlock */
struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};
92
/*
 * present a tree block to process
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	/* Level of the tree block (0 is a leaf) */
	unsigned int level:8;
	/* NOTE(review): presumably set once @key has been read in; confirm */
	unsigned int key_ready:1;
};
106
/* Maximum number of extent boundaries one cluster can track */
#define MAX_EXTENTS 128

struct file_extent_cluster {
	/* Logical start of the cluster */
	u64 start;
	/* Logical end of the cluster */
	u64 end;
	/* NOTE(review): presumably start offsets of member extents; confirm */
	u64 boundary[MAX_EXTENTS];
	/* Number of boundary[] slots in use */
	unsigned int nr;
};
115
/*
 * Central state for one block-group relocation run (fs_info->reloc_ctl).
 */
struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	/* NOTE(review): search cursor/counter, maintained outside this chunk */
	u64 search_start;
	u64 extents_found;

	/* current stage: MOVE_DATA_EXTENTS or UPDATE_DATA_PTRS */
	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};
152
/* stages of data relocation (see reloc_control::stage) */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1
156
157 static void mark_block_processed(struct reloc_control *rc,
158                                  struct btrfs_backref_node *node)
159 {
160         u32 blocksize;
161
162         if (node->level == 0 ||
163             in_range(node->bytenr, rc->block_group->start,
164                      rc->block_group->length)) {
165                 blocksize = rc->extent_root->fs_info->nodesize;
166                 set_extent_bits(&rc->processed_blocks, node->bytenr,
167                                 node->bytenr + blocksize - 1, EXTENT_DIRTY);
168         }
169         node->processed = 1;
170 }
171
172
/* Initialize an empty mapping_tree. */
static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}
178
179 /*
180  * walk up backref nodes until reach node presents tree root
181  */
182 static struct btrfs_backref_node *walk_up_backref(
183                 struct btrfs_backref_node *node,
184                 struct btrfs_backref_edge *edges[], int *index)
185 {
186         struct btrfs_backref_edge *edge;
187         int idx = *index;
188
189         while (!list_empty(&node->upper)) {
190                 edge = list_entry(node->upper.next,
191                                   struct btrfs_backref_edge, list[LOWER]);
192                 edges[idx++] = edge;
193                 node = edge->node[UPPER];
194         }
195         BUG_ON(node->detached);
196         *index = idx;
197         return node;
198 }
199
200 /*
201  * walk down backref nodes to find start of next reference path
202  */
203 static struct btrfs_backref_node *walk_down_backref(
204                 struct btrfs_backref_edge *edges[], int *index)
205 {
206         struct btrfs_backref_edge *edge;
207         struct btrfs_backref_node *lower;
208         int idx = *index;
209
210         while (idx > 0) {
211                 edge = edges[idx - 1];
212                 lower = edge->node[LOWER];
213                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
214                         idx--;
215                         continue;
216                 }
217                 edge = list_entry(edge->list[LOWER].next,
218                                   struct btrfs_backref_edge, list[LOWER]);
219                 edges[idx - 1] = edge;
220                 *index = idx;
221                 return edge->node[UPPER];
222         }
223         *index = 0;
224         return NULL;
225 }
226
227 static void update_backref_node(struct btrfs_backref_cache *cache,
228                                 struct btrfs_backref_node *node, u64 bytenr)
229 {
230         struct rb_node *rb_node;
231         rb_erase(&node->rb_node, &cache->rb_root);
232         node->bytenr = bytenr;
233         rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
234         if (rb_node)
235                 btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
236 }
237
/*
 * update backref cache after a transaction commit
 *
 * Returns 0 when the cache is already current for this transaction,
 * 1 after stale entries have been dropped/re-keyed.
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	/* First use in this transaction: just remember the transid */
	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	/* Cache already reflects the running transaction */
	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * detached nodes are used to avoid unnecessary backref
	 * lookup. transaction commit changes the extent tree.
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	/* Re-key nodes whose blocks moved (COW) to their new bytenr */
	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * some nodes can be left in the pending list if there were
	 * errors during processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}
290
291 static bool reloc_root_is_dead(struct btrfs_root *root)
292 {
293         /*
294          * Pair with set_bit/clear_bit in clean_dirty_subvols and
295          * btrfs_update_reloc_root. We need to see the updated bit before
296          * trying to access reloc_root
297          */
298         smp_rmb();
299         if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
300                 return true;
301         return false;
302 }
303
304 /*
305  * Check if this subvolume tree has valid reloc tree.
306  *
307  * Reloc tree after swap is considered dead, thus not considered as valid.
308  * This is enough for most callers, as they don't distinguish dead reloc root
309  * from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
310  * special case.
311  */
312 static bool have_reloc_root(struct btrfs_root *root)
313 {
314         if (reloc_root_is_dead(root))
315                 return false;
316         if (!root->reloc_root)
317                 return false;
318         return true;
319 }
320
/*
 * Decide whether relocation can ignore the backref node for @root's tree
 * root.  Returns 1 when it can be ignored, 0 otherwise.
 */
int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	/* Only shareable (subvolume) roots carry reloc trees */
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	/* Reloc tree was created in the running transaction: cannot ignore */
	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * if there is reloc tree and it was created in previous
	 * transaction backref lookup can find the reloc tree,
	 * so backref node for the fs tree root is useless for
	 * relocation.
	 */
	return 1;
}
347
348 /*
349  * find reloc tree by address of tree root
350  */
351 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
352 {
353         struct reloc_control *rc = fs_info->reloc_ctl;
354         struct rb_node *rb_node;
355         struct mapping_node *node;
356         struct btrfs_root *root = NULL;
357
358         ASSERT(rc);
359         spin_lock(&rc->reloc_root_tree.lock);
360         rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
361         if (rb_node) {
362                 node = rb_entry(rb_node, struct mapping_node, rb_node);
363                 root = (struct btrfs_root *)node->data;
364         }
365         spin_unlock(&rc->reloc_root_tree.lock);
366         return btrfs_grab_root(root);
367 }
368
/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				 list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		/* Remember whether the caller's node itself was useless */
		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}
442
/*
 * Build backref tree for a given tree block. Root of the backref tree
 * corresponds the tree block, leaves of the backref tree correspond roots of
 * b-trees that reference the tree block.
 *
 * The basic idea of this function is check backrefs of a given block to find
 * upper level blocks that reference the block, and then check backrefs of
 * these upper level blocks recursively. The recursion stops when tree root is
 * reached or backrefs for the block is cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 *
 * Return the backref node for @bytenr, NULL if the whole subtree turned out
 * to be useless, or an ERR_PTR() on failure (after cleaning up the cache).
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
						  cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/* The whole subtree may be useless; in that case return NULL */
	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		/* Drop everything built so far from the cache */
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}
530
/*
 * helper to add backref node for the newly created snapshot.
 * the backref node is created by cloning backref node that
 * corresponds to root of source tree
 *
 * Returns 0 on success (including when no cached node exists to clone),
 * -ENOMEM on allocation failure.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	/* Look up the source tree's root node in the cache first */
	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	/* Fall back to the reloc root's commit root node */
	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	/* Nothing cached to clone: not an error */
	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	/* Clone each lower edge; the LOWER side is linked later below */
	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	/* Now complete the LOWER-side linkage of the cloned edges */
	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	/* Undo the partially cloned edges and the new node */
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}
619
620 /*
621  * helper to add 'address of tree root -> reloc tree' mapping
622  */
623 static int __must_check __add_reloc_root(struct btrfs_root *root)
624 {
625         struct btrfs_fs_info *fs_info = root->fs_info;
626         struct rb_node *rb_node;
627         struct mapping_node *node;
628         struct reloc_control *rc = fs_info->reloc_ctl;
629
630         node = kmalloc(sizeof(*node), GFP_NOFS);
631         if (!node)
632                 return -ENOMEM;
633
634         node->bytenr = root->commit_root->start;
635         node->data = root;
636
637         spin_lock(&rc->reloc_root_tree.lock);
638         rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
639                                    node->bytenr, &node->rb_node);
640         spin_unlock(&rc->reloc_root_tree.lock);
641         if (rb_node) {
642                 btrfs_err(fs_info,
643                             "Duplicate root found for start=%llu while inserting into relocation tree",
644                             node->bytenr);
645                 return -EEXIST;
646         }
647
648         list_add_tail(&root->root_list, &rc->reloc_roots);
649         return 0;
650 }
651
/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 *
 * Removes the mapping node (if one exists) from rc->reloc_root_tree and
 * frees it, and drops the rc->reloc_roots list reference on @root if the
 * root is currently on a list.
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list.  There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	/* kfree(NULL) is a no-op when no mapping node was found */
	kfree(node);
}
695
/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 *
 * Re-keys the mapping node from the commit root's bytenr to the current
 * root node's bytenr.  Always returns 0; missing mapping is not an error.
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	/* Phase 1: remove the node keyed by the old (commit root) bytenr */
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	/* Phase 2: re-insert keyed by the current root node's bytenr */
	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}
729
/*
 * Create a reloc root for @root with key
 * (BTRFS_TREE_RELOC_OBJECTID, ROOT_ITEM, @objectid).
 *
 * Returns the new reloc root or an ERR_PTR().  Once references have been
 * changed on disk, any failure aborts the transaction.
 */
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		/* A reloc root snapshot: zero refs and drop progress */
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}
825
826 /*
827  * create reloc tree for a given fs tree. reloc tree is just a
828  * snapshot of the fs tree with special root objectid.
829  *
830  * The reloc_root comes out of here with two references, one for
831  * root->reloc_root, and another for being on the rc->reloc_roots list.
 *
 * @trans:	handle of the transaction the reloc tree is created in
 * @root:	fs/subvolume tree to create (or refresh) a reloc tree for
 *
 * Returns 0 on success (including the no-op cases where no reloc tree
 * is needed) or a negative errno on failure.
832  */
833 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
834                           struct btrfs_root *root)
835 {
836         struct btrfs_fs_info *fs_info = root->fs_info;
837         struct btrfs_root *reloc_root;
838         struct reloc_control *rc = fs_info->reloc_ctl;
839         struct btrfs_block_rsv *rsv;
840         int clear_rsv = 0;
841         int ret;
842
            /* No relocation in progress, nothing to do. */
843         if (!rc)
844                 return 0;
845
846         /*
847          * The subvolume has reloc tree but the swap is finished, no need to
848          * create/update the dead reloc tree
849          */
850         if (reloc_root_is_dead(root))
851                 return 0;
852
853         /*
854          * This is subtle but important.  We do not do
855          * record_root_in_transaction for reloc roots, instead we record their
856          * corresponding fs root, and then here we update the last trans for the
857          * reloc root.  This means that we have to do this for the entire life
858          * of the reloc root, regardless of which stage of the relocation we are
859          * in.
860          */
861         if (root->reloc_root) {
862                 reloc_root = root->reloc_root;
863                 reloc_root->last_trans = trans->transid;
864                 return 0;
865         }
866
867         /*
868          * We are merging reloc roots, we do not need new reloc trees.  Also
869          * reloc trees never need their own reloc tree.
870          */
871         if (!rc->create_reloc_tree ||
872             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
873                 return 0;
874
            /*
             * Temporarily borrow the relocation block reserve for creating the
             * reloc root unless the transaction already carries relocation
             * reservation; the original rsv is restored right after.
             */
875         if (!trans->reloc_reserved) {
876                 rsv = trans->block_rsv;
877                 trans->block_rsv = rc->block_rsv;
878                 clear_rsv = 1;
879         }
880         reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
881         if (clear_rsv)
882                 trans->block_rsv = rsv;
883         if (IS_ERR(reloc_root))
884                 return PTR_ERR(reloc_root);
885
            /* Hand one create_reloc_root() ref over to the rc->reloc_roots list. */
886         ret = __add_reloc_root(reloc_root);
887         ASSERT(ret != -EEXIST);
888         if (ret) {
889                 /* Pairs with create_reloc_root */
890                 btrfs_put_root(reloc_root);
891                 return ret;
892         }
893         root->reloc_root = btrfs_grab_root(reloc_root);
894         return 0;
895 }
896
897 /*
898  * update root item of reloc tree
 *
 * Sync the reloc root's in-memory root item into the tree root.  While
 * reloc trees are being merged and this one is no longer referenced,
 * also detach it from the subvolume and mark the subvolume's reloc tree
 * as dead so it only gets cleaned up later.
 *
 * Returns 0 on success or a negative errno from btrfs_update_root().
899  */
900 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
901                             struct btrfs_root *root)
902 {
903         struct btrfs_fs_info *fs_info = root->fs_info;
904         struct btrfs_root *reloc_root;
905         struct btrfs_root_item *root_item;
906         int ret;
907
908         if (!have_reloc_root(root))
909                 return 0;
910
911         reloc_root = root->reloc_root;
912         root_item = &reloc_root->root_item;
913
914         /*
915          * We are probably ok here, but __del_reloc_root() will drop its ref of
916          * the root.  We have the ref for root->reloc_root, but just in case
917          * hold it while we update the reloc root.
918          */
919         btrfs_grab_root(reloc_root);
920
921         /* root->reloc_root will stay until current relocation finished */
922         if (fs_info->reloc_ctl->merge_reloc_tree &&
923             btrfs_root_refs(root_item) == 0) {
924                 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
925                 /*
926                  * Mark the tree as dead before we change reloc_root so
927                  * have_reloc_root will not touch it from now on.
928                  */
929                 smp_wmb();
930                 __del_reloc_root(reloc_root);
931         }
932
            /* Point the root item at the current root node if it has moved. */
933         if (reloc_root->commit_root != reloc_root->node) {
934                 __update_reloc_root(reloc_root);
935                 btrfs_set_root_node(root_item, reloc_root->node);
936                 free_extent_buffer(reloc_root->commit_root);
937                 reloc_root->commit_root = btrfs_root_node(reloc_root);
938         }
939
940         ret = btrfs_update_root(trans, fs_info->tree_root,
941                                 &reloc_root->root_key, root_item);
942         btrfs_put_root(reloc_root);
943         return ret;
944 }
945
946 /*
947  * helper to find first cached inode with inode number >= objectid
948  * in a subvolume
 *
 * Walks root->inode_tree (an rbtree of cached btrfs_inodes) under
 * root->inode_lock.  Returns a referenced inode (caller must drop the
 * reference) or NULL if no cached inode with number >= @objectid exists.
949  */
950 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
951 {
952         struct rb_node *node;
953         struct rb_node *prev;
954         struct btrfs_inode *entry;
955         struct inode *inode;
956
957         spin_lock(&root->inode_lock);
958 again:
            /* Standard rbtree lookup, remembering the last visited node. */
959         node = root->inode_tree.rb_node;
960         prev = NULL;
961         while (node) {
962                 prev = node;
963                 entry = rb_entry(node, struct btrfs_inode, rb_node);
964
965                 if (objectid < btrfs_ino(entry))
966                         node = node->rb_left;
967                 else if (objectid > btrfs_ino(entry))
968                         node = node->rb_right;
969                 else
970                         break;
971         }
            /*
             * No exact match: advance from the last visited node to the first
             * entry whose inode number is >= objectid.
             */
972         if (!node) {
973                 while (prev) {
974                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
975                         if (objectid <= btrfs_ino(entry)) {
976                                 node = prev;
977                                 break;
978                         }
979                         prev = rb_next(prev);
980                 }
981         }
982         while (node) {
983                 entry = rb_entry(node, struct btrfs_inode, rb_node);
                    /* igrab() fails for inodes being evicted; skip those. */
984                 inode = igrab(&entry->vfs_inode);
985                 if (inode) {
986                         spin_unlock(&root->inode_lock);
987                         return inode;
988                 }
989
990                 objectid = btrfs_ino(entry) + 1;
                    /* Lock was dropped to reschedule; the tree may have changed. */
991                 if (cond_resched_lock(&root->inode_lock))
992                         goto again;
993
994                 node = rb_next(node);
995         }
996         spin_unlock(&root->inode_lock);
997         return NULL;
998 }
999
1000 /*
1001  * get new location of data
1002  */
1003 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1004                             u64 bytenr, u64 num_bytes)
1005 {
1006         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1007         struct btrfs_path *path;
1008         struct btrfs_file_extent_item *fi;
1009         struct extent_buffer *leaf;
1010         int ret;
1011
1012         path = btrfs_alloc_path();
1013         if (!path)
1014                 return -ENOMEM;
1015
1016         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1017         ret = btrfs_lookup_file_extent(NULL, root, path,
1018                         btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1019         if (ret < 0)
1020                 goto out;
1021         if (ret > 0) {
1022                 ret = -ENOENT;
1023                 goto out;
1024         }
1025
1026         leaf = path->nodes[0];
1027         fi = btrfs_item_ptr(leaf, path->slots[0],
1028                             struct btrfs_file_extent_item);
1029
1030         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1031                btrfs_file_extent_compression(leaf, fi) ||
1032                btrfs_file_extent_encryption(leaf, fi) ||
1033                btrfs_file_extent_other_encoding(leaf, fi));
1034
1035         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1036                 ret = -EINVAL;
1037                 goto out;
1038         }
1039
1040         *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1041         ret = 0;
1042 out:
1043         btrfs_free_path(path);
1044         return ret;
1045 }
1046
1047 /*
1048  * update file extent items in the tree leaf to point to
1049  * the new locations.
 *
 * For every file extent in @leaf that lies inside the block group being
 * relocated, look up where its data was copied (via rc->data_inode),
 * rewrite the disk bytenr, and add/drop the corresponding extent refs.
 *
 * Returns 0 on success or a negative errno; once references have been
 * modified, a failure aborts the transaction before returning.
1050  */
1051 static noinline_for_stack
1052 int replace_file_extents(struct btrfs_trans_handle *trans,
1053                          struct reloc_control *rc,
1054                          struct btrfs_root *root,
1055                          struct extent_buffer *leaf)
1056 {
1057         struct btrfs_fs_info *fs_info = root->fs_info;
1058         struct btrfs_key key;
1059         struct btrfs_file_extent_item *fi;
1060         struct inode *inode = NULL;
1061         u64 parent;
1062         u64 bytenr;
1063         u64 new_bytenr = 0;
1064         u64 num_bytes;
1065         u64 end;
1066         u32 nritems;
1067         u32 i;
1068         int ret = 0;
1069         int first = 1;
1070         int dirty = 0;
1071
             /* Data pointers are only rewritten in the UPDATE_DATA_PTRS stage. */
1072         if (rc->stage != UPDATE_DATA_PTRS)
1073                 return 0;
1074
1075         /* reloc trees always use full backref */
1076         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1077                 parent = leaf->start;
1078         else
1079                 parent = 0;
1080
1081         nritems = btrfs_header_nritems(leaf);
1082         for (i = 0; i < nritems; i++) {
1083                 struct btrfs_ref ref = { 0 };
1084
1085                 cond_resched();
1086                 btrfs_item_key_to_cpu(leaf, &key, i);
1087                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1088                         continue;
1089                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1090                 if (btrfs_file_extent_type(leaf, fi) ==
1091                     BTRFS_FILE_EXTENT_INLINE)
1092                         continue;
1093                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1094                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1095                 if (bytenr == 0)
1096                         continue;
                     /* Only extents inside the relocated block group move. */
1097                 if (!in_range(bytenr, rc->block_group->start,
1098                               rc->block_group->length))
1099                         continue;
1100
1101                 /*
1102                  * if we are modifying block in fs tree, wait for readpage
1103                  * to complete and drop the extent cache
1104                  */
1105                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
                             /* Items are key-ordered, so the inode cursor only moves forward. */
1106                         if (first) {
1107                                 inode = find_next_inode(root, key.objectid);
1108                                 first = 0;
1109                         } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1110                                 btrfs_add_delayed_iput(inode);
1111                                 inode = find_next_inode(root, key.objectid);
1112                         }
1113                         if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1114                                 end = key.offset +
1115                                       btrfs_file_extent_num_bytes(leaf, fi);
1116                                 WARN_ON(!IS_ALIGNED(key.offset,
1117                                                     fs_info->sectorsize));
1118                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1119                                 end--;
1120                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1121                                                       key.offset, end);
1122                                 if (!ret)
1123                                         continue;
1124
1125                                 btrfs_drop_extent_cache(BTRFS_I(inode),
1126                                                 key.offset,     end, 1);
1127                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1128                                               key.offset, end);
1129                         }
1130                 }
1131
1132                 ret = get_new_location(rc->data_inode, &new_bytenr,
1133                                        bytenr, num_bytes);
1134                 if (ret) {
1135                         /*
1136                          * Don't have to abort since we've not changed anything
1137                          * in the file extent yet.
1138                          */
1139                         break;
1140                 }
1141
1142                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1143                 dirty = 1;
1144
                     /* Ref bookkeeping: add a ref for the new extent ... */
1145                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1146                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1147                                        num_bytes, parent);
1148                 ref.real_root = root->root_key.objectid;
1149                 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1150                                     key.objectid, key.offset,
1151                                     root->root_key.objectid, false);
1152                 ret = btrfs_inc_extent_ref(trans, &ref);
1153                 if (ret) {
1154                         btrfs_abort_transaction(trans, ret);
1155                         break;
1156                 }
1157
                     /* ... and drop the ref on the old one. */
1158                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1159                                        num_bytes, parent);
1160                 ref.real_root = root->root_key.objectid;
1161                 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1162                                     key.objectid, key.offset,
1163                                     root->root_key.objectid, false);
1164                 ret = btrfs_free_extent(trans, &ref);
1165                 if (ret) {
1166                         btrfs_abort_transaction(trans, ret);
1167                         break;
1168                 }
1169         }
1170         if (dirty)
1171                 btrfs_mark_buffer_dirty(leaf);
1172         if (inode)
1173                 btrfs_add_delayed_iput(inode);
1174         return ret;
1175 }
1176
1177 static noinline_for_stack
1178 int memcmp_node_keys(struct extent_buffer *eb, int slot,
1179                      struct btrfs_path *path, int level)
1180 {
1181         struct btrfs_disk_key key1;
1182         struct btrfs_disk_key key2;
1183         btrfs_node_key(eb, &key1, slot);
1184         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1185         return memcmp(&key1, &key2, sizeof(key1));
1186 }
1187
1188 /*
1189  * try to replace tree blocks in fs tree with the new blocks
1190  * in reloc tree. tree blocks haven't been modified since the
1191  * reloc tree was create can be replaced.
1192  *
1193  * if a block was replaced, level of the block + 1 is returned.
1194  * if no block got replaced, 0 is returned. if there are other
1195  * errors, a negative error number is returned.
 *
 * @dest:	fs tree to replace blocks in
 * @src:	reloc tree supplying the new blocks
 * @path:	path into @src positioned at the candidate block
 * @next_key:	if non-NULL, set to the key following the replaced range
 * @lowest_level / @max_level: bounds of the levels considered for a swap
1196  */
1197 static noinline_for_stack
1198 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1199                  struct btrfs_root *dest, struct btrfs_root *src,
1200                  struct btrfs_path *path, struct btrfs_key *next_key,
1201                  int lowest_level, int max_level)
1202 {
1203         struct btrfs_fs_info *fs_info = dest->fs_info;
1204         struct extent_buffer *eb;
1205         struct extent_buffer *parent;
1206         struct btrfs_ref ref = { 0 };
1207         struct btrfs_key key;
1208         u64 old_bytenr;
1209         u64 new_bytenr;
1210         u64 old_ptr_gen;
1211         u64 new_ptr_gen;
1212         u64 last_snapshot;
1213         u32 blocksize;
1214         int cow = 0;
1215         int level;
1216         int ret;
1217         int slot;
1218
1219         ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1220         ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1221
1222         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1223 again:
             /* Key of the block in the reloc tree we want to swap in. */
1224         slot = path->slots[lowest_level];
1225         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1226
1227         eb = btrfs_lock_root_node(dest);
1228         level = btrfs_header_level(eb);
1229
1230         if (level < lowest_level) {
1231                 btrfs_tree_unlock(eb);
1232                 free_extent_buffer(eb);
1233                 return 0;
1234         }
1235
             /* On the second pass (cow == 1) COW blocks as we walk down. */
1236         if (cow) {
1237                 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
1238                                       BTRFS_NESTING_COW);
1239                 if (ret) {
1240                         btrfs_tree_unlock(eb);
1241                         free_extent_buffer(eb);
1242                         return ret;
1243                 }
1244         }
1245
1246         if (next_key) {
1247                 next_key->objectid = (u64)-1;
1248                 next_key->type = (u8)-1;
1249                 next_key->offset = (u64)-1;
1250         }
1251
1252         parent = eb;
1253         while (1) {
1254                 level = btrfs_header_level(parent);
1255                 ASSERT(level >= lowest_level);
1256
1257                 ret = btrfs_bin_search(parent, &key, &slot);
1258                 if (ret < 0)
1259                         break;
1260                 if (ret && slot > 0)
1261                         slot--;
1262
1263                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1264                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1265
1266                 old_bytenr = btrfs_node_blockptr(parent, slot);
1267                 blocksize = fs_info->nodesize;
1268                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1269
1270                 if (level <= max_level) {
1271                         eb = path->nodes[level];
1272                         new_bytenr = btrfs_node_blockptr(eb,
1273                                                         path->slots[level]);
1274                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1275                                                         path->slots[level]);
1276                 } else {
1277                         new_bytenr = 0;
1278                         new_ptr_gen = 0;
1279                 }
1280
1281                 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1282                         ret = level;
1283                         break;
1284                 }
1285
                     /*
                      * The block at this level cannot be swapped (modified
                      * after the snapshot, or keys don't line up); keep
                      * descending toward lowest_level.
                      */
1286                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1287                     memcmp_node_keys(parent, slot, path, level)) {
1288                         if (level <= lowest_level) {
1289                                 ret = 0;
1290                                 break;
1291                         }
1292
1293                         eb = btrfs_read_node_slot(parent, slot);
1294                         if (IS_ERR(eb)) {
1295                                 ret = PTR_ERR(eb);
1296                                 break;
1297                         }
1298                         btrfs_tree_lock(eb);
1299                         if (cow) {
1300                                 ret = btrfs_cow_block(trans, dest, eb, parent,
1301                                                       slot, &eb,
1302                                                       BTRFS_NESTING_COW);
1303                                 if (ret) {
1304                                         btrfs_tree_unlock(eb);
1305                                         free_extent_buffer(eb);
1306                                         break;
1307                                 }
1308                         }
1309
1310                         btrfs_tree_unlock(parent);
1311                         free_extent_buffer(parent);
1312
1313                         parent = eb;
1314                         continue;
1315                 }
1316
                     /* Found a swappable block; restart and COW the path to it. */
1317                 if (!cow) {
1318                         btrfs_tree_unlock(parent);
1319                         free_extent_buffer(parent);
1320                         cow = 1;
1321                         goto again;
1322                 }
1323
1324                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1325                                       path->slots[level]);
1326                 btrfs_release_path(path);
1327
1328                 path->lowest_level = level;
1329                 set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
1330                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1331                 clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
1332                 path->lowest_level = 0;
1333                 if (ret) {
1334                         if (ret > 0)
1335                                 ret = -ENOENT;
1336                         break;
1337                 }
1338
1339                 /*
1340                  * Info qgroup to trace both subtrees.
1341                  *
1342                  * We must trace both trees.
1343                  * 1) Tree reloc subtree
1344                  *    If not traced, we will leak data numbers
1345                  * 2) Fs subtree
1346                  *    If not traced, we will double count old data
1347                  *
1348                  * We don't scan the subtree right now, but only record
1349                  * the swapped tree blocks.
1350                  * The real subtree rescan is delayed until we have new
1351                  * CoW on the subtree root node before transaction commit.
1352                  */
1353                 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1354                                 rc->block_group, parent, slot,
1355                                 path->nodes[level], path->slots[level],
1356                                 last_snapshot);
1357                 if (ret < 0)
1358                         break;
1359                 /*
1360                  * swap blocks in fs tree and reloc tree.
1361                  */
1362                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1363                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1364                 btrfs_mark_buffer_dirty(parent);
1365
1366                 btrfs_set_node_blockptr(path->nodes[level],
1367                                         path->slots[level], old_bytenr);
1368                 btrfs_set_node_ptr_generation(path->nodes[level],
1369                                               path->slots[level], old_ptr_gen);
1370                 btrfs_mark_buffer_dirty(path->nodes[level]);
1371
                     /*
                      * Add refs for both swapped blocks in both trees first,
                      * then drop the stale refs.
                      */
1372                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
1373                                        blocksize, path->nodes[level]->start);
1374                 ref.skip_qgroup = true;
1375                 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
1376                                     0, true);
1377                 ret = btrfs_inc_extent_ref(trans, &ref);
1378                 if (ret) {
1379                         btrfs_abort_transaction(trans, ret);
1380                         break;
1381                 }
1382                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1383                                        blocksize, 0);
1384                 ref.skip_qgroup = true;
1385                 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
1386                                     true);
1387                 ret = btrfs_inc_extent_ref(trans, &ref);
1388                 if (ret) {
1389                         btrfs_abort_transaction(trans, ret);
1390                         break;
1391                 }
1392
1393                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
1394                                        blocksize, path->nodes[level]->start);
1395                 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
1396                                     0, true);
1397                 ref.skip_qgroup = true;
1398                 ret = btrfs_free_extent(trans, &ref);
1399                 if (ret) {
1400                         btrfs_abort_transaction(trans, ret);
1401                         break;
1402                 }
1403
1404                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
1405                                        blocksize, 0);
1406                 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
1407                                     0, true);
1408                 ref.skip_qgroup = true;
1409                 ret = btrfs_free_extent(trans, &ref);
1410                 if (ret) {
1411                         btrfs_abort_transaction(trans, ret);
1412                         break;
1413                 }
1414
1415                 btrfs_unlock_up_safe(path, 0);
1416
1417                 ret = level;
1418                 break;
1419         }
1420         btrfs_tree_unlock(parent);
1421         free_extent_buffer(parent);
1422         return ret;
1423 }
1424
1425 /*
1426  * helper to find next relocated block in reloc tree
1427  */
1428 static noinline_for_stack
1429 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1430                        int *level)
1431 {
1432         struct extent_buffer *eb;
1433         int i;
1434         u64 last_snapshot;
1435         u32 nritems;
1436
1437         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1438
1439         for (i = 0; i < *level; i++) {
1440                 free_extent_buffer(path->nodes[i]);
1441                 path->nodes[i] = NULL;
1442         }
1443
1444         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1445                 eb = path->nodes[i];
1446                 nritems = btrfs_header_nritems(eb);
1447                 while (path->slots[i] + 1 < nritems) {
1448                         path->slots[i]++;
1449                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1450                             last_snapshot)
1451                                 continue;
1452
1453                         *level = i;
1454                         return 0;
1455                 }
1456                 free_extent_buffer(path->nodes[i]);
1457                 path->nodes[i] = NULL;
1458         }
1459         return 1;
1460 }
1461
1462 /*
1463  * walk down reloc tree to find relocated block of lowest level
 *
 * Starting at *level, descend while skipping pointers whose generation
 * is at or before the last snapshot (those blocks were not relocated).
 * Returns 0 with *level updated when a position to process is found,
 * 1 when the subtree at the starting level is exhausted, or a negative
 * errno if reading a node fails.
1464  */
1465 static noinline_for_stack
1466 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1467                          int *level)
1468 {
1469         struct extent_buffer *eb = NULL;
1470         int i;
1471         u64 ptr_gen = 0;
1472         u64 last_snapshot;
1473         u32 nritems;
1474
1475         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1476
1477         for (i = *level; i > 0; i--) {
1478                 eb = path->nodes[i];
1479                 nritems = btrfs_header_nritems(eb);
                     /* Skip pointers that predate the reloc tree snapshot. */
1480                 while (path->slots[i] < nritems) {
1481                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1482                         if (ptr_gen > last_snapshot)
1483                                 break;
1484                         path->slots[i]++;
1485                 }
                     /* Node exhausted: done at the start level, else resume above. */
1486                 if (path->slots[i] >= nritems) {
1487                         if (i == *level)
1488                                 break;
1489                         *level = i + 1;
1490                         return 0;
1491                 }
1492                 if (i == 1) {
1493                         *level = i;
1494                         return 0;
1495                 }
1496
1497                 eb = btrfs_read_node_slot(eb, path->slots[i]);
1498                 if (IS_ERR(eb))
1499                         return PTR_ERR(eb);
1500                 BUG_ON(btrfs_header_level(eb) != i - 1);
1501                 path->nodes[i - 1] = eb;
1502                 path->slots[i - 1] = 0;
1503         }
1504         return 1;
1505 }
1506
1507 /*
1508  * invalidate extent cache for file extents whose key in range of
1509  * [min_key, max_key)
 *
 * Iterates over the cached inodes of @root whose numbers fall in the key
 * range and drops their extent cache for the affected byte ranges,
 * locking each range first so in-flight reads finish.  Always returns 0.
1510  */
1511 static int invalidate_extent_cache(struct btrfs_root *root,
1512                                    struct btrfs_key *min_key,
1513                                    struct btrfs_key *max_key)
1514 {
1515         struct btrfs_fs_info *fs_info = root->fs_info;
1516         struct inode *inode = NULL;
1517         u64 objectid;
1518         u64 start, end;
1519         u64 ino;
1520
1521         objectid = min_key->objectid;
1522         while (1) {
1523                 cond_resched();
                     /* Drop the previous iteration's inode (iput(NULL) is a no-op). */
1524                 iput(inode);
1525
1526                 if (objectid > max_key->objectid)
1527                         break;
1528
1529                 inode = find_next_inode(root, objectid);
1530                 if (!inode)
1531                         break;
1532                 ino = btrfs_ino(BTRFS_I(inode));
1533
1534                 if (ino > max_key->objectid) {
1535                         iput(inode);
1536                         break;
1537                 }
1538
1539                 objectid = ino + 1;
1540                 if (!S_ISREG(inode->i_mode))
1541                         continue;
1542
                     /* Clamp the start offset when this inode sits on the lower bound. */
1543                 if (unlikely(min_key->objectid == ino)) {
1544                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1545                                 continue;
1546                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1547                                 start = 0;
1548                         else {
1549                                 start = min_key->offset;
1550                                 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
1551                         }
1552                 } else {
1553                         start = 0;
1554                 }
1555
                     /* Clamp the end offset when this inode sits on the upper bound. */
1556                 if (unlikely(max_key->objectid == ino)) {
1557                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1558                                 continue;
1559                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1560                                 end = (u64)-1;
1561                         } else {
1562                                 if (max_key->offset == 0)
1563                                         continue;
1564                                 end = max_key->offset;
1565                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1566                                 end--;
1567                         }
1568                 } else {
1569                         end = (u64)-1;
1570                 }
1571
1572                 /* the lock_extent waits for readpage to complete */
1573                 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
1574                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
1575                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
1576         }
1577         return 0;
1578 }
1579
1580 static int find_next_key(struct btrfs_path *path, int level,
1581                          struct btrfs_key *key)
1582
1583 {
1584         while (level < BTRFS_MAX_LEVEL) {
1585                 if (!path->nodes[level])
1586                         break;
1587                 if (path->slots[level] + 1 <
1588                     btrfs_header_nritems(path->nodes[level])) {
1589                         btrfs_node_key_to_cpu(path->nodes[level], key,
1590                                               path->slots[level] + 1);
1591                         return 0;
1592                 }
1593                 level++;
1594         }
1595         return 1;
1596 }
1597
/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 *
 * Also reset the reloc root's drop progress and root refs so that a later
 * (or resumed) cleanup can drop the reloc tree from the beginning.
 *
 * Returns 0 on success or a negative errno from btrfs_update_reloc_root().
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	/*
	 * Clear the drop progress and set refs to 0 before writing the root
	 * item back, so the reloc root is marked as fully merged and ready
	 * to be dropped.
	 */
	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
		sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	/* Queue @root for clean_dirty_subvols(), taking a ref for the list. */
	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}
1629
/*
 * Clean up all roots queued on reloc_control::dirty_subvol_roots.
 *
 * The list can contain two kinds of entries:
 * - merged subvolume roots, whose attached reloc root gets dropped
 * - orphan reloc roots, which are dropped directly
 *
 * Returns 0 on success, or the first error hit while dropping snapshots
 * (cleanup still continues for the remaining entries).
 */
static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root.  If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			/* Drop the ref taken by insert_dirty_subvol(). */
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}
1677
/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 *
 * Walks the reloc tree top-down, swapping relocated tree blocks into the
 * fs tree via replace_path(), committing progress into the reloc root's
 * drop_progress so an interrupted merge can be resumed.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		/* Fresh merge: start from the reloc root node. */
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		/*
		 * Resumed merge: re-search the key saved in drop_progress and
		 * continue from there.
		 */
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * and * 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		/* One transaction per iteration keeps reservations bounded. */
		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list.  At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		reloc_root->last_trans = trans->transid;
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		/*
		 * Skip replace_path() if the next key is still covered by
		 * the subtree already merged (next_key >= key).
		 */
		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			/* A positive return is the level where the swap happened. */
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case only one block in the fs tree need to be
	 * relocated and the block is tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	/* Only queue the subvolume for cleanup if the whole merge succeeded. */
	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}
1845
/*
 * Prepare to start merging the reloc trees back into their fs trees.
 *
 * Reserves metadata space for the merge, sets root refs of every reloc root
 * to 1 (so btrfs_recover_relocation() knows merging should be resumed after
 * a crash), and commits the transaction.
 *
 * @err: an error carried in from the caller; when non-zero, space
 *       reservation and the refs update are skipped but the reloc roots are
 *       still put back on the list for cleanup.
 *
 * Returns @err, or a new error encountered here.
 */
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		/*
		 * merging_rsv_size may have grown while we were reserving;
		 * release and retry with the new size if so.
		 */
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
				false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}
		ASSERT(root->reloc_root == reloc_root);

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resumes merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	/* Put the processed (and any unprocessed) reloc roots back. */
	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}
1941
1942 static noinline_for_stack
1943 void free_reloc_roots(struct list_head *list)
1944 {
1945         struct btrfs_root *reloc_root, *tmp;
1946
1947         list_for_each_entry_safe(reloc_root, tmp, list, root_list)
1948                 __del_reloc_root(reloc_root);
1949 }
1950
/*
 * Merge all reloc roots in reloc_control::reloc_roots back into their fs
 * trees, or queue them for cleanup if their refs are already 0 (orphans).
 *
 * Loops until no reloc root remains; errors are reported via
 * btrfs_handle_fs_error() and the remaining reloc roots are freed.
 */
static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * this serializes us with btrfs_record_root_in_transaction,
	 * we have to make sure nobody is in the middle of
	 * adding their roots to the list while we are
	 * doing this splice
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			/* Live reloc root: merge it into its fs tree. */
			if (IS_ERR(root)) {
				/*
				 * For recovery we read the fs roots on mount,
				 * and if we didn't find the root then we marked
				 * the reloc root as a garbage root.  For normal
				 * relocation obviously the root should exist in
				 * memory.  However there's no reason we can't
				 * handle the error properly here just in case.
				 */
				ASSERT(0);
				ret = PTR_ERR(root);
				goto out;
			}
			if (root->reloc_root != reloc_root) {
				/*
				 * This is actually impossible without something
				 * going really wrong (like weird race condition
				 * or cosmic rays).
				 */
				ASSERT(0);
				ret = -EINVAL;
				goto out;
			}
			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				/* Put it back so the error path can clean it up. */
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			/* Orphan reloc root (refs == 0): detach and queue for drop. */
			if (!IS_ERR(root)) {
				if (root->reloc_root == reloc_root) {
					root->reloc_root = NULL;
					btrfs_put_root(reloc_root);
				}
				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
					  &root->state);
				btrfs_put_root(root);
			}

			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

	/* New reloc roots may have been added while we merged; go again. */
	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		free_reloc_roots(&reloc_roots);

		/* new reloc root may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong.  If we fail to start the transaction in
	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
	 * have actually been removed from the reloc_root_tree rb tree.  This is
	 * fine because we're bailing here, and we hold a reference on the root
	 * for the list that holds it, so these roots will be cleaned up when we
	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
	 * will be cleaned up on unmount.
	 *
	 * The remaining nodes will be cleaned up by free_reloc_control.
	 */
}
2062
2063 static void free_block_list(struct rb_root *blocks)
2064 {
2065         struct tree_block *block;
2066         struct rb_node *rb_node;
2067         while ((rb_node = rb_first(blocks))) {
2068                 block = rb_entry(rb_node, struct tree_block, rb_node);
2069                 rb_erase(rb_node, blocks);
2070                 kfree(block);
2071         }
2072 }
2073
2074 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2075                                       struct btrfs_root *reloc_root)
2076 {
2077         struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2078         struct btrfs_root *root;
2079         int ret;
2080
2081         if (reloc_root->last_trans == trans->transid)
2082                 return 0;
2083
2084         root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
2085
2086         /*
2087          * This should succeed, since we can't have a reloc root without having
2088          * already looked up the actual root and created the reloc root for this
2089          * root.
2090          *
2091          * However if there's some sort of corruption where we have a ref to a
2092          * reloc root without a corresponding root this could return ENOENT.
2093          */
2094         if (IS_ERR(root)) {
2095                 ASSERT(0);
2096                 return PTR_ERR(root);
2097         }
2098         if (root->reloc_root != reloc_root) {
2099                 ASSERT(0);
2100                 btrfs_err(fs_info,
2101                           "root %llu has two reloc roots associated with it",
2102                           reloc_root->root_key.offset);
2103                 btrfs_put_root(root);
2104                 return -EUCLEAN;
2105         }
2106         ret = btrfs_record_root_in_trans(trans, root);
2107         btrfs_put_root(root);
2108
2109         return ret;
2110 }
2111
/*
 * Walk up the backref chain from @node and select the reloc root whose tree
 * contains the block, recording every root crossed in the current
 * transaction.  Also fills reloc_control::backref_cache.path for
 * btrfs_reloc_cow_block().
 *
 * Returns the selected root, or an ERR_PTR on corruption/lookup failure.
 */
static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
				     struct reloc_control *rc,
				     struct btrfs_backref_node *node,
				     struct btrfs_backref_edge *edges[])
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	int index = 0;
	int ret;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;

		/*
		 * If there is no root, then our references for this block are
		 * incomplete, as we should be able to walk all the way up to a
		 * block that is owned by a root.
		 *
		 * This path is only for SHAREABLE roots, so if we come upon a
		 * non-SHAREABLE root then we have backrefs that resolve
		 * improperly.
		 *
		 * Both of these cases indicate file system corruption, or a bug
		 * in the backref walking code.
		 */
		if (!root) {
			ASSERT(0);
			btrfs_err(trans->fs_info,
		"bytenr %llu doesn't have a backref path ending in a root",
				  node->bytenr);
			return ERR_PTR(-EUCLEAN);
		}
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
			ASSERT(0);
			btrfs_err(trans->fs_info,
	"bytenr %llu has multiple refs with one ending in a non-shareable root",
				  node->bytenr);
			return ERR_PTR(-EUCLEAN);
		}

		/* Reached a reloc tree root: record it and we're done. */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
			ret = record_reloc_root_in_trans(trans, root);
			if (ret)
				return ERR_PTR(ret);
			break;
		}

		ret = btrfs_record_root_in_trans(trans, root);
		if (ret)
			return ERR_PTR(ret);
		root = root->reloc_root;

		/*
		 * We could have raced with another thread which failed, so
		 * root->reloc_root may not be set, return ENOENT in this case.
		 */
		if (!root)
			return ERR_PTR(-ENOENT);

		if (next->new_bytenr != root->node->start) {
			/*
			 * We just created the reloc root, so we shouldn't have
			 * ->new_bytenr set and this shouldn't be in the changed
			 *  list.  If it is then we have multiple roots pointing
			 *  at the same bytenr which indicates corruption, or
			 *  we've made a mistake in the backref walking code.
			 */
			ASSERT(next->new_bytenr == 0);
			ASSERT(list_empty(&next->list));
			if (next->new_bytenr || !list_empty(&next->list)) {
				btrfs_err(trans->fs_info,
	"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
					  node->bytenr, next->bytenr);
				return ERR_PTR(-EUCLEAN);
			}

			/* Point the node at the freshly created reloc root. */
			next->new_bytenr = root->node->start;
			btrfs_put_root(next->root);
			next->root = btrfs_grab_root(root);
			ASSERT(next->root);
			list_add_tail(&next->list,
				      &rc->backref_cache.changed);
			mark_block_processed(rc, next);
			break;
		}

		WARN_ON(1);
		root = NULL;
		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}
	if (!root) {
		/*
		 * This can happen if there's fs corruption or if there's a bug
		 * in the backref lookup code.
		 */
		ASSERT(0);
		return ERR_PTR(-ENOENT);
	}

	next = node;
	/* setup backref node path for btrfs_reloc_cow_block */
	while (1) {
		rc->backref_cache.path[next->level] = next;
		if (--index < 0)
			break;
		next = edges[index]->node[UPPER];
	}
	return root;
}
2227
/*
 * Select a tree root for relocation.
 *
 * Return NULL if the block is not shareable. We should use do_relocation() in
 * this case.
 *
 * Return a tree root pointer if the block is shareable.
 * Return -ENOENT if the block is root of reloc tree.
 * Return -EUCLEAN if a backref path has no root at its top (incomplete
 * extent refs, i.e. corruption).
 */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	struct btrfs_root *fs_root = NULL;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;

		/*
		 * This can occur if we have incomplete extent refs leading all
		 * the way up a particular path, in this case return -EUCLEAN.
		 */
		if (!root)
			return ERR_PTR(-EUCLEAN);

		/* No other choice for non-shareable tree */
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
			return root;

		/* Remember the last non-reloc root we walked through. */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		/* @node itself is referenced by an upper node: shareable. */
		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	/* Only reloc roots reference the block: it's a reloc tree root. */
	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}
2278
/*
 * Calculate how many bytes of metadata are needed for relocating @node.
 *
 * Walks all backref paths upwards from @node, counting one nodesize per
 * not-yet-processed node encountered (the starting @node itself is counted
 * when @reserve is set, even though unprocessed upper nodes are what's
 * mostly being measured).
 *
 * @reserve: non-zero when sizing a reservation for a block that hasn't been
 *           processed yet (asserted via BUG_ON).
 */
static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
			struct btrfs_backref_node *node, int reserve)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(reserve && node->processed);

	while (next) {
		cond_resched();
		while (1) {
			/* Stop climbing once we hit an already-processed node. */
			if (next->processed && (reserve || next != node))
				break;

			num_bytes += fs_info->nodesize;

			if (list_empty(&next->upper))
				break;

			/* Follow the first upper edge; siblings via walk_down. */
			edge = list_entry(next->upper.next,
					struct btrfs_backref_edge, list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}
2312
/*
 * Reserve metadata space for relocating @node and all the tree blocks its
 * relocation will touch.
 *
 * Returns 0 on success, or -EAGAIN when the limited in-transaction flushing
 * cannot satisfy the reservation; in that case rc->block_rsv->size is grown
 * so the caller can retry after ending the transaction.
 */
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 num_bytes;
	int ret;
	u64 tmp;

	/* x2: both the reloc tree and the fs tree blocks get COWed. */
	num_bytes = calcu_metadata_size(rc, node, 1) * 2;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;

	/*
	 * We are under a transaction here so we can only do limited flushing.
	 * If we get an enospc just kick back -EAGAIN so we know to drop the
	 * transaction and try to refill when we can flush all the things.
	 */
	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
				BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret) {
		/* Grow the target size past reserved_bytes in power-of-two steps. */
		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
		while (tmp <= rc->reserved_bytes)
			tmp <<= 1;
		/*
		 * only one thread can access block_rsv at this point,
		 * so we don't need hold lock to protect block_rsv.
		 * we expand more reservation size here to allow enough
		 * space for relocation and we will return earlier in
		 * enospc case.
		 */
		rc->block_rsv->size = tmp + fs_info->nodesize *
				      RELOCATION_RESERVED_NODES;
		return -EAGAIN;
	}

	return 0;
}
2353
/*
 * relocate a block tree, and then update pointers in upper level
 * blocks that reference the block to point to the new location.
 *
 * if called by link_to_upper, the block has already been relocated.
 * in that case this function just updates pointers.
 */
static int do_relocation(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_key *key,
			 struct btrfs_path *path, int lowest)
{
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	struct btrfs_root *root;
	struct extent_buffer *eb;
	u32 blocksize;
	u64 bytenr;
	int slot;
	int ret = 0;

	/*
	 * If we are lowest then this is the first time we're processing this
	 * block, and thus shouldn't have an eb associated with it yet.
	 */
	ASSERT(!lowest || !node->eb);

	path->lowest_level = node->level + 1;
	rc->backref_cache.path[node->level] = node;
	/* Visit every parent that references this block and fix its pointer. */
	list_for_each_entry(edge, &node->upper, list[LOWER]) {
		struct btrfs_ref ref = { 0 };

		cond_resched();

		upper = edge->node[UPPER];
		root = select_reloc_root(trans, rc, upper, edges);
		if (IS_ERR(root)) {
			ret = PTR_ERR(root);
			goto next;
		}

		if (upper->eb && !upper->locked) {
			if (!lowest) {
				/* Parent already points at the new location? */
				ret = btrfs_bin_search(upper->eb, key, &slot);
				if (ret < 0)
					goto next;
				BUG_ON(ret);
				bytenr = btrfs_node_blockptr(upper->eb, slot);
				if (node->eb->start == bytenr)
					goto next;
			}
			/* Drop the unlocked buffer; we re-read it locked below. */
			btrfs_backref_drop_node_buffer(upper);
		}

		if (!upper->eb) {
			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
			if (ret) {
				/* ret > 0: key not found — treat as missing. */
				if (ret > 0)
					ret = -ENOENT;

				btrfs_release_path(path);
				break;
			}

			if (!upper->eb) {
				/* Steal the locked buffer from the path. */
				upper->eb = path->nodes[upper->level];
				path->nodes[upper->level] = NULL;
			} else {
				BUG_ON(upper->eb != path->nodes[upper->level]);
			}

			/* We keep the lock; tell the path not to drop it. */
			upper->locked = 1;
			path->locks[upper->level] = 0;

			slot = path->slots[upper->level];
			btrfs_release_path(path);
		} else {
			ret = btrfs_bin_search(upper->eb, key, &slot);
			if (ret < 0)
				goto next;
			BUG_ON(ret);
		}

		bytenr = btrfs_node_blockptr(upper->eb, slot);
		if (lowest) {
			/* First pass: the parent must still reference the old block. */
			if (bytenr != node->bytenr) {
				btrfs_err(root->fs_info,
		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
					  bytenr, node->bytenr, slot,
					  upper->eb->start);
				ret = -EIO;
				goto next;
			}
		} else {
			if (node->eb->start == bytenr)
				goto next;
		}

		blocksize = root->fs_info->nodesize;
		eb = btrfs_read_node_slot(upper->eb, slot);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			goto next;
		}
		btrfs_tree_lock(eb);

		if (!node->eb) {
			/* First visit: COW the child, which relocates it. */
			ret = btrfs_cow_block(trans, root, eb, upper->eb,
					      slot, &eb, BTRFS_NESTING_COW);
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			if (ret < 0)
				goto next;
			/*
			 * We've just COWed this block, it should have updated
			 * the correct backref node entry.
			 */
			ASSERT(node->eb == eb);
		} else {
			/* Already relocated: repoint the parent slot and fix refs. */
			btrfs_set_node_blockptr(upper->eb, slot,
						node->eb->start);
			btrfs_set_node_ptr_generation(upper->eb, slot,
						      trans->transid);
			btrfs_mark_buffer_dirty(upper->eb);

			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
					       node->eb->start, blocksize,
					       upper->eb->start);
			ref.real_root = root->root_key.objectid;
			btrfs_init_tree_ref(&ref, node->level,
					    btrfs_header_owner(upper->eb),
					    root->root_key.objectid, false);
			ret = btrfs_inc_extent_ref(trans, &ref);
			if (!ret)
				ret = btrfs_drop_subtree(trans, root, eb,
							 upper->eb);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
next:
		/* Release the parent buffer unless a pending node still needs it. */
		if (!upper->pending)
			btrfs_backref_drop_node_buffer(upper);
		else
			btrfs_backref_unlock_node_buffer(upper);
		if (ret)
			break;
	}

	if (!ret && node->pending) {
		/* All parents updated; this pending node is now fully changed. */
		btrfs_backref_drop_node_buffer(node);
		list_move_tail(&node->list, &rc->backref_cache.changed);
		node->pending = 0;
	}

	path->lowest_level = 0;

	/*
	 * We should have allocated all of our space in the block rsv and thus
	 * shouldn't ENOSPC.
	 */
	ASSERT(ret != -ENOSPC);
	return ret;
}
2519
2520 static int link_to_upper(struct btrfs_trans_handle *trans,
2521                          struct reloc_control *rc,
2522                          struct btrfs_backref_node *node,
2523                          struct btrfs_path *path)
2524 {
2525         struct btrfs_key key;
2526
2527         btrfs_node_key_to_cpu(node->eb, &key, 0);
2528         return do_relocation(trans, rc, node, &key, path, 0);
2529 }
2530
2531 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2532                                 struct reloc_control *rc,
2533                                 struct btrfs_path *path, int err)
2534 {
2535         LIST_HEAD(list);
2536         struct btrfs_backref_cache *cache = &rc->backref_cache;
2537         struct btrfs_backref_node *node;
2538         int level;
2539         int ret;
2540
2541         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2542                 while (!list_empty(&cache->pending[level])) {
2543                         node = list_entry(cache->pending[level].next,
2544                                           struct btrfs_backref_node, list);
2545                         list_move_tail(&node->list, &list);
2546                         BUG_ON(!node->pending);
2547
2548                         if (!err) {
2549                                 ret = link_to_upper(trans, rc, node, path);
2550                                 if (ret < 0)
2551                                         err = ret;
2552                         }
2553                 }
2554                 list_splice_init(&list, &cache->pending[level]);
2555         }
2556         return err;
2557 }
2558
2559 /*
2560  * mark a block and all blocks directly/indirectly reference the block
2561  * as processed.
2562  */
2563 static void update_processed_blocks(struct reloc_control *rc,
2564                                     struct btrfs_backref_node *node)
2565 {
2566         struct btrfs_backref_node *next = node;
2567         struct btrfs_backref_edge *edge;
2568         struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2569         int index = 0;
2570
2571         while (next) {
2572                 cond_resched();
2573                 while (1) {
2574                         if (next->processed)
2575                                 break;
2576
2577                         mark_block_processed(rc, next);
2578
2579                         if (list_empty(&next->upper))
2580                                 break;
2581
2582                         edge = list_entry(next->upper.next,
2583                                         struct btrfs_backref_edge, list[LOWER]);
2584                         edges[index++] = edge;
2585                         next = edge->node[UPPER];
2586                 }
2587                 next = walk_down_backref(edges, &index);
2588         }
2589 }
2590
2591 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2592 {
2593         u32 blocksize = rc->extent_root->fs_info->nodesize;
2594
2595         if (test_range_bit(&rc->processed_blocks, bytenr,
2596                            bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2597                 return 1;
2598         return 0;
2599 }
2600
2601 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2602                               struct tree_block *block)
2603 {
2604         struct extent_buffer *eb;
2605
2606         eb = read_tree_block(fs_info, block->bytenr, block->owner,
2607                              block->key.offset, block->level, NULL);
2608         if (IS_ERR(eb)) {
2609                 return PTR_ERR(eb);
2610         } else if (!extent_buffer_uptodate(eb)) {
2611                 free_extent_buffer(eb);
2612                 return -EIO;
2613         }
2614         if (block->level == 0)
2615                 btrfs_item_key_to_cpu(eb, &block->key, 0);
2616         else
2617                 btrfs_node_key_to_cpu(eb, &block->key, 0);
2618         free_extent_buffer(eb);
2619         block->key_ready = 1;
2620         return 0;
2621 }
2622
/*
 * helper function to relocate a tree block
 */
static int relocate_tree_block(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_backref_node *node,
				struct btrfs_key *key,
				struct btrfs_path *path)
{
	struct btrfs_root *root;
	int ret = 0;

	/* Nothing to do for blocks that did not get a backref node. */
	if (!node)
		return 0;

	/*
	 * If we fail here we want to drop our backref_node because we are going
	 * to start over and regenerate the tree for it.
	 */
	ret = reserve_metadata_space(trans, rc, node);
	if (ret)
		goto out;

	BUG_ON(node->processed);
	root = select_one_root(node);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);

		/* See explanation in select_one_root for the -EUCLEAN case. */
		ASSERT(ret == -ENOENT);
		if (ret == -ENOENT) {
			/* -ENOENT means no usable root; mark done and move on. */
			ret = 0;
			update_processed_blocks(rc, node);
		}
		goto out;
	}

	if (root) {
		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
			/*
			 * This block was the root block of a root, and this is
			 * the first time we're processing the block and thus it
			 * should not have had the ->new_bytenr modified and
			 * should have not been included on the changed list.
			 *
			 * However in the case of corruption we could have
			 * multiple refs pointing to the same block improperly,
			 * and thus we would trip over these checks.  ASSERT()
			 * for the developer case, because it could indicate a
			 * bug in the backref code, however error out for a
			 * normal user in the case of corruption.
			 */
			ASSERT(node->new_bytenr == 0);
			ASSERT(list_empty(&node->list));
			if (node->new_bytenr || !list_empty(&node->list)) {
				btrfs_err(root->fs_info,
				  "bytenr %llu has improper references to it",
					  node->bytenr);
				ret = -EUCLEAN;
				goto out;
			}
			ret = btrfs_record_root_in_trans(trans, root);
			if (ret)
				goto out;
			/*
			 * Another thread could have failed, need to check if we
			 * have reloc_root actually set.
			 */
			if (!root->reloc_root) {
				ret = -ENOENT;
				goto out;
			}
			/* The reloc tree's root block is the new location. */
			root = root->reloc_root;
			node->new_bytenr = root->node->start;
			btrfs_put_root(node->root);
			node->root = btrfs_grab_root(root);
			ASSERT(node->root);
			list_add_tail(&node->list, &rc->backref_cache.changed);
		} else {
			/*
			 * Non-shareable root: COW the path down to this block
			 * via a search with lowest_level set, which relocates
			 * it as a side effect.
			 */
			path->lowest_level = node->level;
			if (root == root->fs_info->chunk_root)
				btrfs_reserve_chunk_metadata(trans, false);
			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
			btrfs_release_path(path);
			if (root == root->fs_info->chunk_root)
				btrfs_trans_release_chunk_metadata(trans);
			if (ret > 0)
				ret = 0;
		}
		if (!ret)
			update_processed_blocks(rc, node);
	} else {
		/* No single owning root: relocate via the backref walk. */
		ret = do_relocation(trans, rc, node, key, path, 1);
	}
out:
	/* Drop the backref node on error or when it can't be reused. */
	if (ret || node->level == 0 || node->cowonly)
		btrfs_backref_cleanup_node(&rc->backref_cache, node);
	return ret;
}
2722
2723 /*
2724  * relocate a list of blocks
2725  */
2726 static noinline_for_stack
2727 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2728                          struct reloc_control *rc, struct rb_root *blocks)
2729 {
2730         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2731         struct btrfs_backref_node *node;
2732         struct btrfs_path *path;
2733         struct tree_block *block;
2734         struct tree_block *next;
2735         int ret;
2736         int err = 0;
2737
2738         path = btrfs_alloc_path();
2739         if (!path) {
2740                 err = -ENOMEM;
2741                 goto out_free_blocks;
2742         }
2743
2744         /* Kick in readahead for tree blocks with missing keys */
2745         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2746                 if (!block->key_ready)
2747                         btrfs_readahead_tree_block(fs_info, block->bytenr,
2748                                                    block->owner, 0,
2749                                                    block->level);
2750         }
2751
2752         /* Get first keys */
2753         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2754                 if (!block->key_ready) {
2755                         err = get_tree_block_key(fs_info, block);
2756                         if (err)
2757                                 goto out_free_path;
2758                 }
2759         }
2760
2761         /* Do tree relocation */
2762         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2763                 node = build_backref_tree(rc, &block->key,
2764                                           block->level, block->bytenr);
2765                 if (IS_ERR(node)) {
2766                         err = PTR_ERR(node);
2767                         goto out;
2768                 }
2769
2770                 ret = relocate_tree_block(trans, rc, node, &block->key,
2771                                           path);
2772                 if (ret < 0) {
2773                         err = ret;
2774                         break;
2775                 }
2776         }
2777 out:
2778         err = finish_pending_nodes(trans, rc, path, err);
2779
2780 out_free_path:
2781         btrfs_free_path(path);
2782 out_free_blocks:
2783         free_block_list(blocks);
2784         return err;
2785 }
2786
/*
 * Preallocate the file ranges in the data reloc inode that will receive
 * the relocated data of @cluster.  Reserves the data space up front,
 * preallocates one range per cluster boundary, and releases whatever
 * was reserved but not preallocated on early exit.
 */
static noinline_for_stack int prealloc_file_extent_cluster(
				struct btrfs_inode *inode,
				struct file_extent_cluster *cluster)
{
	u64 alloc_hint = 0;
	u64 start;
	u64 end;
	/* index_cnt holds the logical->file offset delta for the reloc inode. */
	u64 offset = inode->index_cnt;
	u64 num_bytes;
	int nr;
	int ret = 0;
	u64 i_size = i_size_read(&inode->vfs_inode);
	u64 prealloc_start = cluster->start - offset;
	u64 prealloc_end = cluster->end - offset;
	u64 cur_offset = prealloc_start;

	/*
	 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
	 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
	 * btrfs_do_readpage() call of previously relocated file cluster.
	 *
	 * If the current cluster starts in the above range, btrfs_do_readpage()
	 * will skip the read, and relocate_one_page() will later writeback
	 * the padding zeros as new data, causing data corruption.
	 *
	 * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
	 */
	if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
		struct address_space *mapping = inode->vfs_inode.i_mapping;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		const u32 sectorsize = fs_info->sectorsize;
		struct page *page;

		/* Only subpage (sectorsize < PAGE_SIZE) can hit this path. */
		ASSERT(sectorsize < PAGE_SIZE);
		ASSERT(IS_ALIGNED(i_size, sectorsize));

		/*
		 * Subpage can't handle page with DIRTY but without UPTODATE
		 * bit as it can lead to the following deadlock:
		 *
		 * btrfs_readpage()
		 * | Page already *locked*
		 * |- btrfs_lock_and_flush_ordered_range()
		 *    |- btrfs_start_ordered_extent()
		 *       |- extent_write_cache_pages()
		 *          |- lock_page()
		 *             We try to lock the page we already hold.
		 *
		 * Here we just writeback the whole data reloc inode, so that
		 * we will be ensured to have no dirty range in the page, and
		 * are safe to clear the uptodate bits.
		 *
		 * This shouldn't cause too much overhead, as we need to write
		 * the data back anyway.
		 */
		ret = filemap_write_and_wait(mapping);
		if (ret < 0)
			return ret;

		/* Invalidate [i_size, end of page) in both trees. */
		clear_extent_bits(&inode->io_tree, i_size,
				  round_up(i_size, PAGE_SIZE) - 1,
				  EXTENT_UPTODATE);
		page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
		/*
		 * If page is freed we don't need to do anything then, as we
		 * will re-read the whole page anyway.
		 */
		if (page) {
			btrfs_subpage_clear_uptodate(fs_info, page, i_size,
					round_up(i_size, PAGE_SIZE) - i_size);
			unlock_page(page);
			put_page(page);
		}
	}

	/* The cluster must start at its own first boundary. */
	BUG_ON(cluster->start != cluster->boundary[0]);
	ret = btrfs_alloc_data_chunk_ondemand(inode,
					      prealloc_end + 1 - prealloc_start);
	if (ret)
		return ret;

	/* Preallocate one file range per boundary in the cluster. */
	btrfs_inode_lock(&inode->vfs_inode, 0);
	for (nr = 0; nr < cluster->nr; nr++) {
		start = cluster->boundary[nr] - offset;
		if (nr + 1 < cluster->nr)
			end = cluster->boundary[nr + 1] - 1 - offset;
		else
			end = cluster->end - offset;

		lock_extent(&inode->io_tree, start, end);
		num_bytes = end + 1 - start;
		ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
						num_bytes, num_bytes,
						end + 1, &alloc_hint);
		cur_offset = end + 1;
		unlock_extent(&inode->io_tree, start, end);
		if (ret)
			break;
	}
	btrfs_inode_unlock(&inode->vfs_inode, 0);

	/* Give back the data space we reserved but never preallocated. */
	if (cur_offset < prealloc_end)
		btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
					       prealloc_end + 1 - cur_offset);
	return ret;
}
2893
2894 static noinline_for_stack
2895 int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2896                          u64 block_start)
2897 {
2898         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2899         struct extent_map *em;
2900         int ret = 0;
2901
2902         em = alloc_extent_map();
2903         if (!em)
2904                 return -ENOMEM;
2905
2906         em->start = start;
2907         em->len = end + 1 - start;
2908         em->block_len = em->len;
2909         em->block_start = block_start;
2910         set_bit(EXTENT_FLAG_PINNED, &em->flags);
2911
2912         lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2913         while (1) {
2914                 write_lock(&em_tree->lock);
2915                 ret = add_extent_mapping(em_tree, em, 0);
2916                 write_unlock(&em_tree->lock);
2917                 if (ret != -EEXIST) {
2918                         free_extent_map(em);
2919                         break;
2920                 }
2921                 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
2922         }
2923         unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2924         return ret;
2925 }
2926
2927 /*
2928  * Allow error injection to test balance/relocation cancellation
2929  */
2930 noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
2931 {
2932         return atomic_read(&fs_info->balance_cancel_req) ||
2933                 atomic_read(&fs_info->reloc_cancel_req) ||
2934                 fatal_signal_pending(current);
2935 }
2936 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2937
2938 static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
2939                                     int cluster_nr)
2940 {
2941         /* Last extent, use cluster end directly */
2942         if (cluster_nr >= cluster->nr - 1)
2943                 return cluster->end;
2944
2945         /* Use next boundary start*/
2946         return cluster->boundary[cluster_nr + 1] - 1;
2947 }
2948
/*
 * Relocate the portion of @cluster that falls inside page @page_index of
 * the data reloc inode: read the page, reserve metadata, mark the
 * clamped extent ranges delalloc/dirty, and set EXTENT_BOUNDARY bits so
 * writeback reproduces the source extent layout.  *cluster_nr advances
 * as extent boundaries are crossed.
 */
static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
			     struct file_extent_cluster *cluster,
			     int *cluster_nr, unsigned long page_index)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	/* Logical->file offset delta for the data reloc inode. */
	u64 offset = BTRFS_I(inode)->index_cnt;
	const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct page *page;
	u64 page_start;
	u64 page_end;
	u64 cur;
	int ret;

	ASSERT(page_index <= last_index);
	page = find_lock_page(inode->i_mapping, page_index);
	if (!page) {
		/* Not cached yet: read ahead to the end of the cluster. */
		page_cache_sync_readahead(inode->i_mapping, ra, NULL,
				page_index, last_index + 1 - page_index);
		page = find_or_create_page(inode->i_mapping, page_index, mask);
		if (!page)
			return -ENOMEM;
	}
	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto release_page;

	if (PageReadahead(page))
		page_cache_async_readahead(inode->i_mapping, ra, NULL, page,
				   page_index, last_index + 1 - page_index);

	if (!PageUptodate(page)) {
		btrfs_readpage(NULL, page);
		lock_page(page);
		if (!PageUptodate(page)) {
			ret = -EIO;
			goto release_page;
		}
	}

	page_start = page_offset(page);
	page_end = page_start + PAGE_SIZE - 1;

	/*
	 * Start from the cluster, as for subpage case, the cluster can start
	 * inside the page.
	 */
	cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
	while (cur <= page_end) {
		u64 extent_start = cluster->boundary[*cluster_nr] - offset;
		u64 extent_end = get_cluster_boundary_end(cluster,
						*cluster_nr) - offset;
		/* Clamp the current extent to this page's byte range. */
		u64 clamped_start = max(page_start, extent_start);
		u64 clamped_end = min(page_end, extent_end);
		u32 clamped_len = clamped_end + 1 - clamped_start;

		/* Reserve metadata for this range */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
						      clamped_len);
		if (ret)
			goto release_page;

		/* Mark the range delalloc and dirty for later writeback */
		lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
						clamped_end, 0, NULL);
		if (ret) {
			/* Undo the extent bits and reservation we just made. */
			clear_extent_bits(&BTRFS_I(inode)->io_tree,
					clamped_start, clamped_end,
					EXTENT_LOCKED | EXTENT_BOUNDARY);
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							clamped_len, true);
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       clamped_len);
			goto release_page;
		}
		btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);

		/*
		 * Set the boundary if it's inside the page.
		 * Data relocation requires the destination extents to have the
		 * same size as the source.
		 * EXTENT_BOUNDARY bit prevents current extent from being merged
		 * with previous extent.
		 */
		if (in_range(cluster->boundary[*cluster_nr] - offset,
			     page_start, PAGE_SIZE)) {
			u64 boundary_start = cluster->boundary[*cluster_nr] -
						offset;
			u64 boundary_end = boundary_start +
					   fs_info->sectorsize - 1;

			set_extent_bits(&BTRFS_I(inode)->io_tree,
					boundary_start, boundary_end,
					EXTENT_BOUNDARY);
		}
		unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end);
		btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
		cur += clamped_len;

		/* Crossed extent end, go to next extent */
		if (cur >= extent_end) {
			(*cluster_nr)++;
			/* Just finished the last extent of the cluster, exit. */
			if (*cluster_nr >= cluster->nr)
				break;
		}
	}
	unlock_page(page);
	put_page(page);

	balance_dirty_pages_ratelimited(inode->i_mapping);
	btrfs_throttle(fs_info);
	if (btrfs_should_cancel_balance(fs_info))
		ret = -ECANCELED;
	return ret;

release_page:
	unlock_page(page);
	put_page(page);
	return ret;
}
3071
3072 static int relocate_file_extent_cluster(struct inode *inode,
3073                                         struct file_extent_cluster *cluster)
3074 {
3075         u64 offset = BTRFS_I(inode)->index_cnt;
3076         unsigned long index;
3077         unsigned long last_index;
3078         struct file_ra_state *ra;
3079         int cluster_nr = 0;
3080         int ret = 0;
3081
3082         if (!cluster->nr)
3083                 return 0;
3084
3085         ra = kzalloc(sizeof(*ra), GFP_NOFS);
3086         if (!ra)
3087                 return -ENOMEM;
3088
3089         ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
3090         if (ret)
3091                 goto out;
3092
3093         file_ra_state_init(ra, inode->i_mapping);
3094
3095         ret = setup_extent_mapping(inode, cluster->start - offset,
3096                                    cluster->end - offset, cluster->start);
3097         if (ret)
3098                 goto out;
3099
3100         last_index = (cluster->end - offset) >> PAGE_SHIFT;
3101         for (index = (cluster->start - offset) >> PAGE_SHIFT;
3102              index <= last_index && !ret; index++)
3103                 ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
3104         if (ret == 0)
3105                 WARN_ON(cluster_nr != cluster->nr);
3106 out:
3107         kfree(ra);
3108         return ret;
3109 }
3110
3111 static noinline_for_stack
3112 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3113                          struct file_extent_cluster *cluster)
3114 {
3115         int ret;
3116
3117         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3118                 ret = relocate_file_extent_cluster(inode, cluster);
3119                 if (ret)
3120                         return ret;
3121                 cluster->nr = 0;
3122         }
3123
3124         if (!cluster->nr)
3125                 cluster->start = extent_key->objectid;
3126         else
3127                 BUG_ON(cluster->nr >= MAX_EXTENTS);
3128         cluster->end = extent_key->objectid + extent_key->offset - 1;
3129         cluster->boundary[cluster->nr] = extent_key->objectid;
3130         cluster->nr++;
3131
3132         if (cluster->nr >= MAX_EXTENTS) {
3133                 ret = relocate_file_extent_cluster(inode, cluster);
3134                 if (ret)
3135                         return ret;
3136                 cluster->nr = 0;
3137         }
3138         return 0;
3139 }
3140
/*
 * Helper to add a tree block to the @blocks rbtree of blocks to relocate.
 *
 * The major work is getting the generation and level of the block: for
 * skinny METADATA_ITEM keys the level is encoded in key.offset, otherwise it
 * is read from the btrfs_tree_block_info following the extent item.  Where
 * possible the owner root id is also extracted from the single inline ref so
 * the later read_tree_block() can use the right lock class.
 *
 * Returns 0 on success, -EINVAL on a malformed/v0 extent item, -ENOMEM on
 * allocation failure.
 */
static int add_tree_block(struct reloc_control *rc,
			  struct btrfs_key *extent_key,
			  struct btrfs_path *path,
			  struct rb_root *blocks)
{
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	struct tree_block *block;
	struct rb_node *rb_node;
	u32 item_size;
	int level = -1;
	u64 generation;
	u64 owner = 0;

	eb =  path->nodes[0];
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
	    item_size >= sizeof(*ei) + sizeof(*bi)) {
		unsigned long ptr = 0, end;

		ei = btrfs_item_ptr(eb, path->slots[0],
				struct btrfs_extent_item);
		end = (unsigned long)ei + item_size;
		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
			/* Non-skinny: level lives in the tree block info. */
			bi = (struct btrfs_tree_block_info *)(ei + 1);
			level = btrfs_tree_block_level(eb, bi);
			ptr = (unsigned long)(bi + 1);
		} else {
			/* Skinny metadata: key.offset is the level. */
			level = (int)extent_key->offset;
			ptr = (unsigned long)(ei + 1);
		}
		generation = btrfs_extent_generation(eb, ei);

		/*
		 * We're reading random blocks without knowing their owner ahead
		 * of time.  This is ok most of the time, as all reloc roots and
		 * fs roots have the same lock type.  However normal trees do
		 * not, and the only way to know ahead of time is to read the
		 * inline ref offset.  We know it's an fs root if
		 *
		 * 1. There's more than one ref.
		 * 2. There's a SHARED_DATA_REF_KEY set.
		 * 3. FULL_BACKREF is set on the flags.
		 *
		 * Otherwise it's safe to assume that the ref offset == the
		 * owner of this block, so we can use that when calling
		 * read_tree_block.
		 */
		if (btrfs_extent_refs(eb, ei) == 1 &&
		    !(btrfs_extent_flags(eb, ei) &
		      BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
		    ptr < end) {
			struct btrfs_extent_inline_ref *iref;
			int type;

			iref = (struct btrfs_extent_inline_ref *)ptr;
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID)
				return -EINVAL;
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				owner = btrfs_extent_inline_ref_offset(eb, iref);
		}
	} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
		/* v0 extent items are no longer supported. */
		btrfs_print_v0_err(eb->fs_info);
		btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
		return -EINVAL;
	} else {
		BUG();
	}

	btrfs_release_path(path);

	BUG_ON(level == -1);

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block)
		return -ENOMEM;

	block->bytenr = extent_key->objectid;
	/*
	 * NOTE(review): key.objectid/key.offset appear to temporarily carry
	 * the block size and generation until the real key is known — confirm
	 * against the consumers of struct tree_block.
	 */
	block->key.objectid = rc->extent_root->fs_info->nodesize;
	block->key.offset = generation;
	block->level = level;
	block->key_ready = 0;
	block->owner = owner;

	/* A duplicate bytenr in the tree indicates a logic bug — panic. */
	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
	if (rb_node)
		btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
				    -EEXIST);

	return 0;
}
3240
/*
 * Helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY.
 *
 * Looks up the extent item of the tree block at @bytenr in the extent tree
 * (commit root) and queues it via add_tree_block().  Tries a skinny
 * METADATA_ITEM lookup first when the filesystem supports it and falls back
 * to the full EXTENT_ITEM key on a miss.
 *
 * Returns 0 on success (including already-processed/queued blocks), or a
 * negative errno.
 */
static int __add_tree_block(struct reloc_control *rc,
			    u64 bytenr, u32 blocksize,
			    struct rb_root *blocks)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;
	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	/* Nothing to do if the block was already relocated or queued. */
	if (tree_block_processed(bytenr, rc))
		return 0;

	if (rb_simple_search(blocks, bytenr))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	key.objectid = bytenr;
	if (skinny) {
		/* offset -1: land just after any item for this bytenr. */
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = (u64)-1;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = blocksize;
	}

	path->search_commit_root = 1;
	path->skip_locking = 1;
	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (ret > 0 && skinny) {
		/* Check the previous slot for a matching metadata item. */
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    (key.type == BTRFS_METADATA_ITEM_KEY ||
			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
			      key.offset == blocksize)))
				ret = 0;
		}

		/* No skinny item found, retry with the full extent key. */
		if (ret) {
			skinny = false;
			btrfs_release_path(path);
			goto again;
		}
	}
	if (ret) {
		/* Extent item genuinely missing: corrupted extent tree. */
		ASSERT(ret == 1);
		btrfs_print_leaf(path->nodes[0]);
		btrfs_err(fs_info,
	     "tree block extent item (%llu) is not found in extent tree",
		     bytenr);
		WARN_ON(1);
		ret = -EINVAL;
		goto out;
	}

	ret = add_tree_block(rc, &key, path, blocks);
out:
	btrfs_free_path(path);
	return ret;
}
3313
3314 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3315                                     struct btrfs_block_group *block_group,
3316                                     struct inode *inode,
3317                                     u64 ino)
3318 {
3319         struct btrfs_root *root = fs_info->tree_root;
3320         struct btrfs_trans_handle *trans;
3321         int ret = 0;
3322
3323         if (inode)
3324                 goto truncate;
3325
3326         inode = btrfs_iget(fs_info->sb, ino, root);
3327         if (IS_ERR(inode))
3328                 return -ENOENT;
3329
3330 truncate:
3331         ret = btrfs_check_trunc_cache_free_space(fs_info,
3332                                                  &fs_info->global_block_rsv);
3333         if (ret)
3334                 goto out;
3335
3336         trans = btrfs_join_transaction(root);
3337         if (IS_ERR(trans)) {
3338                 ret = PTR_ERR(trans);
3339                 goto out;
3340         }
3341
3342         ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3343
3344         btrfs_end_transaction(trans);
3345         btrfs_btree_balance_dirty(fs_info);
3346 out:
3347         iput(inode);
3348         return ret;
3349 }
3350
3351 /*
3352  * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
3353  * cache inode, to avoid free space cache data extent blocking data relocation.
3354  */
3355 static int delete_v1_space_cache(struct extent_buffer *leaf,
3356                                  struct btrfs_block_group *block_group,
3357                                  u64 data_bytenr)
3358 {
3359         u64 space_cache_ino;
3360         struct btrfs_file_extent_item *ei;
3361         struct btrfs_key key;
3362         bool found = false;
3363         int i;
3364         int ret;
3365
3366         if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3367                 return 0;
3368
3369         for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3370                 u8 type;
3371
3372                 btrfs_item_key_to_cpu(leaf, &key, i);
3373                 if (key.type != BTRFS_EXTENT_DATA_KEY)
3374                         continue;
3375                 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3376                 type = btrfs_file_extent_type(leaf, ei);
3377
3378                 if ((type == BTRFS_FILE_EXTENT_REG ||
3379                      type == BTRFS_FILE_EXTENT_PREALLOC) &&
3380                     btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3381                         found = true;
3382                         space_cache_ino = key.objectid;
3383                         break;
3384                 }
3385         }
3386         if (!found)
3387                 return -ENOENT;
3388         ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3389                                         space_cache_ino);
3390         return ret;
3391 }
3392
3393 /*
3394  * helper to find all tree blocks that reference a given data extent
3395  */
3396 static noinline_for_stack
3397 int add_data_references(struct reloc_control *rc,
3398                         struct btrfs_key *extent_key,
3399                         struct btrfs_path *path,
3400                         struct rb_root *blocks)
3401 {
3402         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3403         struct ulist *leaves = NULL;
3404         struct ulist_iterator leaf_uiter;
3405         struct ulist_node *ref_node = NULL;
3406         const u32 blocksize = fs_info->nodesize;
3407         int ret = 0;
3408
3409         btrfs_release_path(path);
3410         ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
3411                                    0, &leaves, NULL, true);
3412         if (ret < 0)
3413                 return ret;
3414
3415         ULIST_ITER_INIT(&leaf_uiter);
3416         while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
3417                 struct extent_buffer *eb;
3418
3419                 eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
3420                 if (IS_ERR(eb)) {
3421                         ret = PTR_ERR(eb);
3422                         break;
3423                 }
3424                 ret = delete_v1_space_cache(eb, rc->block_group,
3425                                             extent_key->objectid);
3426                 free_extent_buffer(eb);
3427                 if (ret < 0)
3428                         break;
3429                 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3430                 if (ret < 0)
3431                         break;
3432         }
3433         if (ret < 0)
3434                 free_block_list(blocks);
3435         ulist_free(leaves);
3436         return ret;
3437 }
3438
/*
 * Helper to find the next unprocessed extent in the target block group.
 *
 * Scans the extent tree (commit root) starting at rc->search_start, skipping
 * items already marked processed in rc->processed_blocks.  On success the
 * found key is copied into @extent_key and rc->search_start is advanced past
 * it.
 *
 * Returns 0 when an extent was found, 1 when the block group is exhausted,
 * or a negative errno.
 */
static noinline_for_stack
int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
		     struct btrfs_key *extent_key)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u64 start, end, last;
	int ret;

	/* Exclusive end of the block group being relocated. */
	last = rc->block_group->start + rc->block_group->length;
	while (1) {
		cond_resched();
		if (rc->search_start >= last) {
			ret = 1;
			break;
		}

		key.objectid = rc->search_start;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = 0;

		path->search_commit_root = 1;
		path->skip_locking = 1;
		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
					0, 0);
		if (ret < 0)
			break;
next:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(rc->extent_root, path);
			if (ret != 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid >= last) {
			/* Walked past the block group, we are done. */
			ret = 1;
			break;
		}

		/* Only extent and metadata items are of interest. */
		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
		    key.type != BTRFS_METADATA_ITEM_KEY) {
			path->slots[0]++;
			goto next;
		}

		/* Skip extents that end at or before our search cursor. */
		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
		    key.objectid + key.offset <= rc->search_start) {
			path->slots[0]++;
			goto next;
		}

		/* Same for metadata items, whose size is one node. */
		if (key.type == BTRFS_METADATA_ITEM_KEY &&
		    key.objectid + fs_info->nodesize <=
		    rc->search_start) {
			path->slots[0]++;
			goto next;
		}

		ret = find_first_extent_bit(&rc->processed_blocks,
					    key.objectid, &start, &end,
					    EXTENT_DIRTY, NULL);

		if (ret == 0 && start <= key.objectid) {
			/* Already processed: jump past the processed range. */
			btrfs_release_path(path);
			rc->search_start = end + 1;
		} else {
			/* Found one; advance the cursor and hand it back. */
			if (key.type == BTRFS_EXTENT_ITEM_KEY)
				rc->search_start = key.objectid + key.offset;
			else
				rc->search_start = key.objectid +
					fs_info->nodesize;
			memcpy(extent_key, &key, sizeof(key));
			return 0;
		}
	}
	btrfs_release_path(path);
	return ret;
}
3524
/* Publish @rc as the active relocation control, under reloc_mutex. */
static void set_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = rc;
	mutex_unlock(&fs_info->reloc_mutex);
}
3533
/* Clear the active relocation control, under reloc_mutex. */
static void unset_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = NULL;
	mutex_unlock(&fs_info->reloc_mutex);
}
3542
3543 static noinline_for_stack
3544 int prepare_to_relocate(struct reloc_control *rc)
3545 {
3546         struct btrfs_trans_handle *trans;
3547         int ret;
3548
3549         rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3550                                               BTRFS_BLOCK_RSV_TEMP);
3551         if (!rc->block_rsv)
3552                 return -ENOMEM;
3553
3554         memset(&rc->cluster, 0, sizeof(rc->cluster));
3555         rc->search_start = rc->block_group->start;
3556         rc->extents_found = 0;
3557         rc->nodes_relocated = 0;
3558         rc->merging_rsv_size = 0;
3559         rc->reserved_bytes = 0;
3560         rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3561                               RELOCATION_RESERVED_NODES;
3562         ret = btrfs_block_rsv_refill(rc->extent_root,
3563                                      rc->block_rsv, rc->block_rsv->size,
3564                                      BTRFS_RESERVE_FLUSH_ALL);
3565         if (ret)
3566                 return ret;
3567
3568         rc->create_reloc_tree = 1;
3569         set_reloc_control(rc);
3570
3571         trans = btrfs_join_transaction(rc->extent_root);
3572         if (IS_ERR(trans)) {
3573                 unset_reloc_control(rc);
3574                 /*
3575                  * extent tree is not a ref_cow tree and has no reloc_root to
3576                  * cleanup.  And callers are responsible to free the above
3577                  * block rsv.
3578                  */
3579                 return PTR_ERR(trans);
3580         }
3581
3582         ret = btrfs_commit_transaction(trans);
3583         if (ret)
3584                 unset_reloc_control(rc);
3585
3586         return ret;
3587 }
3588
/*
 * Main loop relocating one block group.
 *
 * Repeatedly refills the block reserve, finds the next unprocessed extent,
 * queues tree blocks or data references, relocates the queued tree blocks
 * and clusters data extents for relocation.  Afterwards the reloc trees are
 * merged back (even on error/cancel) and all temporary state is torn down.
 *
 * Returns 0 on success or a negative errno (-ECANCELED when the balance was
 * cancelled).
 */
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct rb_root blocks = RB_ROOT;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	u64 flags;
	int ret;
	int err = 0;
	int progress = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	ret = prepare_to_relocate(rc);
	if (ret) {
		err = ret;
		goto out_free;
	}

	while (1) {
		rc->reserved_bytes = 0;
		ret = btrfs_block_rsv_refill(rc->extent_root,
					rc->block_rsv, rc->block_rsv->size,
					BTRFS_RESERVE_FLUSH_ALL);
		if (ret) {
			err = ret;
			break;
		}
		progress++;
		trans = btrfs_start_transaction(rc->extent_root, 0);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			trans = NULL;
			break;
		}
restart:
		/* A changed backref cache needs a fresh transaction. */
		if (update_backref_cache(trans, &rc->backref_cache)) {
			btrfs_end_transaction(trans);
			trans = NULL;
			continue;
		}

		ret = find_next_extent(rc, path, &key);
		if (ret < 0)
			err = ret;
		if (ret != 0)
			break;

		rc->extents_found++;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_extent_item);
		flags = btrfs_extent_flags(path->nodes[0], ei);

		/* Queue tree blocks directly, data only in UPDATE_DATA_PTRS. */
		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = add_tree_block(rc, &key, path, &blocks);
		} else if (rc->stage == UPDATE_DATA_PTRS &&
			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
			ret = add_data_references(rc, &key, path, &blocks);
		} else {
			btrfs_release_path(path);
			ret = 0;
		}
		if (ret < 0) {
			err = ret;
			break;
		}

		if (!RB_EMPTY_ROOT(&blocks)) {
			ret = relocate_tree_blocks(trans, rc, &blocks);
			if (ret < 0) {
				if (ret != -EAGAIN) {
					err = ret;
					break;
				}
				/* -EAGAIN: retry this extent on next pass. */
				rc->extents_found--;
				rc->search_start = key.objectid;
			}
		}

		btrfs_end_transaction_throttle(trans);
		btrfs_btree_balance_dirty(fs_info);
		trans = NULL;

		/* In the move stage, cluster data extents for relocation. */
		if (rc->stage == MOVE_DATA_EXTENTS &&
		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
			rc->found_file_extent = 1;
			ret = relocate_data_extent(rc->data_inode,
						   &key, &rc->cluster);
			if (ret < 0) {
				err = ret;
				break;
			}
		}
		if (btrfs_should_cancel_balance(fs_info)) {
			err = -ECANCELED;
			break;
		}
	}
	/* On ENOSPC, try forcing a chunk allocation once and restart. */
	if (trans && progress && err == -ENOSPC) {
		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
		if (ret == 1) {
			err = 0;
			progress = 0;
			goto restart;
		}
	}

	btrfs_release_path(path);
	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);

	if (trans) {
		btrfs_end_transaction_throttle(trans);
		btrfs_btree_balance_dirty(fs_info);
	}

	/* Relocate whatever is left in the final (partial) cluster. */
	if (!err) {
		ret = relocate_file_extent_cluster(rc->data_inode,
						   &rc->cluster);
		if (ret < 0)
			err = ret;
	}

	rc->create_reloc_tree = 0;
	set_reloc_control(rc);

	btrfs_backref_release_cache(&rc->backref_cache);
	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

	/*
	 * Even in the case when the relocation is cancelled, we should all go
	 * through prepare_to_merge() and merge_reloc_roots().
	 *
	 * For error (including cancelled balance), prepare_to_merge() will
	 * mark all reloc trees orphan, then queue them for cleanup in
	 * merge_reloc_roots()
	 */
	err = prepare_to_merge(rc, err);

	merge_reloc_roots(rc);

	rc->merge_reloc_tree = 0;
	unset_reloc_control(rc);
	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

	/* get rid of pinned extents */
	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}
	ret = btrfs_commit_transaction(trans);
	if (ret && !err)
		err = ret;
out_free:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
	btrfs_free_block_rsv(fs_info, rc->block_rsv);
	btrfs_free_path(path);
	return err;
}
3756
3757 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3758                                  struct btrfs_root *root, u64 objectid)
3759 {
3760         struct btrfs_path *path;
3761         struct btrfs_inode_item *item;
3762         struct extent_buffer *leaf;
3763         int ret;
3764
3765         path = btrfs_alloc_path();
3766         if (!path)
3767                 return -ENOMEM;
3768
3769         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3770         if (ret)
3771                 goto out;
3772
3773         leaf = path->nodes[0];
3774         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3775         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3776         btrfs_set_inode_generation(leaf, item, 1);
3777         btrfs_set_inode_size(leaf, item, 0);
3778         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3779         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3780                                           BTRFS_INODE_PREALLOC);
3781         btrfs_mark_buffer_dirty(leaf);
3782 out:
3783         btrfs_free_path(path);
3784         return ret;
3785 }
3786
3787 static void delete_orphan_inode(struct btrfs_trans_handle *trans,
3788                                 struct btrfs_root *root, u64 objectid)
3789 {
3790         struct btrfs_path *path;
3791         struct btrfs_key key;
3792         int ret = 0;
3793
3794         path = btrfs_alloc_path();
3795         if (!path) {
3796                 ret = -ENOMEM;
3797                 goto out;
3798         }
3799
3800         key.objectid = objectid;
3801         key.type = BTRFS_INODE_ITEM_KEY;
3802         key.offset = 0;
3803         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3804         if (ret) {
3805                 if (ret > 0)
3806                         ret = -ENOENT;
3807                 goto out;
3808         }
3809         ret = btrfs_del_item(trans, root, path);
3810 out:
3811         if (ret)
3812                 btrfs_abort_transaction(trans, ret);
3813         btrfs_free_path(path);
3814 }
3815
3816 /*
3817  * helper to create inode for data relocation.
3818  * the inode is in data relocation tree and its link count is 0
3819  */
3820 static noinline_for_stack
3821 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3822                                  struct btrfs_block_group *group)
3823 {
3824         struct inode *inode = NULL;
3825         struct btrfs_trans_handle *trans;
3826         struct btrfs_root *root;
3827         u64 objectid;
3828         int err = 0;
3829
3830         root = btrfs_grab_root(fs_info->data_reloc_root);
3831         trans = btrfs_start_transaction(root, 6);
3832         if (IS_ERR(trans)) {
3833                 btrfs_put_root(root);
3834                 return ERR_CAST(trans);
3835         }
3836
3837         err = btrfs_get_free_objectid(root, &objectid);
3838         if (err)
3839                 goto out;
3840
3841         err = __insert_orphan_inode(trans, root, objectid);
3842         if (err)
3843                 goto out;
3844
3845         inode = btrfs_iget(fs_info->sb, objectid, root);
3846         if (IS_ERR(inode)) {
3847                 delete_orphan_inode(trans, root, objectid);
3848                 err = PTR_ERR(inode);
3849                 inode = NULL;
3850                 goto out;
3851         }
3852         BTRFS_I(inode)->index_cnt = group->start;
3853
3854         err = btrfs_orphan_add(trans, BTRFS_I(inode));
3855 out:
3856         btrfs_put_root(root);
3857         btrfs_end_transaction(trans);
3858         btrfs_btree_balance_dirty(fs_info);
3859         if (err) {
3860                 if (inode)
3861                         iput(inode);
3862                 inode = ERR_PTR(err);
3863         }
3864         return inode;
3865 }
3866
/*
 * Mark start of chunk relocation that is cancellable. Check if the cancellation
 * has been requested meanwhile and don't start in that case.
 *
 * Return:
 *   0             success
 *   -EINPROGRESS  operation is already in progress, that's probably a bug
 *   -ECANCELED    cancellation request was set before the operation started
 */
static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
{
	/* Atomically claim the RUNNING bit; a set bit means double start. */
	if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
		/* This should not happen */
		btrfs_err(fs_info, "reloc already running, cannot start");
		return -EINPROGRESS;
	}

	if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
		btrfs_info(fs_info, "chunk relocation canceled on start");
		/*
		 * On cancel, clear all requests but let the caller mark
		 * the end after cleanup operations.
		 */
		atomic_set(&fs_info->reloc_cancel_req, 0);
		return -ECANCELED;
	}
	return 0;
}
3895
3896 /*
3897  * Mark end of chunk relocation that is cancellable and wake any waiters.
3898  */
3899 static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
3900 {
3901         /* Requested after start, clear bit first so any waiters can continue */
3902         if (atomic_read(&fs_info->reloc_cancel_req) > 0)
3903                 btrfs_info(fs_info, "chunk relocation canceled during operation");
3904         clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
3905         atomic_set(&fs_info->reloc_cancel_req, 0);
3906 }
3907
3908 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3909 {
3910         struct reloc_control *rc;
3911
3912         rc = kzalloc(sizeof(*rc), GFP_NOFS);
3913         if (!rc)
3914                 return NULL;
3915
3916         INIT_LIST_HEAD(&rc->reloc_roots);
3917         INIT_LIST_HEAD(&rc->dirty_subvol_roots);
3918         btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
3919         mapping_tree_init(&rc->reloc_root_tree);
3920         extent_io_tree_init(fs_info, &rc->processed_blocks,
3921                             IO_TREE_RELOC_BLOCKS, NULL);
3922         return rc;
3923 }
3924
3925 static void free_reloc_control(struct reloc_control *rc)
3926 {
3927         struct mapping_node *node, *tmp;
3928
3929         free_reloc_roots(&rc->reloc_roots);
3930         rbtree_postorder_for_each_entry_safe(node, tmp,
3931                         &rc->reloc_root_tree.rb_root, rb_node)
3932                 kfree(node);
3933
3934         kfree(rc);
3935 }
3936
3937 /*
3938  * Print the block group being relocated
3939  */
3940 static void describe_relocation(struct btrfs_fs_info *fs_info,
3941                                 struct btrfs_block_group *block_group)
3942 {
3943         char buf[128] = {'\0'};
3944
3945         btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
3946
3947         btrfs_info(fs_info,
3948                    "relocating block group %llu flags %s",
3949                    block_group->start, buf);
3950 }
3951
3952 static const char *stage_to_string(int stage)
3953 {
3954         if (stage == MOVE_DATA_EXTENTS)
3955                 return "move data extents";
3956         if (stage == UPDATE_DATA_PTRS)
3957                 return "update data pointers";
3958         return "unknown";
3959 }
3960
/*
 * Function to relocate all extents in a block group.
 *
 * Marks the block group read-only, drops its free space cache, creates the
 * data reloc inode and then loops over relocate_block_group() until no more
 * extents are found, advancing from MOVE_DATA_EXTENTS to UPDATE_DATA_PTRS.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;	/* 1 once the block group was flipped to read-only */
	int err = 0;

	/*
	 * This only gets set if we had a half-deleted snapshot on mount.  We
	 * cannot allow relocation to start while we're still trying to clean up
	 * these pending deletions.
	 */
	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
	if (ret)
		return ret;

	/* We may have been woken up by close_ctree, so bail if we're closing. */
	if (btrfs_fs_closing(fs_info))
		return -EINTR;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	/* Extents pinned by an active swapfile must not be moved. */
	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	/*
	 * NOTE(review): if this fails with -EINPROGRESS, another relocation
	 * owns BTRFS_FS_RELOC_RUNNING, yet the out_put_bg path below still
	 * calls reloc_chunk_end() which clears that bit — confirm callers
	 * serialize so this cannot actually race.
	 */
	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_put_bg;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	/* Prevent new allocations from the target block group. */
	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	/* Drop the v1 free space cache inode of this block group, if any. */
	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	/* -ENOENT means there was no cache inode, which is fine. */
	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	/* The orphan inode that will hold relocated data extents. */
	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(fs_info, rc->block_group);

	/* Flush out all in-flight writes into the block group first. */
	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX,
				 rc->block_group->start,
				 rc->block_group->length);

	while (1) {
		int finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents.  If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block.  Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		/* Nothing left to relocate, we are done. */
		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found, stage_to_string(finishes_stage));
	}

	/* On success the block group must be fully empty by now. */
	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	/* Undo the read-only marking only if we set it and failed. */
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}
4105
4106 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4107 {
4108         struct btrfs_fs_info *fs_info = root->fs_info;
4109         struct btrfs_trans_handle *trans;
4110         int ret, err;
4111
4112         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4113         if (IS_ERR(trans))
4114                 return PTR_ERR(trans);
4115
4116         memset(&root->root_item.drop_progress, 0,
4117                 sizeof(root->root_item.drop_progress));
4118         btrfs_set_root_drop_level(&root->root_item, 0);
4119         btrfs_set_root_refs(&root->root_item, 0);
4120         ret = btrfs_update_root(trans, fs_info->tree_root,
4121                                 &root->root_key, &root->root_item);
4122
4123         err = btrfs_end_transaction(trans);
4124         if (err)
4125                 return err;
4126         return ret;
4127 }
4128
/*
 * Recover relocation interrupted by system crash.
 *
 * This function resumes merging reloc trees with corresponding fs trees.
 * This is important for keeping the sharing of tree blocks.
 *
 * It walks all BTRFS_TREE_RELOC_OBJECTID root items in the tree root, reads
 * the reloc roots, marks orphaned ones (whose fs root is gone) as garbage,
 * and re-attaches the rest to their fs roots before merging them.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_recover_relocation(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	/* Start from the highest possible reloc root item and walk backwards. */
	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			/* Not found exactly, step to the previous item. */
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		/* Past the last reloc tree root item, stop the walk. */
		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			/* key.offset of a reloc root is the fs root's id. */
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				/*
				 * The fs root is gone, the reloc root is an
				 * orphan - mark it for later deletion.
				 */
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	/* Nothing was interrupted, no recovery needed. */
	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	/*
	 * NOTE(review): on -EINPROGRESS another relocation owns the
	 * BTRFS_FS_RELOC_RUNNING bit, yet out_end still calls
	 * reloc_chunk_end() which clears it — confirm mount-time recovery
	 * cannot race with a running relocation.
	 */
	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_end;
	}

	rc->extent_root = fs_info->extent_root;

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = 1;

	/* Re-attach each surviving reloc root to its fs root. */
	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		/* Zero refs: already marked garbage, just track it in rc. */
		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			/* Put back on the list so free_reloc_roots() sees it. */
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		err = __add_reloc_root(reloc_root);
		ASSERT(err != -EEXIST);
		if (err) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_unset;

	/* Resume the interrupted merge of reloc trees into fs trees. */
	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	/* Commit once more to persist the merge results. */
	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_clean;
	}
	err = btrfs_commit_transaction(trans);
out_clean:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* cleanup orphan inode in data relocation tree */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		err = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return err;
}
4309
/*
 * helper to add ordered checksum for data relocation.
 *
 * cloning checksum properly handles the nodatasum extents.
 * it also saves CPU time to re-calculate the checksum.
 *
 * @inode:    the data reloc inode
 * @file_pos: file offset of the relocated range
 * @len:      length of the range
 *
 * Returns 0 on success or a negative errno from the csum lookup.
 */
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_sum *sums;
	struct btrfs_ordered_extent *ordered;
	int ret;
	u64 disk_bytenr;
	u64 new_bytenr;
	LIST_HEAD(list);

	/*
	 * NOTE(review): ordered is dereferenced without a NULL check - an
	 * ordered extent covering this range is expected to exist here;
	 * confirm against the callers.
	 */
	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);

	/*
	 * For the data reloc inode, index_cnt holds the old block group
	 * start, so file_pos + index_cnt is the old disk bytenr whose
	 * csums we want to clone.
	 */
	disk_bytenr = file_pos + inode->index_cnt;
	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
				       disk_bytenr + len - 1, &list, 0);
	if (ret)
		goto out;

	/* Rebase each found csum onto the new ordered extent location. */
	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
		sums->bytenr = new_bytenr;

		btrfs_add_ordered_sum(ordered, sums);
	}
out:
	btrfs_put_ordered_extent(ordered);
	return ret;
}
4360
/*
 * Relocation hook called when tree block @buf is COWed into @cow.
 *
 * While a reloc tree is being built, keeps the backref cache node for the
 * block pointing at the new copy.  During the UPDATE_DATA_PTRS stage, file
 * extent items in a COWed leaf are rewritten to the relocated addresses.
 *
 * Returns 0 on success or a negative errno from replace_file_extents().
 */
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	/* No relocation running, nothing to do. */
	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	/*
	 * Generation at or before the last snapshot means this is the first
	 * COW of the block in the current transaction window.
	 */
	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		/* The cached backref path node must match the COWed block. */
		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		/* Swap the cached buffer for the new copy of the block. */
		btrfs_backref_drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		/* Account relocated metadata for later rsv calculations. */
		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	/* Rewrite data extent pointers in a freshly COWed leaf. */
	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}
4413
4414 /*
4415  * called before creating snapshot. it calculates metadata reservation
4416  * required for relocating tree blocks in the snapshot
4417  */
4418 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4419                               u64 *bytes_to_reserve)
4420 {
4421         struct btrfs_root *root = pending->root;
4422         struct reloc_control *rc = root->fs_info->reloc_ctl;
4423
4424         if (!rc || !have_reloc_root(root))
4425                 return;
4426
4427         if (!rc->merge_reloc_tree)
4428                 return;
4429
4430         root = root->reloc_root;
4431         BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4432         /*
4433          * relocation is in the stage of merging trees. the space
4434          * used by merging a reloc tree is twice the size of
4435          * relocated tree nodes in the worst case. half for cowing
4436          * the reloc tree, half for cowing the fs tree. the space
4437          * used by cowing the reloc tree will be freed after the
4438          * tree is dropped. if we create snapshot, cowing the fs
4439          * tree may use more space than it frees. so we need
4440          * reserve extra space.
4441          */
4442         *bytes_to_reserve += rc->nodes_relocated;
4443 }
4444
4445 /*
4446  * called after snapshot is created. migrate block reservation
4447  * and create reloc root for the newly created snapshot
4448  *
4449  * This is similar to btrfs_init_reloc_root(), we come out of here with two
4450  * references held on the reloc_root, one for root->reloc_root and one for
4451  * rc->reloc_roots.
4452  */
4453 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4454                                struct btrfs_pending_snapshot *pending)
4455 {
4456         struct btrfs_root *root = pending->root;
4457         struct btrfs_root *reloc_root;
4458         struct btrfs_root *new_root;
4459         struct reloc_control *rc = root->fs_info->reloc_ctl;
4460         int ret;
4461
4462         if (!rc || !have_reloc_root(root))
4463                 return 0;
4464
4465         rc = root->fs_info->reloc_ctl;
4466         rc->merging_rsv_size += rc->nodes_relocated;
4467
4468         if (rc->merge_reloc_tree) {
4469                 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4470                                               rc->block_rsv,
4471                                               rc->nodes_relocated, true);
4472                 if (ret)
4473                         return ret;
4474         }
4475
4476         new_root = pending->snap;
4477         reloc_root = create_reloc_root(trans, root->reloc_root,
4478                                        new_root->root_key.objectid);
4479         if (IS_ERR(reloc_root))
4480                 return PTR_ERR(reloc_root);
4481
4482         ret = __add_reloc_root(reloc_root);
4483         ASSERT(ret != -EEXIST);
4484         if (ret) {
4485                 /* Pairs with create_reloc_root */
4486                 btrfs_put_root(reloc_root);
4487                 return ret;
4488         }
4489         new_root->reloc_root = btrfs_grab_root(reloc_root);
4490
4491         if (rc->create_reloc_tree)
4492                 ret = clone_backref_node(trans, rc, root, reloc_root);
4493         return ret;
4494 }