fs/btrfs/extent-tree.c [platform/kernel/linux-rpi.git]
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "ctree.h"
20 #include "extent-tree.h"
21 #include "tree-log.h"
22 #include "disk-io.h"
23 #include "print-tree.h"
24 #include "volumes.h"
25 #include "raid56.h"
26 #include "locking.h"
27 #include "free-space-cache.h"
28 #include "free-space-tree.h"
29 #include "sysfs.h"
30 #include "qgroup.h"
31 #include "ref-verify.h"
32 #include "space-info.h"
33 #include "block-rsv.h"
34 #include "delalloc-space.h"
35 #include "discard.h"
36 #include "rcu-string.h"
37 #include "zoned.h"
38 #include "dev-replace.h"
39 #include "fs.h"
40 #include "accessors.h"
41 #include "root-tree.h"
42 #include "file-item.h"
43 #include "orphan.h"
44 #include "tree-checker.h"
45
46 #undef SCRAMBLE_DELAYED_REFS
47
48
49 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
50                                struct btrfs_delayed_ref_node *node, u64 parent,
51                                u64 root_objectid, u64 owner_objectid,
52                                u64 owner_offset, int refs_to_drop,
53                                struct btrfs_delayed_extent_op *extra_op);
54 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
55                                     struct extent_buffer *leaf,
56                                     struct btrfs_extent_item *ei);
57 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
58                                       u64 parent, u64 root_objectid,
59                                       u64 flags, u64 owner, u64 offset,
60                                       struct btrfs_key *ins, int ref_mod);
61 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
62                                      struct btrfs_delayed_ref_node *node,
63                                      struct btrfs_delayed_extent_op *extent_op);
64 static int find_next_key(struct btrfs_path *path, int level,
65                          struct btrfs_key *key);
66
67 static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
68 {
69         return (cache->flags & bits) == bits;
70 }
71
72 /* simple helper to search for an existing data extent at a given offset */
73 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
74 {
75         struct btrfs_root *root = btrfs_extent_root(fs_info, start);
76         int ret;
77         struct btrfs_key key;
78         struct btrfs_path *path;
79
80         path = btrfs_alloc_path();
81         if (!path)
82                 return -ENOMEM;
83
84         key.objectid = start;
85         key.offset = len;
86         key.type = BTRFS_EXTENT_ITEM_KEY;
87         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
88         btrfs_free_path(path);
89         return ret;
90 }
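
/*
 * Illustrative sketch (not part of the original source): how a caller might
 * use btrfs_lookup_data_extent() above.  It simply forwards the result of
 * btrfs_search_slot(), so 0 means the exact EXTENT_ITEM was found, a positive
 * value means it was not, and a negative value is an error (e.g. -ENOMEM).
 * The helper name below is hypothetical.
 */
static bool __maybe_unused example_data_extent_exists(struct btrfs_fs_info *fs_info,
						      u64 start, u64 len)
{
	int ret = btrfs_lookup_data_extent(fs_info, start, len);

	/* Treat lookup errors as "not found" for this sketch. */
	return ret == 0;
}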
91
92 /*
93  * Helper function to look up the reference count and flags of a tree block.
94  *
95  * The head node for a delayed ref is used to store the sum of all the
96  * reference count modifications queued up in the rbtree. The head
97  * node may also store the extent flags to set. This way you can check
98  * what the reference count and extent flags would be once all of
99  * the delayed refs have been processed.
100  */
101 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
102                              struct btrfs_fs_info *fs_info, u64 bytenr,
103                              u64 offset, int metadata, u64 *refs, u64 *flags)
104 {
105         struct btrfs_root *extent_root;
106         struct btrfs_delayed_ref_head *head;
107         struct btrfs_delayed_ref_root *delayed_refs;
108         struct btrfs_path *path;
109         struct btrfs_extent_item *ei;
110         struct extent_buffer *leaf;
111         struct btrfs_key key;
112         u32 item_size;
113         u64 num_refs;
114         u64 extent_flags;
115         int ret;
116
117         /*
118          * If we don't have skinny metadata, don't bother doing anything
119          * different
120          */
121         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
122                 offset = fs_info->nodesize;
123                 metadata = 0;
124         }
125
126         path = btrfs_alloc_path();
127         if (!path)
128                 return -ENOMEM;
129
130         if (!trans) {
131                 path->skip_locking = 1;
132                 path->search_commit_root = 1;
133         }
134
135 search_again:
136         key.objectid = bytenr;
137         key.offset = offset;
138         if (metadata)
139                 key.type = BTRFS_METADATA_ITEM_KEY;
140         else
141                 key.type = BTRFS_EXTENT_ITEM_KEY;
142
143         extent_root = btrfs_extent_root(fs_info, bytenr);
144         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
145         if (ret < 0)
146                 goto out_free;
147
148         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
149                 if (path->slots[0]) {
150                         path->slots[0]--;
151                         btrfs_item_key_to_cpu(path->nodes[0], &key,
152                                               path->slots[0]);
153                         if (key.objectid == bytenr &&
154                             key.type == BTRFS_EXTENT_ITEM_KEY &&
155                             key.offset == fs_info->nodesize)
156                                 ret = 0;
157                 }
158         }
159
160         if (ret == 0) {
161                 leaf = path->nodes[0];
162                 item_size = btrfs_item_size(leaf, path->slots[0]);
163                 if (item_size >= sizeof(*ei)) {
164                         ei = btrfs_item_ptr(leaf, path->slots[0],
165                                             struct btrfs_extent_item);
166                         num_refs = btrfs_extent_refs(leaf, ei);
167                         extent_flags = btrfs_extent_flags(leaf, ei);
168                 } else {
169                         ret = -EUCLEAN;
170                         btrfs_err(fs_info,
171                         "unexpected extent item size, has %u expect >= %zu",
172                                   item_size, sizeof(*ei));
173                         if (trans)
174                                 btrfs_abort_transaction(trans, ret);
175                         else
176                                 btrfs_handle_fs_error(fs_info, ret, NULL);
177
178                         goto out_free;
179                 }
180
181                 BUG_ON(num_refs == 0);
182         } else {
183                 num_refs = 0;
184                 extent_flags = 0;
185                 ret = 0;
186         }
187
188         if (!trans)
189                 goto out;
190
191         delayed_refs = &trans->transaction->delayed_refs;
192         spin_lock(&delayed_refs->lock);
193         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
194         if (head) {
195                 if (!mutex_trylock(&head->mutex)) {
196                         refcount_inc(&head->refs);
197                         spin_unlock(&delayed_refs->lock);
198
199                         btrfs_release_path(path);
200
201                         /*
202                          * Mutex was contended, block until it's released and try
203                          * again
204                          */
205                         mutex_lock(&head->mutex);
206                         mutex_unlock(&head->mutex);
207                         btrfs_put_delayed_ref_head(head);
208                         goto search_again;
209                 }
210                 spin_lock(&head->lock);
211                 if (head->extent_op && head->extent_op->update_flags)
212                         extent_flags |= head->extent_op->flags_to_set;
213                 else
214                         BUG_ON(num_refs == 0);
215
216                 num_refs += head->ref_mod;
217                 spin_unlock(&head->lock);
218                 mutex_unlock(&head->mutex);
219         }
220         spin_unlock(&delayed_refs->lock);
221 out:
222         WARN_ON(num_refs == 0);
223         if (refs)
224                 *refs = num_refs;
225         if (flags)
226                 *flags = extent_flags;
227 out_free:
228         btrfs_free_path(path);
229         return ret;
230 }
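
/*
 * Illustrative sketch (not part of the original source): querying the
 * reference count and flags of a data extent with btrfs_lookup_extent_info()
 * above.  For data extents @offset is the extent length and @metadata is 0;
 * when @trans is non-NULL the pending delayed ref modifications are folded
 * into the returned count.  The helper name below is hypothetical.
 */
static int __maybe_unused example_data_extent_refs(struct btrfs_trans_handle *trans,
						   struct btrfs_fs_info *fs_info,
						   u64 bytenr, u64 num_bytes,
						   u64 *refs, u64 *flags)
{
	return btrfs_lookup_extent_info(trans, fs_info, bytenr, num_bytes,
					0 /* metadata */, refs, flags);
}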
231
232 /*
233  * Back reference rules.  Back refs have three main goals:
234  *
235  * 1) differentiate between all holders of references to an extent so that
236  *    when a reference is dropped we can make sure it was a valid reference
237  *    before freeing the extent.
238  *
239  * 2) Provide enough information to quickly find the holders of an extent
240  *    if we notice a given block is corrupted or bad.
241  *
242  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
243  *    maintenance.  This is actually the same as #2, but with a slightly
244  *    different use case.
245  *
246  * There are two kinds of back refs. Implicit back refs are optimized
247  * for pointers in non-shared tree blocks. For a given pointer in a block,
248  * back refs of this kind provide information about the block's owner tree
249  * and the pointer's key. This information allows us to find the block by
250  * b-tree searching. Full back refs are for pointers in tree blocks not
251  * referenced by their owner trees. The location of the tree block is
252  * recorded in the back refs. Full back refs are actually generic and can
253  * be used in all cases where implicit back refs are used. The major
254  * shortcoming of full back refs is their overhead: every time a tree block
255  * gets COWed, we have to update the back ref entries for all pointers in it.
256  *
257  * For a newly allocated tree block, we use implicit back refs for
258  * pointers in it. This means most tree related operations only involve
259  * implicit back refs. For a tree block created in an old transaction, the
260  * only way to drop a reference to it is to COW it. So we can detect the
261  * event that a tree block loses its owner tree's reference and do the
262  * back refs conversion.
263  *
264  * When a tree block is COWed through a tree, there are four cases:
265  *
266  * The reference count of the block is one and the tree is the block's
267  * owner tree. Nothing to do in this case.
268  *
269  * The reference count of the block is one and the tree is not the
270  * block's owner tree. In this case, full back refs are used for pointers
271  * in the block. Remove these full back refs, add implicit back refs for
272  * every pointer in the new block.
273  *
274  * The reference count of the block is greater than one and the tree is
275  * the block's owner tree. In this case, implicit back refs are used for
276  * pointers in the block. Add full back refs for every pointer in the
277  * block, increase lower level extents' reference counts. The original
278  * implicit back refs are inherited by the new block.
279  *
280  * The reference count of the block is greater than one and the tree is
281  * not the block's owner tree. Add implicit back refs for every pointer in
282  * the new block, increase lower level extents' reference counts.
283  *
284  * Back Reference Key composing:
285  *
286  * The key objectid corresponds to the first byte in the extent.
287  * The key type is used to differentiate between types of back refs.
288  * There are different meanings of the key offset for different types
289  * of back refs.
290  *
291  * File extents can be referenced by:
292  *
293  * - multiple snapshots, subvolumes, or different generations in one subvol
294  * - different files inside a single subvolume
295  * - different offsets inside a file (bookend extents in file.c)
296  *
297  * The extent ref structure for the implicit back refs has fields for:
298  *
299  * - objectid of the subvolume root
300  * - objectid of the file holding the reference
301  * - original offset in the file
302  * - how many bookend extents
303  *
304  * The key offset for the implicit back refs is a hash of the first
305  * three fields.
306  *
307  * The extent ref structure for the full back refs has a field for:
308  *
309  * - number of pointers in the tree leaf
310  *
311  * The key offset for the full back refs is the first byte of
312  * the tree leaf.
313  *
314  * When a file extent is allocated, the implicit back refs are used
315  * and the fields are filled in:
316  *
317  *     (root_key.objectid, inode objectid, offset in file, 1)
318  *
319  * When a file extent is removed during file truncation, we find the
320  * corresponding implicit back refs and check the following fields:
321  *
322  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
323  *
324  * Btree extents can be referenced by:
325  *
326  * - Different subvolumes
327  *
328  * Both the implicit back refs and the full back refs for tree blocks
329  * consist only of a key. The key offset for the implicit back refs is the
330  * objectid of the block's owner tree. The key offset for the full back refs
331  * is the first byte of the parent block.
332  *
333  * When implicit back refs are used, information about the lowest key and
334  * level of the tree block is required. This information is stored in the
335  * tree block info structure.
336  */
337
338 /*
339  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
340  * is_data == BTRFS_REF_TYPE_DATA, data type is required,
341  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
342  */
343 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
344                                      struct btrfs_extent_inline_ref *iref,
345                                      enum btrfs_inline_ref_type is_data)
346 {
347         int type = btrfs_extent_inline_ref_type(eb, iref);
348         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
349
350         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
351             type == BTRFS_SHARED_BLOCK_REF_KEY ||
352             type == BTRFS_SHARED_DATA_REF_KEY ||
353             type == BTRFS_EXTENT_DATA_REF_KEY) {
354                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
355                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
356                                 return type;
357                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
358                                 ASSERT(eb->fs_info);
359                                 /*
360                                  * Every shared one has parent tree block,
361                                  * which must be aligned to sector size.
362                                  */
363                                 if (offset &&
364                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
365                                         return type;
366                         }
367                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
368                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
369                                 return type;
370                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
371                                 ASSERT(eb->fs_info);
372                                 /*
373                                  * Every shared one has parent tree block,
374                                  * which must be aligned to sector size.
375                                  */
376                                 if (offset &&
377                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
378                                         return type;
379                         }
380                 } else {
381                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
382                         return type;
383                 }
384         }
385
386         WARN_ON(1);
387         btrfs_print_leaf(eb);
388         btrfs_err(eb->fs_info,
389                   "eb %llu iref 0x%lx invalid extent inline ref type %d",
390                   eb->start, (unsigned long)iref, type);
391
392         return BTRFS_REF_TYPE_INVALID;
393 }
394
395 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
396 {
397         u32 high_crc = ~(u32)0;
398         u32 low_crc = ~(u32)0;
399         __le64 lenum;
400
401         lenum = cpu_to_le64(root_objectid);
402         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
403         lenum = cpu_to_le64(owner);
404         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
405         lenum = cpu_to_le64(offset);
406         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
407
408         return ((u64)high_crc << 31) ^ (u64)low_crc;
409 }
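
/*
 * Illustrative sketch (not part of the original source): composing the key of
 * an implicit data back ref, as described by the "Back reference rules"
 * comment above and used by lookup_extent_data_ref() below.  The key offset
 * is the hash of (subvolume root objectid, inode objectid, file offset).
 * The helper name below is hypothetical.
 */
static void __maybe_unused example_data_ref_key(struct btrfs_key *key,
						u64 extent_bytenr, u64 root_objectid,
						u64 ino, u64 file_offset)
{
	key->objectid = extent_bytenr;
	key->type = BTRFS_EXTENT_DATA_REF_KEY;
	key->offset = hash_extent_data_ref(root_objectid, ino, file_offset);
}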
410
411 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
412                                      struct btrfs_extent_data_ref *ref)
413 {
414         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
415                                     btrfs_extent_data_ref_objectid(leaf, ref),
416                                     btrfs_extent_data_ref_offset(leaf, ref));
417 }
418
419 static int match_extent_data_ref(struct extent_buffer *leaf,
420                                  struct btrfs_extent_data_ref *ref,
421                                  u64 root_objectid, u64 owner, u64 offset)
422 {
423         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
424             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
425             btrfs_extent_data_ref_offset(leaf, ref) != offset)
426                 return 0;
427         return 1;
428 }
429
430 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
431                                            struct btrfs_path *path,
432                                            u64 bytenr, u64 parent,
433                                            u64 root_objectid,
434                                            u64 owner, u64 offset)
435 {
436         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
437         struct btrfs_key key;
438         struct btrfs_extent_data_ref *ref;
439         struct extent_buffer *leaf;
440         u32 nritems;
441         int ret;
442         int recow;
443         int err = -ENOENT;
444
445         key.objectid = bytenr;
446         if (parent) {
447                 key.type = BTRFS_SHARED_DATA_REF_KEY;
448                 key.offset = parent;
449         } else {
450                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
451                 key.offset = hash_extent_data_ref(root_objectid,
452                                                   owner, offset);
453         }
454 again:
455         recow = 0;
456         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
457         if (ret < 0) {
458                 err = ret;
459                 goto fail;
460         }
461
462         if (parent) {
463                 if (!ret)
464                         return 0;
465                 goto fail;
466         }
467
468         leaf = path->nodes[0];
469         nritems = btrfs_header_nritems(leaf);
470         while (1) {
471                 if (path->slots[0] >= nritems) {
472                         ret = btrfs_next_leaf(root, path);
473                         if (ret < 0)
474                                 err = ret;
475                         if (ret)
476                                 goto fail;
477
478                         leaf = path->nodes[0];
479                         nritems = btrfs_header_nritems(leaf);
480                         recow = 1;
481                 }
482
483                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
484                 if (key.objectid != bytenr ||
485                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
486                         goto fail;
487
488                 ref = btrfs_item_ptr(leaf, path->slots[0],
489                                      struct btrfs_extent_data_ref);
490
491                 if (match_extent_data_ref(leaf, ref, root_objectid,
492                                           owner, offset)) {
493                         if (recow) {
494                                 btrfs_release_path(path);
495                                 goto again;
496                         }
497                         err = 0;
498                         break;
499                 }
500                 path->slots[0]++;
501         }
502 fail:
503         return err;
504 }
505
506 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
507                                            struct btrfs_path *path,
508                                            u64 bytenr, u64 parent,
509                                            u64 root_objectid, u64 owner,
510                                            u64 offset, int refs_to_add)
511 {
512         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
513         struct btrfs_key key;
514         struct extent_buffer *leaf;
515         u32 size;
516         u32 num_refs;
517         int ret;
518
519         key.objectid = bytenr;
520         if (parent) {
521                 key.type = BTRFS_SHARED_DATA_REF_KEY;
522                 key.offset = parent;
523                 size = sizeof(struct btrfs_shared_data_ref);
524         } else {
525                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
526                 key.offset = hash_extent_data_ref(root_objectid,
527                                                   owner, offset);
528                 size = sizeof(struct btrfs_extent_data_ref);
529         }
530
531         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
532         if (ret && ret != -EEXIST)
533                 goto fail;
534
535         leaf = path->nodes[0];
536         if (parent) {
537                 struct btrfs_shared_data_ref *ref;
538                 ref = btrfs_item_ptr(leaf, path->slots[0],
539                                      struct btrfs_shared_data_ref);
540                 if (ret == 0) {
541                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
542                 } else {
543                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
544                         num_refs += refs_to_add;
545                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
546                 }
547         } else {
548                 struct btrfs_extent_data_ref *ref;
549                 while (ret == -EEXIST) {
550                         ref = btrfs_item_ptr(leaf, path->slots[0],
551                                              struct btrfs_extent_data_ref);
552                         if (match_extent_data_ref(leaf, ref, root_objectid,
553                                                   owner, offset))
554                                 break;
555                         btrfs_release_path(path);
556                         key.offset++;
557                         ret = btrfs_insert_empty_item(trans, root, path, &key,
558                                                       size);
559                         if (ret && ret != -EEXIST)
560                                 goto fail;
561
562                         leaf = path->nodes[0];
563                 }
564                 ref = btrfs_item_ptr(leaf, path->slots[0],
565                                      struct btrfs_extent_data_ref);
566                 if (ret == 0) {
567                         btrfs_set_extent_data_ref_root(leaf, ref,
568                                                        root_objectid);
569                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
570                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
571                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
572                 } else {
573                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
574                         num_refs += refs_to_add;
575                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
576                 }
577         }
578         btrfs_mark_buffer_dirty(trans, leaf);
579         ret = 0;
580 fail:
581         btrfs_release_path(path);
582         return ret;
583 }
584
585 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
586                                            struct btrfs_root *root,
587                                            struct btrfs_path *path,
588                                            int refs_to_drop)
589 {
590         struct btrfs_key key;
591         struct btrfs_extent_data_ref *ref1 = NULL;
592         struct btrfs_shared_data_ref *ref2 = NULL;
593         struct extent_buffer *leaf;
594         u32 num_refs = 0;
595         int ret = 0;
596
597         leaf = path->nodes[0];
598         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
599
600         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
601                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
602                                       struct btrfs_extent_data_ref);
603                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
604         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
605                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
606                                       struct btrfs_shared_data_ref);
607                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
608         } else {
609                 btrfs_err(trans->fs_info,
610                           "unrecognized backref key (%llu %u %llu)",
611                           key.objectid, key.type, key.offset);
612                 btrfs_abort_transaction(trans, -EUCLEAN);
613                 return -EUCLEAN;
614         }
615
616         BUG_ON(num_refs < refs_to_drop);
617         num_refs -= refs_to_drop;
618
619         if (num_refs == 0) {
620                 ret = btrfs_del_item(trans, root, path);
621         } else {
622                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
623                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
624                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
625                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
626                 btrfs_mark_buffer_dirty(trans, leaf);
627         }
628         return ret;
629 }
630
631 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
632                                           struct btrfs_extent_inline_ref *iref)
633 {
634         struct btrfs_key key;
635         struct extent_buffer *leaf;
636         struct btrfs_extent_data_ref *ref1;
637         struct btrfs_shared_data_ref *ref2;
638         u32 num_refs = 0;
639         int type;
640
641         leaf = path->nodes[0];
642         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
643
644         if (iref) {
645                 /*
646                  * If type is invalid, we should have bailed out earlier than
647                  * this call.
648                  */
649                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
650                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
651                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
652                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
653                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
654                 } else {
655                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
656                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
657                 }
658         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
659                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
660                                       struct btrfs_extent_data_ref);
661                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
662         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
663                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
664                                       struct btrfs_shared_data_ref);
665                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
666         } else {
667                 WARN_ON(1);
668         }
669         return num_refs;
670 }
671
672 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
673                                           struct btrfs_path *path,
674                                           u64 bytenr, u64 parent,
675                                           u64 root_objectid)
676 {
677         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
678         struct btrfs_key key;
679         int ret;
680
681         key.objectid = bytenr;
682         if (parent) {
683                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
684                 key.offset = parent;
685         } else {
686                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
687                 key.offset = root_objectid;
688         }
689
690         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
691         if (ret > 0)
692                 ret = -ENOENT;
693         return ret;
694 }
695
696 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
697                                           struct btrfs_path *path,
698                                           u64 bytenr, u64 parent,
699                                           u64 root_objectid)
700 {
701         struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
702         struct btrfs_key key;
703         int ret;
704
705         key.objectid = bytenr;
706         if (parent) {
707                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
708                 key.offset = parent;
709         } else {
710                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
711                 key.offset = root_objectid;
712         }
713
714         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
715         btrfs_release_path(path);
716         return ret;
717 }
718
719 static inline int extent_ref_type(u64 parent, u64 owner)
720 {
721         int type;
722         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
723                 if (parent > 0)
724                         type = BTRFS_SHARED_BLOCK_REF_KEY;
725                 else
726                         type = BTRFS_TREE_BLOCK_REF_KEY;
727         } else {
728                 if (parent > 0)
729                         type = BTRFS_SHARED_DATA_REF_KEY;
730                 else
731                         type = BTRFS_EXTENT_DATA_REF_KEY;
732         }
733         return type;
734 }
735
736 static int find_next_key(struct btrfs_path *path, int level,
737                          struct btrfs_key *key)
738
739 {
740         for (; level < BTRFS_MAX_LEVEL; level++) {
741                 if (!path->nodes[level])
742                         break;
743                 if (path->slots[level] + 1 >=
744                     btrfs_header_nritems(path->nodes[level]))
745                         continue;
746                 if (level == 0)
747                         btrfs_item_key_to_cpu(path->nodes[level], key,
748                                               path->slots[level] + 1);
749                 else
750                         btrfs_node_key_to_cpu(path->nodes[level], key,
751                                               path->slots[level] + 1);
752                 return 0;
753         }
754         return 1;
755 }
756
757 /*
758  * Look for an inline back ref. If the back ref is found, *ref_ret is set
759  * to the address of the inline back ref, and 0 is returned.
760  *
761  * If the back ref isn't found, *ref_ret is set to the address where it
762  * should be inserted, and -ENOENT is returned.
763  *
764  * If insert is true and there are too many inline back refs, the path
765  * points to the extent item, and -EAGAIN is returned.
766  *
767  * NOTE: inline back refs are ordered in the same way that back ref
768  *       items in the tree are ordered.
769  */
770 static noinline_for_stack
771 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
772                                  struct btrfs_path *path,
773                                  struct btrfs_extent_inline_ref **ref_ret,
774                                  u64 bytenr, u64 num_bytes,
775                                  u64 parent, u64 root_objectid,
776                                  u64 owner, u64 offset, int insert)
777 {
778         struct btrfs_fs_info *fs_info = trans->fs_info;
779         struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
780         struct btrfs_key key;
781         struct extent_buffer *leaf;
782         struct btrfs_extent_item *ei;
783         struct btrfs_extent_inline_ref *iref;
784         u64 flags;
785         u64 item_size;
786         unsigned long ptr;
787         unsigned long end;
788         int extra_size;
789         int type;
790         int want;
791         int ret;
792         int err = 0;
793         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
794         int needed;
795
796         key.objectid = bytenr;
797         key.type = BTRFS_EXTENT_ITEM_KEY;
798         key.offset = num_bytes;
799
800         want = extent_ref_type(parent, owner);
801         if (insert) {
802                 extra_size = btrfs_extent_inline_ref_size(want);
803                 path->search_for_extension = 1;
804                 path->keep_locks = 1;
805         } else
806                 extra_size = -1;
807
808         /*
809          * Owner is our level, so we can just add one to get the level for the
810          * block we are interested in.
811          */
812         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
813                 key.type = BTRFS_METADATA_ITEM_KEY;
814                 key.offset = owner;
815         }
816
817 again:
818         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
819         if (ret < 0) {
820                 err = ret;
821                 goto out;
822         }
823
824         /*
825          * We may be a newly converted file system which still has the old fat
826          * extent entries for metadata, so try and see if we have one of those.
827          */
828         if (ret > 0 && skinny_metadata) {
829                 skinny_metadata = false;
830                 if (path->slots[0]) {
831                         path->slots[0]--;
832                         btrfs_item_key_to_cpu(path->nodes[0], &key,
833                                               path->slots[0]);
834                         if (key.objectid == bytenr &&
835                             key.type == BTRFS_EXTENT_ITEM_KEY &&
836                             key.offset == num_bytes)
837                                 ret = 0;
838                 }
839                 if (ret) {
840                         key.objectid = bytenr;
841                         key.type = BTRFS_EXTENT_ITEM_KEY;
842                         key.offset = num_bytes;
843                         btrfs_release_path(path);
844                         goto again;
845                 }
846         }
847
848         if (ret && !insert) {
849                 err = -ENOENT;
850                 goto out;
851         } else if (WARN_ON(ret)) {
852                 btrfs_print_leaf(path->nodes[0]);
853                 btrfs_err(fs_info,
854 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
855                           bytenr, num_bytes, parent, root_objectid, owner,
856                           offset);
857                 err = -EIO;
858                 goto out;
859         }
860
861         leaf = path->nodes[0];
862         item_size = btrfs_item_size(leaf, path->slots[0]);
863         if (unlikely(item_size < sizeof(*ei))) {
864                 err = -EUCLEAN;
865                 btrfs_err(fs_info,
866                           "unexpected extent item size, has %llu expect >= %zu",
867                           item_size, sizeof(*ei));
868                 btrfs_abort_transaction(trans, err);
869                 goto out;
870         }
871
872         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
873         flags = btrfs_extent_flags(leaf, ei);
874
875         ptr = (unsigned long)(ei + 1);
876         end = (unsigned long)ei + item_size;
877
878         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
879                 ptr += sizeof(struct btrfs_tree_block_info);
880                 BUG_ON(ptr > end);
881         }
882
883         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
884                 needed = BTRFS_REF_TYPE_DATA;
885         else
886                 needed = BTRFS_REF_TYPE_BLOCK;
887
888         err = -ENOENT;
889         while (1) {
890                 if (ptr >= end) {
891                         if (ptr > end) {
892                                 err = -EUCLEAN;
893                                 btrfs_print_leaf(path->nodes[0]);
894                                 btrfs_crit(fs_info,
895 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
896                                         path->slots[0], root_objectid, owner, offset, parent);
897                         }
898                         break;
899                 }
900                 iref = (struct btrfs_extent_inline_ref *)ptr;
901                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
902                 if (type == BTRFS_REF_TYPE_INVALID) {
903                         err = -EUCLEAN;
904                         goto out;
905                 }
906
907                 if (want < type)
908                         break;
909                 if (want > type) {
910                         ptr += btrfs_extent_inline_ref_size(type);
911                         continue;
912                 }
913
914                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
915                         struct btrfs_extent_data_ref *dref;
916                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
917                         if (match_extent_data_ref(leaf, dref, root_objectid,
918                                                   owner, offset)) {
919                                 err = 0;
920                                 break;
921                         }
922                         if (hash_extent_data_ref_item(leaf, dref) <
923                             hash_extent_data_ref(root_objectid, owner, offset))
924                                 break;
925                 } else {
926                         u64 ref_offset;
927                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
928                         if (parent > 0) {
929                                 if (parent == ref_offset) {
930                                         err = 0;
931                                         break;
932                                 }
933                                 if (ref_offset < parent)
934                                         break;
935                         } else {
936                                 if (root_objectid == ref_offset) {
937                                         err = 0;
938                                         break;
939                                 }
940                                 if (ref_offset < root_objectid)
941                                         break;
942                         }
943                 }
944                 ptr += btrfs_extent_inline_ref_size(type);
945         }
946         if (err == -ENOENT && insert) {
947                 if (item_size + extra_size >=
948                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
949                         err = -EAGAIN;
950                         goto out;
951                 }
952                 /*
953                  * To add a new inline back ref, we have to make sure
954                  * there is no corresponding back ref item.
955                  * For simplicity, we just do not add a new inline back
956                  * ref if there is any kind of item for this block.
957                  */
958                 if (find_next_key(path, 0, &key) == 0 &&
959                     key.objectid == bytenr &&
960                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
961                         err = -EAGAIN;
962                         goto out;
963                 }
964         }
965         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
966 out:
967         if (insert) {
968                 path->keep_locks = 0;
969                 path->search_for_extension = 0;
970                 btrfs_unlock_up_safe(path, 1);
971         }
972         return err;
973 }
974
975 /*
976  * helper to add new inline back ref
977  */
978 static noinline_for_stack
979 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
980                                  struct btrfs_path *path,
981                                  struct btrfs_extent_inline_ref *iref,
982                                  u64 parent, u64 root_objectid,
983                                  u64 owner, u64 offset, int refs_to_add,
984                                  struct btrfs_delayed_extent_op *extent_op)
985 {
986         struct extent_buffer *leaf;
987         struct btrfs_extent_item *ei;
988         unsigned long ptr;
989         unsigned long end;
990         unsigned long item_offset;
991         u64 refs;
992         int size;
993         int type;
994
995         leaf = path->nodes[0];
996         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
997         item_offset = (unsigned long)iref - (unsigned long)ei;
998
999         type = extent_ref_type(parent, owner);
1000         size = btrfs_extent_inline_ref_size(type);
1001
1002         btrfs_extend_item(trans, path, size);
1003
1004         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1005         refs = btrfs_extent_refs(leaf, ei);
1006         refs += refs_to_add;
1007         btrfs_set_extent_refs(leaf, ei, refs);
1008         if (extent_op)
1009                 __run_delayed_extent_op(extent_op, leaf, ei);
1010
1011         ptr = (unsigned long)ei + item_offset;
1012         end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
1013         if (ptr < end - size)
1014                 memmove_extent_buffer(leaf, ptr + size, ptr,
1015                                       end - size - ptr);
1016
1017         iref = (struct btrfs_extent_inline_ref *)ptr;
1018         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1019         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1020                 struct btrfs_extent_data_ref *dref;
1021                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1022                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1023                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1024                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1025                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1026         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1027                 struct btrfs_shared_data_ref *sref;
1028                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1029                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1030                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1031         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1032                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1033         } else {
1034                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1035         }
1036         btrfs_mark_buffer_dirty(trans, leaf);
1037 }
1038
1039 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1040                                  struct btrfs_path *path,
1041                                  struct btrfs_extent_inline_ref **ref_ret,
1042                                  u64 bytenr, u64 num_bytes, u64 parent,
1043                                  u64 root_objectid, u64 owner, u64 offset)
1044 {
1045         int ret;
1046
1047         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1048                                            num_bytes, parent, root_objectid,
1049                                            owner, offset, 0);
1050         if (ret != -ENOENT)
1051                 return ret;
1052
1053         btrfs_release_path(path);
1054         *ref_ret = NULL;
1055
1056         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1057                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1058                                             root_objectid);
1059         } else {
1060                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1061                                              root_objectid, owner, offset);
1062         }
1063         return ret;
1064 }
1065
1066 /*
1067  * helper to update/remove inline back ref
1068  */
1069 static noinline_for_stack int update_inline_extent_backref(
1070                                   struct btrfs_trans_handle *trans,
1071                                   struct btrfs_path *path,
1072                                   struct btrfs_extent_inline_ref *iref,
1073                                   int refs_to_mod,
1074                                   struct btrfs_delayed_extent_op *extent_op)
1075 {
1076         struct extent_buffer *leaf = path->nodes[0];
1077         struct btrfs_fs_info *fs_info = leaf->fs_info;
1078         struct btrfs_extent_item *ei;
1079         struct btrfs_extent_data_ref *dref = NULL;
1080         struct btrfs_shared_data_ref *sref = NULL;
1081         unsigned long ptr;
1082         unsigned long end;
1083         u32 item_size;
1084         int size;
1085         int type;
1086         u64 refs;
1087
1088         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1089         refs = btrfs_extent_refs(leaf, ei);
1090         if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) {
1091                 struct btrfs_key key;
1092                 u32 extent_size;
1093
1094                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1095                 if (key.type == BTRFS_METADATA_ITEM_KEY)
1096                         extent_size = fs_info->nodesize;
1097                 else
1098                         extent_size = key.offset;
1099                 btrfs_print_leaf(leaf);
1100                 btrfs_err(fs_info,
1101         "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu",
1102                           key.objectid, extent_size, refs_to_mod, refs);
1103                 return -EUCLEAN;
1104         }
1105         refs += refs_to_mod;
1106         btrfs_set_extent_refs(leaf, ei, refs);
1107         if (extent_op)
1108                 __run_delayed_extent_op(extent_op, leaf, ei);
1109
1110         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1111         /*
1112          * Function btrfs_get_extent_inline_ref_type() has already printed
1113          * error messages.
1114          */
1115         if (unlikely(type == BTRFS_REF_TYPE_INVALID))
1116                 return -EUCLEAN;
1117
1118         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1119                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1120                 refs = btrfs_extent_data_ref_count(leaf, dref);
1121         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1122                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1123                 refs = btrfs_shared_data_ref_count(leaf, sref);
1124         } else {
1125                 refs = 1;
1126                 /*
1127                  * For a tree block we can only drop one ref, and tree
1128                  * blocks should not have refs > 1.
1129                  *
1130                  * Furthermore, if we're inserting a new inline backref, we
1131                  * won't reach this path either. That would be
1132                  * setup_inline_extent_backref().
1133                  */
1134                 if (unlikely(refs_to_mod != -1)) {
1135                         struct btrfs_key key;
1136
1137                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1138
1139                         btrfs_print_leaf(leaf);
1140                         btrfs_err(fs_info,
1141                         "invalid refs_to_mod for tree block %llu, has %d expect -1",
1142                                   key.objectid, refs_to_mod);
1143                         return -EUCLEAN;
1144                 }
1145         }
1146
1147         if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) {
1148                 struct btrfs_key key;
1149                 u32 extent_size;
1150
1151                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1152                 if (key.type == BTRFS_METADATA_ITEM_KEY)
1153                         extent_size = fs_info->nodesize;
1154                 else
1155                         extent_size = key.offset;
1156                 btrfs_print_leaf(leaf);
1157                 btrfs_err(fs_info,
1158 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu",
1159                           (unsigned long)iref, key.objectid, extent_size,
1160                           refs_to_mod, refs);
1161                 return -EUCLEAN;
1162         }
1163         refs += refs_to_mod;
1164
1165         if (refs > 0) {
1166                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1167                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1168                 else
1169                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1170         } else {
1171                 size =  btrfs_extent_inline_ref_size(type);
1172                 item_size = btrfs_item_size(leaf, path->slots[0]);
1173                 ptr = (unsigned long)iref;
1174                 end = (unsigned long)ei + item_size;
1175                 if (ptr + size < end)
1176                         memmove_extent_buffer(leaf, ptr, ptr + size,
1177                                               end - ptr - size);
1178                 item_size -= size;
1179                 btrfs_truncate_item(trans, path, item_size, 1);
1180         }
1181         btrfs_mark_buffer_dirty(trans, leaf);
1182         return 0;
1183 }
1184
1185 static noinline_for_stack
1186 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1187                                  struct btrfs_path *path,
1188                                  u64 bytenr, u64 num_bytes, u64 parent,
1189                                  u64 root_objectid, u64 owner,
1190                                  u64 offset, int refs_to_add,
1191                                  struct btrfs_delayed_extent_op *extent_op)
1192 {
1193         struct btrfs_extent_inline_ref *iref;
1194         int ret;
1195
1196         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1197                                            num_bytes, parent, root_objectid,
1198                                            owner, offset, 1);
1199         if (ret == 0) {
1200                 /*
1201                  * We're adding refs to a tree block we already own; this
1202                  * should not happen at all.
1203                  */
1204                 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1205                         btrfs_print_leaf(path->nodes[0]);
1206                         btrfs_crit(trans->fs_info,
1207 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u",
1208                                    bytenr, num_bytes, root_objectid, path->slots[0]);
1209                         return -EUCLEAN;
1210                 }
1211                 ret = update_inline_extent_backref(trans, path, iref,
1212                                                    refs_to_add, extent_op);
1213         } else if (ret == -ENOENT) {
1214                 setup_inline_extent_backref(trans, path, iref, parent,
1215                                             root_objectid, owner, offset,
1216                                             refs_to_add, extent_op);
1217                 ret = 0;
1218         }
1219         return ret;
1220 }
1221
1222 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1223                                  struct btrfs_root *root,
1224                                  struct btrfs_path *path,
1225                                  struct btrfs_extent_inline_ref *iref,
1226                                  int refs_to_drop, int is_data)
1227 {
1228         int ret = 0;
1229
1230         BUG_ON(!is_data && refs_to_drop != 1);
1231         if (iref)
1232                 ret = update_inline_extent_backref(trans, path, iref,
1233                                                    -refs_to_drop, NULL);
1234         else if (is_data)
1235                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1236         else
1237                 ret = btrfs_del_item(trans, root, path);
1238         return ret;
1239 }
1240
1241 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1242                                u64 *discarded_bytes)
1243 {
1244         int j, ret = 0;
1245         u64 bytes_left, end;
1246         u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
1247
1248         /* Adjust the range to be aligned to 512B sectors if necessary. */
1249         if (start != aligned_start) {
1250                 len -= aligned_start - start;
1251                 len = round_down(len, 1 << SECTOR_SHIFT);
1252                 start = aligned_start;
1253         }
1254
1255         *discarded_bytes = 0;
1256
1257         if (!len)
1258                 return 0;
1259
1260         end = start + len;
1261         bytes_left = len;
1262
1263         /* Skip any superblocks on this device. */
1264         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1265                 u64 sb_start = btrfs_sb_offset(j);
1266                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1267                 u64 size = sb_start - start;
1268
1269                 if (!in_range(sb_start, start, bytes_left) &&
1270                     !in_range(sb_end, start, bytes_left) &&
1271                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1272                         continue;
1273
1274                 /*
1275                  * Superblock spans beginning of range.  Adjust start and
1276                  * try again.
1277                  */
1278                 if (sb_start <= start) {
1279                         start += sb_end - start;
1280                         if (start > end) {
1281                                 bytes_left = 0;
1282                                 break;
1283                         }
1284                         bytes_left = end - start;
1285                         continue;
1286                 }
1287
1288                 if (size) {
1289                         ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1290                                                    size >> SECTOR_SHIFT,
1291                                                    GFP_NOFS);
1292                         if (!ret)
1293                                 *discarded_bytes += size;
1294                         else if (ret != -EOPNOTSUPP)
1295                                 return ret;
1296                 }
1297
1298                 start = sb_end;
1299                 if (start > end) {
1300                         bytes_left = 0;
1301                         break;
1302                 }
1303                 bytes_left = end - start;
1304         }
1305
1306         if (bytes_left) {
1307                 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
1308                                            bytes_left >> SECTOR_SHIFT,
1309                                            GFP_NOFS);
1310                 if (!ret)
1311                         *discarded_bytes += bytes_left;
1312         }
1313         return ret;
1314 }
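
/*
 * Worked example for the alignment step in btrfs_issue_discard() above (added
 * for illustration, not part of the original source): for start = 1000 and
 * len = 4096, aligned_start becomes ALIGN(1000, 512) = 1024, len is first
 * reduced by the 24 skipped bytes (4096 - 24 = 4072) and then rounded down to
 * a 512B multiple, so 3584 bytes are submitted for discard; superblock ranges
 * are still skipped on top of that.
 */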
1315
1316 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
1317 {
1318         struct btrfs_device *dev = stripe->dev;
1319         struct btrfs_fs_info *fs_info = dev->fs_info;
1320         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1321         u64 phys = stripe->physical;
1322         u64 len = stripe->length;
1323         u64 discarded = 0;
1324         int ret = 0;
1325
1326         /* Zone reset on a zoned filesystem */
1327         if (btrfs_can_zone_reset(dev, phys, len)) {
1328                 u64 src_disc;
1329
1330                 ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
1331                 if (ret)
1332                         goto out;
1333
1334                 if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
1335                     dev != dev_replace->srcdev)
1336                         goto out;
1337
1338                 src_disc = discarded;
1339
1340                 /* Send to replace target as well */
1341                 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
1342                                               &discarded);
1343                 discarded += src_disc;
1344         } else if (bdev_max_discard_sectors(stripe->dev->bdev)) {
1345                 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
1346         } else {
1347                 ret = 0;
1348                 *bytes = 0;
1349         }
1350
1351 out:
1352         *bytes = discarded;
1353         return ret;
1354 }
1355
1356 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1357                          u64 num_bytes, u64 *actual_bytes)
1358 {
1359         int ret = 0;
1360         u64 discarded_bytes = 0;
1361         u64 end = bytenr + num_bytes;
1362         u64 cur = bytenr;
1363
1364         /*
1365          * Avoid races with device replace and make sure the devices in the
1366          * stripes don't go away while we are discarding.
1367          */
1368         btrfs_bio_counter_inc_blocked(fs_info);
1369         while (cur < end) {
1370                 struct btrfs_discard_stripe *stripes;
1371                 unsigned int num_stripes;
1372                 int i;
1373
1374                 num_bytes = end - cur;
1375                 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
1376                 if (IS_ERR(stripes)) {
1377                         ret = PTR_ERR(stripes);
1378                         if (ret == -EOPNOTSUPP)
1379                                 ret = 0;
1380                         break;
1381                 }
1382
1383                 for (i = 0; i < num_stripes; i++) {
1384                         struct btrfs_discard_stripe *stripe = stripes + i;
1385                         u64 bytes;
1386
1387                         if (!stripe->dev->bdev) {
1388                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1389                                 continue;
1390                         }
1391
1392                         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
1393                                         &stripe->dev->dev_state))
1394                                 continue;
1395
1396                         ret = do_discard_extent(stripe, &bytes);
1397                         if (ret) {
1398                                 /*
1399                                  * Keep going if discard is not supported by the
1400                                  * device.
1401                                  */
1402                                 if (ret != -EOPNOTSUPP)
1403                                         break;
1404                                 ret = 0;
1405                         } else {
1406                                 discarded_bytes += bytes;
1407                         }
1408                 }
1409                 kfree(stripes);
1410                 if (ret)
1411                         break;
1412                 cur += num_bytes;
1413         }
1414         btrfs_bio_counter_dec(fs_info);
1415         if (actual_bytes)
1416                 *actual_bytes = discarded_bytes;
1417         return ret;
1418 }
1419
1420 /* Can return -ENOMEM */
1421 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1422                          struct btrfs_ref *generic_ref)
1423 {
1424         struct btrfs_fs_info *fs_info = trans->fs_info;
1425         int ret;
1426
1427         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1428                generic_ref->action);
1429         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1430                generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);
1431
1432         if (generic_ref->type == BTRFS_REF_METADATA)
1433                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
1434         else
1435                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);
1436
1437         btrfs_ref_tree_mod(fs_info, generic_ref);
1438
1439         return ret;
1440 }
1441
1442 /*
1443  * __btrfs_inc_extent_ref - insert backreference for a given extent
1444  *
1445  * The counterpart is in __btrfs_free_extent(), with examples and more details
1446  * on how it works.
1447  *
1448  * @trans:          Handle of transaction
1449  *
1450  * @node:           The delayed ref node used to get the bytenr/length for
1451  *                  extent whose references are incremented.
1452  *
1453  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1454  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1455  *                  bytenr of the parent block. Since new extents are always
1456  *                  created with indirect references, this will only be the case
1457  *                  when relocating a shared extent. In that case, root_objectid
1458  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1459  *                  be 0
1460  *
1461  * @root_objectid:  The id of the root where this modification has originated,
1462  *                  this can be either one of the well-known metadata trees or
1463  *                  the subvolume id which references this extent.
1464  *
1465  * @owner:          For data extents it is the inode number of the owning file.
1466  *                  For metadata extents this parameter holds the level in the
1467  *                  tree of the extent.
1468  *
1469  * @offset:         For metadata extents the offset is ignored and is currently
1470  *                  always passed as 0. For data extents it is the file offset
1471  *                  this extent belongs to.
1472  *
1473  * @refs_to_add:    Number of references to add
1474  *
1475  * @extent_op:      Pointer to a structure holding information necessary when
1476  *                  updating a tree block's flags
1477  *
1478  */
1479 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1480                                   struct btrfs_delayed_ref_node *node,
1481                                   u64 parent, u64 root_objectid,
1482                                   u64 owner, u64 offset, int refs_to_add,
1483                                   struct btrfs_delayed_extent_op *extent_op)
1484 {
1485         struct btrfs_path *path;
1486         struct extent_buffer *leaf;
1487         struct btrfs_extent_item *item;
1488         struct btrfs_key key;
1489         u64 bytenr = node->bytenr;
1490         u64 num_bytes = node->num_bytes;
1491         u64 refs;
1492         int ret;
1493
1494         path = btrfs_alloc_path();
1495         if (!path)
1496                 return -ENOMEM;
1497
1498         /* this will set up the path even if it fails to insert the back ref */
1499         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1500                                            parent, root_objectid, owner,
1501                                            offset, refs_to_add, extent_op);
1502         if ((ret < 0 && ret != -EAGAIN) || !ret)
1503                 goto out;
1504
1505         /*
1506          * Ok we had -EAGAIN which means we didn't have space to insert an
1507          * inline extent ref, so just update the reference count and add a
1508          * normal backref.
1509          */
1510         leaf = path->nodes[0];
1511         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1512         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1513         refs = btrfs_extent_refs(leaf, item);
1514         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1515         if (extent_op)
1516                 __run_delayed_extent_op(extent_op, leaf, item);
1517
1518         btrfs_mark_buffer_dirty(trans, leaf);
1519         btrfs_release_path(path);
1520
1521         /* now insert the actual backref */
1522         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1523                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1524                                             root_objectid);
1525         else
1526                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1527                                              root_objectid, owner, offset,
1528                                              refs_to_add);
1529
1530         if (ret)
1531                 btrfs_abort_transaction(trans, ret);
1532 out:
1533         btrfs_free_path(path);
1534         return ret;
1535 }
1536
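/*
 * Run a single delayed ref node for a data extent.  Depending on the action
 * this either inserts a new extent item for reserved space, adds one more
 * backref via __btrfs_inc_extent_ref(), or drops references via
 * __btrfs_free_extent().
 */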
1537 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1538                                 struct btrfs_delayed_ref_node *node,
1539                                 struct btrfs_delayed_extent_op *extent_op,
1540                                 bool insert_reserved)
1541 {
1542         int ret = 0;
1543         struct btrfs_delayed_data_ref *ref;
1544         struct btrfs_key ins;
1545         u64 parent = 0;
1546         u64 ref_root = 0;
1547         u64 flags = 0;
1548
1549         ins.objectid = node->bytenr;
1550         ins.offset = node->num_bytes;
1551         ins.type = BTRFS_EXTENT_ITEM_KEY;
1552
1553         ref = btrfs_delayed_node_to_data_ref(node);
1554         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1555
1556         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1557                 parent = ref->parent;
1558         ref_root = ref->root;
1559
1560         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1561                 if (extent_op)
1562                         flags |= extent_op->flags_to_set;
1563                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
1564                                                  flags, ref->objectid,
1565                                                  ref->offset, &ins,
1566                                                  node->ref_mod);
1567         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1568                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1569                                              ref->objectid, ref->offset,
1570                                              node->ref_mod, extent_op);
1571         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1572                 ret = __btrfs_free_extent(trans, node, parent,
1573                                           ref_root, ref->objectid,
1574                                           ref->offset, node->ref_mod,
1575                                           extent_op);
1576         } else {
1577                 BUG();
1578         }
1579         return ret;
1580 }
1581
1582 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1583                                     struct extent_buffer *leaf,
1584                                     struct btrfs_extent_item *ei)
1585 {
1586         u64 flags = btrfs_extent_flags(leaf, ei);
1587         if (extent_op->update_flags) {
1588                 flags |= extent_op->flags_to_set;
1589                 btrfs_set_extent_flags(leaf, ei, flags);
1590         }
1591
1592         if (extent_op->update_key) {
1593                 struct btrfs_tree_block_info *bi;
1594                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1595                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1596                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1597         }
1598 }
1599
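/*
 * Apply a delayed extent op (flags and/or key update) to the extent item of
 * the given delayed ref head.  With skinny metadata the METADATA_ITEM key is
 * tried first, falling back to the regular EXTENT_ITEM key.
 */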
1600 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1601                                  struct btrfs_delayed_ref_head *head,
1602                                  struct btrfs_delayed_extent_op *extent_op)
1603 {
1604         struct btrfs_fs_info *fs_info = trans->fs_info;
1605         struct btrfs_root *root;
1606         struct btrfs_key key;
1607         struct btrfs_path *path;
1608         struct btrfs_extent_item *ei;
1609         struct extent_buffer *leaf;
1610         u32 item_size;
1611         int ret;
1612         int err = 0;
1613         int metadata = 1;
1614
1615         if (TRANS_ABORTED(trans))
1616                 return 0;
1617
1618         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1619                 metadata = 0;
1620
1621         path = btrfs_alloc_path();
1622         if (!path)
1623                 return -ENOMEM;
1624
1625         key.objectid = head->bytenr;
1626
1627         if (metadata) {
1628                 key.type = BTRFS_METADATA_ITEM_KEY;
1629                 key.offset = extent_op->level;
1630         } else {
1631                 key.type = BTRFS_EXTENT_ITEM_KEY;
1632                 key.offset = head->num_bytes;
1633         }
1634
1635         root = btrfs_extent_root(fs_info, key.objectid);
1636 again:
1637         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1638         if (ret < 0) {
1639                 err = ret;
1640                 goto out;
1641         }
1642         if (ret > 0) {
1643                 if (metadata) {
1644                         if (path->slots[0] > 0) {
1645                                 path->slots[0]--;
1646                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1647                                                       path->slots[0]);
1648                                 if (key.objectid == head->bytenr &&
1649                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
1650                                     key.offset == head->num_bytes)
1651                                         ret = 0;
1652                         }
1653                         if (ret > 0) {
1654                                 btrfs_release_path(path);
1655                                 metadata = 0;
1656
1657                                 key.objectid = head->bytenr;
1658                                 key.offset = head->num_bytes;
1659                                 key.type = BTRFS_EXTENT_ITEM_KEY;
1660                                 goto again;
1661                         }
1662                 } else {
1663                         err = -EUCLEAN;
1664                         btrfs_err(fs_info,
1665                   "missing extent item for extent %llu num_bytes %llu level %d",
1666                                   head->bytenr, head->num_bytes, extent_op->level);
1667                         goto out;
1668                 }
1669         }
1670
1671         leaf = path->nodes[0];
1672         item_size = btrfs_item_size(leaf, path->slots[0]);
1673
1674         if (unlikely(item_size < sizeof(*ei))) {
1675                 err = -EUCLEAN;
1676                 btrfs_err(fs_info,
1677                           "unexpected extent item size, has %u expect >= %zu",
1678                           item_size, sizeof(*ei));
1679                 btrfs_abort_transaction(trans, err);
1680                 goto out;
1681         }
1682
1683         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1684         __run_delayed_extent_op(extent_op, leaf, ei);
1685
1686         btrfs_mark_buffer_dirty(trans, leaf);
1687 out:
1688         btrfs_free_path(path);
1689         return err;
1690 }
1691
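/*
 * Run a single delayed ref node for a tree block.  Tree block refs must always
 * have a ref_mod of 1; depending on the action this either allocates the
 * reserved tree block, adds a backref, or drops the reference via
 * __btrfs_free_extent().
 */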
1692 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1693                                 struct btrfs_delayed_ref_node *node,
1694                                 struct btrfs_delayed_extent_op *extent_op,
1695                                 bool insert_reserved)
1696 {
1697         int ret = 0;
1698         struct btrfs_delayed_tree_ref *ref;
1699         u64 parent = 0;
1700         u64 ref_root = 0;
1701
1702         ref = btrfs_delayed_node_to_tree_ref(node);
1703         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1704
1705         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1706                 parent = ref->parent;
1707         ref_root = ref->root;
1708
1709         if (unlikely(node->ref_mod != 1)) {
1710                 btrfs_err(trans->fs_info,
1711         "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
1712                           node->bytenr, node->ref_mod, node->action, ref_root,
1713                           parent);
1714                 return -EUCLEAN;
1715         }
1716         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1717                 BUG_ON(!extent_op || !extent_op->update_flags);
1718                 ret = alloc_reserved_tree_block(trans, node, extent_op);
1719         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1720                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1721                                              ref->level, 0, 1, extent_op);
1722         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1723                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
1724                                           ref->level, 0, 1, extent_op);
1725         } else {
1726                 BUG();
1727         }
1728         return ret;
1729 }
1730
1731 /* helper function to actually process a single delayed ref entry */
1732 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1733                                struct btrfs_delayed_ref_node *node,
1734                                struct btrfs_delayed_extent_op *extent_op,
1735                                bool insert_reserved)
1736 {
1737         int ret = 0;
1738
1739         if (TRANS_ABORTED(trans)) {
1740                 if (insert_reserved)
1741                         btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1742                 return 0;
1743         }
1744
1745         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1746             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1747                 ret = run_delayed_tree_ref(trans, node, extent_op,
1748                                            insert_reserved);
1749         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1750                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1751                 ret = run_delayed_data_ref(trans, node, extent_op,
1752                                            insert_reserved);
1753         else
1754                 BUG();
1755         if (ret && insert_reserved)
1756                 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
1757         if (ret < 0)
1758                 btrfs_err(trans->fs_info,
1759 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
1760                           node->bytenr, node->num_bytes, node->type,
1761                           node->action, node->ref_mod, ret);
1762         return ret;
1763 }
1764
1765 static inline struct btrfs_delayed_ref_node *
1766 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1767 {
1768         struct btrfs_delayed_ref_node *ref;
1769
1770         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1771                 return NULL;
1772
1773         /*
1774          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1775          * This is to prevent a ref count from going down to zero, which deletes
1776          * the extent item from the extent tree, when there still are references
1777          * to add, which would fail because they would not find the extent item.
1778          */
1779         if (!list_empty(&head->ref_add_list))
1780                 return list_first_entry(&head->ref_add_list,
1781                                 struct btrfs_delayed_ref_node, add_list);
1782
1783         ref = rb_entry(rb_first_cached(&head->ref_tree),
1784                        struct btrfs_delayed_ref_node, ref_node);
1785         ASSERT(list_empty(&ref->add_list));
1786         return ref;
1787 }
1788
1789 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1790                                       struct btrfs_delayed_ref_head *head)
1791 {
1792         spin_lock(&delayed_refs->lock);
1793         head->processing = false;
1794         delayed_refs->num_heads_ready++;
1795         spin_unlock(&delayed_refs->lock);
1796         btrfs_delayed_ref_unlock(head);
1797 }
1798
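/*
 * Return the head's extent op if it still needs to run.  If the head must
 * insert a reserved extent the op is detached and freed and NULL is returned,
 * since in that case the flags/key are applied when the reserved block is
 * inserted.
 */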
1799 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1800                                 struct btrfs_delayed_ref_head *head)
1801 {
1802         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1803
1804         if (!extent_op)
1805                 return NULL;
1806
1807         if (head->must_insert_reserved) {
1808                 head->extent_op = NULL;
1809                 btrfs_free_delayed_extent_op(extent_op);
1810                 return NULL;
1811         }
1812         return extent_op;
1813 }
1814
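/*
 * Run any extent op still pending on the head.  Returns 1 if an op was run
 * successfully, 0 if there was nothing to do, or a negative error.  The head
 * lock is dropped when an op is run.
 */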
1815 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1816                                      struct btrfs_delayed_ref_head *head)
1817 {
1818         struct btrfs_delayed_extent_op *extent_op;
1819         int ret;
1820
1821         extent_op = cleanup_extent_op(head);
1822         if (!extent_op)
1823                 return 0;
1824         head->extent_op = NULL;
1825         spin_unlock(&head->lock);
1826         ret = run_delayed_extent_op(trans, head, extent_op);
1827         btrfs_free_delayed_extent_op(extent_op);
1828         return ret ? ret : 1;
1829 }
1830
1831 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1832                                   struct btrfs_delayed_ref_root *delayed_refs,
1833                                   struct btrfs_delayed_ref_head *head)
1834 {
1835         int nr_items = 1;       /* Dropping this ref head update. */
1836
1837         /*
1838          * We had csum deletions accounted for in our delayed refs rsv, so we
1839          * need to drop the csum leaves for this update from our delayed_refs_rsv.
1840          */
1841         if (head->total_ref_mod < 0 && head->is_data) {
1842                 spin_lock(&delayed_refs->lock);
1843                 delayed_refs->pending_csums -= head->num_bytes;
1844                 spin_unlock(&delayed_refs->lock);
1845                 nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
1846         }
1847
1848         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1849 }
1850
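/*
 * Final processing of a delayed ref head once all of its ref nodes have run:
 * run any remaining extent op, remove the head from the delayed ref tree and,
 * if the reserved extent was never used, pin it and delete its csum items.
 * The head's reservation is released and the head itself is unlocked and
 * dropped.  Returns 1 if new refs were added in the meantime and the head
 * must be processed again.
 */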
1851 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1852                             struct btrfs_delayed_ref_head *head)
1853 {
1854
1855         struct btrfs_fs_info *fs_info = trans->fs_info;
1856         struct btrfs_delayed_ref_root *delayed_refs;
1857         int ret;
1858
1859         delayed_refs = &trans->transaction->delayed_refs;
1860
1861         ret = run_and_cleanup_extent_op(trans, head);
1862         if (ret < 0) {
1863                 unselect_delayed_ref_head(delayed_refs, head);
1864                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1865                 return ret;
1866         } else if (ret) {
1867                 return ret;
1868         }
1869
1870         /*
1871          * Need to drop our head ref lock and re-acquire the delayed ref lock
1872          * and then re-check to make sure nobody got added.
1873          */
1874         spin_unlock(&head->lock);
1875         spin_lock(&delayed_refs->lock);
1876         spin_lock(&head->lock);
1877         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1878                 spin_unlock(&head->lock);
1879                 spin_unlock(&delayed_refs->lock);
1880                 return 1;
1881         }
1882         btrfs_delete_ref_head(delayed_refs, head);
1883         spin_unlock(&head->lock);
1884         spin_unlock(&delayed_refs->lock);
1885
1886         if (head->must_insert_reserved) {
1887                 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
1888                 if (head->is_data) {
1889                         struct btrfs_root *csum_root;
1890
1891                         csum_root = btrfs_csum_root(fs_info, head->bytenr);
1892                         ret = btrfs_del_csums(trans, csum_root, head->bytenr,
1893                                               head->num_bytes);
1894                 }
1895         }
1896
1897         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1898
1899         trace_run_delayed_ref_head(fs_info, head, 0);
1900         btrfs_delayed_ref_unlock(head);
1901         btrfs_put_delayed_ref_head(head);
1902         return ret;
1903 }
1904
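/*
 * Select the next delayed ref head to process and lock it.  Returns NULL when
 * no head is ready, or ERR_PTR(-EAGAIN) if the head was freed while we waited
 * for its mutex and the caller should pick another one.
 */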
1905 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1906                                         struct btrfs_trans_handle *trans)
1907 {
1908         struct btrfs_delayed_ref_root *delayed_refs =
1909                 &trans->transaction->delayed_refs;
1910         struct btrfs_delayed_ref_head *head = NULL;
1911         int ret;
1912
1913         spin_lock(&delayed_refs->lock);
1914         head = btrfs_select_ref_head(delayed_refs);
1915         if (!head) {
1916                 spin_unlock(&delayed_refs->lock);
1917                 return head;
1918         }
1919
1920         /*
1921          * Grab the lock that says we are going to process all the refs for
1922          * this head
1923          */
1924         ret = btrfs_delayed_ref_lock(delayed_refs, head);
1925         spin_unlock(&delayed_refs->lock);
1926
1927         /*
1928          * We may have dropped the spin lock to get the head mutex lock, and
1929          * that might have given someone else time to free the head.  If that's
1930          * true, it has been removed from our list and we can move on.
1931          */
1932         if (ret == -EAGAIN)
1933                 head = ERR_PTR(-EAGAIN);
1934
1935         return head;
1936 }
1937
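/*
 * Run the delayed ref nodes attached to a locked head one by one, correcting
 * the head's ref_mod as each node is processed.  Returns -EAGAIN if a node is
 * still blocked by a tree mod log sequence, in which case the head is
 * unselected again.
 */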
1938 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1939                                            struct btrfs_delayed_ref_head *locked_ref)
1940 {
1941         struct btrfs_fs_info *fs_info = trans->fs_info;
1942         struct btrfs_delayed_ref_root *delayed_refs;
1943         struct btrfs_delayed_extent_op *extent_op;
1944         struct btrfs_delayed_ref_node *ref;
1945         bool must_insert_reserved;
1946         int ret;
1947
1948         delayed_refs = &trans->transaction->delayed_refs;
1949
1950         lockdep_assert_held(&locked_ref->mutex);
1951         lockdep_assert_held(&locked_ref->lock);
1952
1953         while ((ref = select_delayed_ref(locked_ref))) {
1954                 if (ref->seq &&
1955                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
1956                         spin_unlock(&locked_ref->lock);
1957                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1958                         return -EAGAIN;
1959                 }
1960
1961                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1962                 RB_CLEAR_NODE(&ref->ref_node);
1963                 if (!list_empty(&ref->add_list))
1964                         list_del(&ref->add_list);
1965                 /*
1966                  * When we play the delayed ref, also correct the ref_mod on
1967                  * the head
1968                  */
1969                 switch (ref->action) {
1970                 case BTRFS_ADD_DELAYED_REF:
1971                 case BTRFS_ADD_DELAYED_EXTENT:
1972                         locked_ref->ref_mod -= ref->ref_mod;
1973                         break;
1974                 case BTRFS_DROP_DELAYED_REF:
1975                         locked_ref->ref_mod += ref->ref_mod;
1976                         break;
1977                 default:
1978                         WARN_ON(1);
1979                 }
1980                 atomic_dec(&delayed_refs->num_entries);
1981
1982                 /*
1983                  * Record the must_insert_reserved flag before we drop the
1984                  * spin lock.
1985                  */
1986                 must_insert_reserved = locked_ref->must_insert_reserved;
1987                 locked_ref->must_insert_reserved = false;
1988
1989                 extent_op = locked_ref->extent_op;
1990                 locked_ref->extent_op = NULL;
1991                 spin_unlock(&locked_ref->lock);
1992
1993                 ret = run_one_delayed_ref(trans, ref, extent_op,
1994                                           must_insert_reserved);
1995
1996                 btrfs_free_delayed_extent_op(extent_op);
1997                 if (ret) {
1998                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1999                         btrfs_put_delayed_ref(ref);
2000                         return ret;
2001                 }
2002
2003                 btrfs_put_delayed_ref(ref);
2004                 cond_resched();
2005
2006                 spin_lock(&locked_ref->lock);
2007                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2008         }
2009
2010         return 0;
2011 }
2012
2013 /*
2014  * Returns 0 on success or if called with an already aborted transaction.
2015  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2016  */
2017 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2018                                              unsigned long nr)
2019 {
2020         struct btrfs_fs_info *fs_info = trans->fs_info;
2021         struct btrfs_delayed_ref_root *delayed_refs;
2022         struct btrfs_delayed_ref_head *locked_ref = NULL;
2023         int ret;
2024         unsigned long count = 0;
2025
2026         delayed_refs = &trans->transaction->delayed_refs;
2027         do {
2028                 if (!locked_ref) {
2029                         locked_ref = btrfs_obtain_ref_head(trans);
2030                         if (IS_ERR_OR_NULL(locked_ref)) {
2031                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2032                                         continue;
2033                                 } else {
2034                                         break;
2035                                 }
2036                         }
2037                         count++;
2038                 }
2039                 /*
2040                  * We need to try and merge add/drops of the same ref since we
2041                  * can run into issues with relocate dropping the implicit ref
2042                  * and then it being added back again before the drop can
2043                  * finish.  If we merged anything we need to re-loop so we can
2044                  * get a good ref.
2045                  * Or we can get node references of the same type that weren't
2046                  * merged when created due to bumps in the tree mod seq, and
2047                  * we need to merge them to prevent adding an inline extent
2048                  * backref before dropping it (triggering a BUG_ON at
2049                  * insert_inline_extent_backref()).
2050                  */
2051                 spin_lock(&locked_ref->lock);
2052                 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref);
2053
2054                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref);
2055                 if (ret < 0 && ret != -EAGAIN) {
2056                         /*
2057                          * Error, btrfs_run_delayed_refs_for_head already
2058                          * unlocked everything so just bail out
2059                          */
2060                         return ret;
2061                 } else if (!ret) {
2062                         /*
2063                          * Success, perform the usual cleanup of a processed
2064                          * head
2065                          */
2066                         ret = cleanup_ref_head(trans, locked_ref);
2067                         if (ret > 0) {
2068                                 /* We dropped our lock, we need to loop. */
2069                                 ret = 0;
2070                                 continue;
2071                         } else if (ret) {
2072                                 return ret;
2073                         }
2074                 }
2075
2076                 /*
2077                  * Either we succeeded or btrfs_run_delayed_refs_for_head
2078                  * returned -EAGAIN; in both cases we need to select another head.
2079                  */
2080
2081                 locked_ref = NULL;
2082                 cond_resched();
2083         } while ((nr != -1 && count < nr) || locked_ref);
2084
2085         return 0;
2086 }
2087
2088 #ifdef SCRAMBLE_DELAYED_REFS
2089 /*
2090  * Normally delayed refs get processed in ascending bytenr order. This
2091  * correlates in most cases to the order added. To expose dependencies on this
2092  * order, we start to process the tree in the middle instead of the beginning
2093  */
2094 static u64 find_middle(struct rb_root *root)
2095 {
2096         struct rb_node *n = root->rb_node;
2097         struct btrfs_delayed_ref_node *entry;
2098         int alt = 1;
2099         u64 middle;
2100         u64 first = 0, last = 0;
2101
2102         n = rb_first(root);
2103         if (n) {
2104                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2105                 first = entry->bytenr;
2106         }
2107         n = rb_last(root);
2108         if (n) {
2109                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2110                 last = entry->bytenr;
2111         }
2112         n = root->rb_node;
2113
2114         while (n) {
2115                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2116                 WARN_ON(!entry->in_tree);
2117
2118                 middle = entry->bytenr;
2119
2120                 if (alt)
2121                         n = n->rb_left;
2122                 else
2123                         n = n->rb_right;
2124
2125                 alt = 1 - alt;
2126         }
2127         return middle;
2128 }
2129 #endif
2130
2131 /*
2132  * This starts processing the delayed reference count updates and
2133  * extent insertions we have queued up so far.  count can be
2134  * 0, which means to process everything in the tree at the start
2135  * of the run (but not newly added entries), or it can be some target
2136  * number you'd like to process.
2137  *
2138  * Returns 0 on success or if called with an aborted transaction
2139  * Returns <0 on error and aborts the transaction
2140  */
2141 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142                            unsigned long count)
2143 {
2144         struct btrfs_fs_info *fs_info = trans->fs_info;
2145         struct rb_node *node;
2146         struct btrfs_delayed_ref_root *delayed_refs;
2147         struct btrfs_delayed_ref_head *head;
2148         int ret;
2149         int run_all = count == (unsigned long)-1;
2150
2151         /* We'll clean this up in btrfs_cleanup_transaction */
2152         if (TRANS_ABORTED(trans))
2153                 return 0;
2154
2155         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2156                 return 0;
2157
2158         delayed_refs = &trans->transaction->delayed_refs;
2159         if (count == 0)
2160                 count = delayed_refs->num_heads_ready;
2161
2162 again:
2163 #ifdef SCRAMBLE_DELAYED_REFS
2164         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2165 #endif
2166         ret = __btrfs_run_delayed_refs(trans, count);
2167         if (ret < 0) {
2168                 btrfs_abort_transaction(trans, ret);
2169                 return ret;
2170         }
2171
2172         if (run_all) {
2173                 btrfs_create_pending_block_groups(trans);
2174
2175                 spin_lock(&delayed_refs->lock);
2176                 node = rb_first_cached(&delayed_refs->href_root);
2177                 if (!node) {
2178                         spin_unlock(&delayed_refs->lock);
2179                         goto out;
2180                 }
2181                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2182                                 href_node);
2183                 refcount_inc(&head->refs);
2184                 spin_unlock(&delayed_refs->lock);
2185
2186                 /* Mutex was contended, block until it's released and retry. */
2187                 mutex_lock(&head->mutex);
2188                 mutex_unlock(&head->mutex);
2189
2190                 btrfs_put_delayed_ref_head(head);
2191                 cond_resched();
2192                 goto again;
2193         }
2194 out:
2195         return 0;
2196 }
2197
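/*
 * Queue a delayed extent op that will set the given flags in the extent item
 * of the tree block @eb.
 */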
2198 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2199                                 struct extent_buffer *eb, u64 flags)
2200 {
2201         struct btrfs_delayed_extent_op *extent_op;
2202         int level = btrfs_header_level(eb);
2203         int ret;
2204
2205         extent_op = btrfs_alloc_delayed_extent_op();
2206         if (!extent_op)
2207                 return -ENOMEM;
2208
2209         extent_op->flags_to_set = flags;
2210         extent_op->update_flags = true;
2211         extent_op->update_key = false;
2212         extent_op->level = level;
2213
2214         ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2215         if (ret)
2216                 btrfs_free_delayed_extent_op(extent_op);
2217         return ret;
2218 }
2219
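/*
 * Check whether the delayed refs queued for @bytenr contain a reference that
 * does not match the given root/objectid/offset.  Returns 1 if such a cross
 * reference exists, 0 if not, and -EAGAIN if the head's mutex was contended
 * and the caller should retry.
 */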
2220 static noinline int check_delayed_ref(struct btrfs_root *root,
2221                                       struct btrfs_path *path,
2222                                       u64 objectid, u64 offset, u64 bytenr)
2223 {
2224         struct btrfs_delayed_ref_head *head;
2225         struct btrfs_delayed_ref_node *ref;
2226         struct btrfs_delayed_data_ref *data_ref;
2227         struct btrfs_delayed_ref_root *delayed_refs;
2228         struct btrfs_transaction *cur_trans;
2229         struct rb_node *node;
2230         int ret = 0;
2231
2232         spin_lock(&root->fs_info->trans_lock);
2233         cur_trans = root->fs_info->running_transaction;
2234         if (cur_trans)
2235                 refcount_inc(&cur_trans->use_count);
2236         spin_unlock(&root->fs_info->trans_lock);
2237         if (!cur_trans)
2238                 return 0;
2239
2240         delayed_refs = &cur_trans->delayed_refs;
2241         spin_lock(&delayed_refs->lock);
2242         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2243         if (!head) {
2244                 spin_unlock(&delayed_refs->lock);
2245                 btrfs_put_transaction(cur_trans);
2246                 return 0;
2247         }
2248
2249         if (!mutex_trylock(&head->mutex)) {
2250                 if (path->nowait) {
2251                         spin_unlock(&delayed_refs->lock);
2252                         btrfs_put_transaction(cur_trans);
2253                         return -EAGAIN;
2254                 }
2255
2256                 refcount_inc(&head->refs);
2257                 spin_unlock(&delayed_refs->lock);
2258
2259                 btrfs_release_path(path);
2260
2261                 /*
2262                  * Mutex was contended, block until it's released and let
2263                  * the caller try again.
2264                  */
2265                 mutex_lock(&head->mutex);
2266                 mutex_unlock(&head->mutex);
2267                 btrfs_put_delayed_ref_head(head);
2268                 btrfs_put_transaction(cur_trans);
2269                 return -EAGAIN;
2270         }
2271         spin_unlock(&delayed_refs->lock);
2272
2273         spin_lock(&head->lock);
2274         /*
2275          * XXX: We should replace this with a proper search function in the
2276          * future.
2277          */
2278         for (node = rb_first_cached(&head->ref_tree); node;
2279              node = rb_next(node)) {
2280                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2281                 /* If it's a shared ref we know a cross reference exists */
2282                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2283                         ret = 1;
2284                         break;
2285                 }
2286
2287                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2288
2289                 /*
2290                  * If our ref doesn't match the one we're currently looking at
2291                  * then we have a cross reference.
2292                  */
2293                 if (data_ref->root != root->root_key.objectid ||
2294                     data_ref->objectid != objectid ||
2295                     data_ref->offset != offset) {
2296                         ret = 1;
2297                         break;
2298                 }
2299         }
2300         spin_unlock(&head->lock);
2301         mutex_unlock(&head->mutex);
2302         btrfs_put_transaction(cur_trans);
2303         return ret;
2304 }
2305
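/*
 * Check the committed extent tree for cross references to @bytenr.  Returns 0
 * if the only reference is a single inline data ref matching the given
 * root/objectid/offset, 1 if the extent is (or may be) shared, and -ENOENT if
 * no matching extent item is found.
 */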
2306 static noinline int check_committed_ref(struct btrfs_root *root,
2307                                         struct btrfs_path *path,
2308                                         u64 objectid, u64 offset, u64 bytenr,
2309                                         bool strict)
2310 {
2311         struct btrfs_fs_info *fs_info = root->fs_info;
2312         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
2313         struct extent_buffer *leaf;
2314         struct btrfs_extent_data_ref *ref;
2315         struct btrfs_extent_inline_ref *iref;
2316         struct btrfs_extent_item *ei;
2317         struct btrfs_key key;
2318         u32 item_size;
2319         int type;
2320         int ret;
2321
2322         key.objectid = bytenr;
2323         key.offset = (u64)-1;
2324         key.type = BTRFS_EXTENT_ITEM_KEY;
2325
2326         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2327         if (ret < 0)
2328                 goto out;
2329         BUG_ON(ret == 0); /* Corruption */
2330
2331         ret = -ENOENT;
2332         if (path->slots[0] == 0)
2333                 goto out;
2334
2335         path->slots[0]--;
2336         leaf = path->nodes[0];
2337         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2338
2339         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2340                 goto out;
2341
2342         ret = 1;
2343         item_size = btrfs_item_size(leaf, path->slots[0]);
2344         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2345
2346         /* If extent item has more than 1 inline ref then it's shared */
2347         if (item_size != sizeof(*ei) +
2348             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2349                 goto out;
2350
2351         /*
2352          * If the extent was created before the last snapshot, it is shared unless
2353          * that snapshot has been deleted. Only use this heuristic if strict is false.
2354          */
2355         if (!strict &&
2356             (btrfs_extent_generation(leaf, ei) <=
2357              btrfs_root_last_snapshot(&root->root_item)))
2358                 goto out;
2359
2360         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2361
2362         /* If this extent has SHARED_DATA_REF then it's shared */
2363         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2364         if (type != BTRFS_EXTENT_DATA_REF_KEY)
2365                 goto out;
2366
2367         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2368         if (btrfs_extent_refs(leaf, ei) !=
2369             btrfs_extent_data_ref_count(leaf, ref) ||
2370             btrfs_extent_data_ref_root(leaf, ref) !=
2371             root->root_key.objectid ||
2372             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2373             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2374                 goto out;
2375
2376         ret = 0;
2377 out:
2378         return ret;
2379 }
2380
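/*
 * Check whether the data extent at @bytenr is referenced from anywhere other
 * than the given root/objectid/offset, looking at both the committed extent
 * tree and the current delayed refs.  Returns 1 if a cross reference exists,
 * 0 if the given reference is the only one, or a negative error.
 */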
2381 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2382                           u64 bytenr, bool strict, struct btrfs_path *path)
2383 {
2384         int ret;
2385
2386         do {
2387                 ret = check_committed_ref(root, path, objectid,
2388                                           offset, bytenr, strict);
2389                 if (ret && ret != -ENOENT)
2390                         goto out;
2391
2392                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2393         } while (ret == -EAGAIN);
2394
2395 out:
2396         btrfs_release_path(path);
2397         if (btrfs_is_data_reloc_root(root))
2398                 WARN_ON(ret > 0);
2399         return ret;
2400 }
2401
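/*
 * Walk all items of a tree block and add (inc != 0) or drop one reference for
 * every data extent and child tree block it points to.  Used by
 * btrfs_inc_ref() and btrfs_dec_ref() below.
 */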
2402 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2403                            struct btrfs_root *root,
2404                            struct extent_buffer *buf,
2405                            int full_backref, int inc)
2406 {
2407         struct btrfs_fs_info *fs_info = root->fs_info;
2408         u64 bytenr;
2409         u64 num_bytes;
2410         u64 parent;
2411         u64 ref_root;
2412         u32 nritems;
2413         struct btrfs_key key;
2414         struct btrfs_file_extent_item *fi;
2415         struct btrfs_ref generic_ref = { 0 };
2416         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2417         int i;
2418         int action;
2419         int level;
2420         int ret = 0;
2421
2422         if (btrfs_is_testing(fs_info))
2423                 return 0;
2424
2425         ref_root = btrfs_header_owner(buf);
2426         nritems = btrfs_header_nritems(buf);
2427         level = btrfs_header_level(buf);
2428
2429         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2430                 return 0;
2431
2432         if (full_backref)
2433                 parent = buf->start;
2434         else
2435                 parent = 0;
2436         if (inc)
2437                 action = BTRFS_ADD_DELAYED_REF;
2438         else
2439                 action = BTRFS_DROP_DELAYED_REF;
2440
2441         for (i = 0; i < nritems; i++) {
2442                 if (level == 0) {
2443                         btrfs_item_key_to_cpu(buf, &key, i);
2444                         if (key.type != BTRFS_EXTENT_DATA_KEY)
2445                                 continue;
2446                         fi = btrfs_item_ptr(buf, i,
2447                                             struct btrfs_file_extent_item);
2448                         if (btrfs_file_extent_type(buf, fi) ==
2449                             BTRFS_FILE_EXTENT_INLINE)
2450                                 continue;
2451                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2452                         if (bytenr == 0)
2453                                 continue;
2454
2455                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2456                         key.offset -= btrfs_file_extent_offset(buf, fi);
2457                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2458                                                num_bytes, parent);
2459                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2460                                             key.offset, root->root_key.objectid,
2461                                             for_reloc);
2462                         if (inc)
2463                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2464                         else
2465                                 ret = btrfs_free_extent(trans, &generic_ref);
2466                         if (ret)
2467                                 goto fail;
2468                 } else {
2469                         bytenr = btrfs_node_blockptr(buf, i);
2470                         num_bytes = fs_info->nodesize;
2471                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2472                                                num_bytes, parent);
2473                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
2474                                             root->root_key.objectid, for_reloc);
2475                         if (inc)
2476                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2477                         else
2478                                 ret = btrfs_free_extent(trans, &generic_ref);
2479                         if (ret)
2480                                 goto fail;
2481                 }
2482         }
2483         return 0;
2484 fail:
2485         return ret;
2486 }
2487
2488 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2489                   struct extent_buffer *buf, int full_backref)
2490 {
2491         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2492 }
2493
2494 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2495                   struct extent_buffer *buf, int full_backref)
2496 {
2497         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2498 }
2499
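/*
 * Return the allocation profile to use for the given root: DATA when
 * allocating data, SYSTEM for the chunk root and METADATA otherwise.
 */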
2500 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2501 {
2502         struct btrfs_fs_info *fs_info = root->fs_info;
2503         u64 flags;
2504         u64 ret;
2505
2506         if (data)
2507                 flags = BTRFS_BLOCK_GROUP_DATA;
2508         else if (root == fs_info->chunk_root)
2509                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2510         else
2511                 flags = BTRFS_BLOCK_GROUP_METADATA;
2512
2513         ret = btrfs_get_alloc_profile(fs_info, flags);
2514         return ret;
2515 }
2516
2517 static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
2518 {
2519         struct rb_node *leftmost;
2520         u64 bytenr = 0;
2521
2522         read_lock(&fs_info->block_group_cache_lock);
2523         /* Get the block group with the lowest logical start address. */
2524         leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
2525         if (leftmost) {
2526                 struct btrfs_block_group *bg;
2527
2528                 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
2529                 bytenr = bg->start;
2530         }
2531         read_unlock(&fs_info->block_group_cache_lock);
2532
2533         return bytenr;
2534 }
2535
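/*
 * Account @num_bytes starting at @bytenr as pinned in the block group and its
 * space info and mark the range dirty in the transaction's pinned_extents
 * tree.  If @reserved is set the bytes are moved from the reserved counters
 * to pinned.
 */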
2536 static int pin_down_extent(struct btrfs_trans_handle *trans,
2537                            struct btrfs_block_group *cache,
2538                            u64 bytenr, u64 num_bytes, int reserved)
2539 {
2540         struct btrfs_fs_info *fs_info = cache->fs_info;
2541
2542         spin_lock(&cache->space_info->lock);
2543         spin_lock(&cache->lock);
2544         cache->pinned += num_bytes;
2545         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2546                                              num_bytes);
2547         if (reserved) {
2548                 cache->reserved -= num_bytes;
2549                 cache->space_info->bytes_reserved -= num_bytes;
2550         }
2551         spin_unlock(&cache->lock);
2552         spin_unlock(&cache->space_info->lock);
2553
2554         set_extent_bit(&trans->transaction->pinned_extents, bytenr,
2555                        bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
2556         return 0;
2557 }
2558
2559 int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2560                      u64 bytenr, u64 num_bytes, int reserved)
2561 {
2562         struct btrfs_block_group *cache;
2563
2564         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2565         BUG_ON(!cache); /* Logic error */
2566
2567         pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2568
2569         btrfs_put_block_group(cache);
2570         return 0;
2571 }
2572
2573 /*
2574  * This function must be called within a transaction.
2575  */
2576 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2577                                     u64 bytenr, u64 num_bytes)
2578 {
2579         struct btrfs_block_group *cache;
2580         int ret;
2581
2582         cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2583         if (!cache)
2584                 return -EINVAL;
2585
2586         /*
2587          * Fully cache the free space first so that our pin removes the free space
2588          * from the cache.
2589          */
2590         ret = btrfs_cache_block_group(cache, true);
2591         if (ret)
2592                 goto out;
2593
2594         pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2595
2596         /* remove us from the free space cache (if we're there at all) */
2597         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2598 out:
2599         btrfs_put_block_group(cache);
2600         return ret;
2601 }
2602
2603 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2604                                    u64 start, u64 num_bytes)
2605 {
2606         int ret;
2607         struct btrfs_block_group *block_group;
2608
2609         block_group = btrfs_lookup_block_group(fs_info, start);
2610         if (!block_group)
2611                 return -EINVAL;
2612
2613         ret = btrfs_cache_block_group(block_group, true);
2614         if (ret)
2615                 goto out;
2616
2617         ret = btrfs_remove_free_space(block_group, start, num_bytes);
2618 out:
2619         btrfs_put_block_group(block_group);
2620         return ret;
2621 }
2622
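/*
 * Remove the data extents referenced by the given tree block from the free
 * space cache so they are not handed out again; this is only needed on
 * filesystems with mixed block groups.
 */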
2623 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2624 {
2625         struct btrfs_fs_info *fs_info = eb->fs_info;
2626         struct btrfs_file_extent_item *item;
2627         struct btrfs_key key;
2628         int found_type;
2629         int i;
2630         int ret = 0;
2631
2632         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2633                 return 0;
2634
2635         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2636                 btrfs_item_key_to_cpu(eb, &key, i);
2637                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2638                         continue;
2639                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2640                 found_type = btrfs_file_extent_type(eb, item);
2641                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2642                         continue;
2643                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2644                         continue;
2645                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2646                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2647                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2648                 if (ret)
2649                         break;
2650         }
2651
2652         return ret;
2653 }
2654
2655 static void
2656 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2657 {
2658         atomic_inc(&bg->reservations);
2659 }
2660
2661 /*
2662  * Returns the free cluster for the given space info and sets empty_cluster to
2663  * what it should be based on the mount options.
2664  */
2665 static struct btrfs_free_cluster *
2666 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2667                    struct btrfs_space_info *space_info, u64 *empty_cluster)
2668 {
2669         struct btrfs_free_cluster *ret = NULL;
2670
2671         *empty_cluster = 0;
2672         if (btrfs_mixed_space_info(space_info))
2673                 return ret;
2674
2675         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2676                 ret = &fs_info->meta_alloc_cluster;
2677                 if (btrfs_test_opt(fs_info, SSD))
2678                         *empty_cluster = SZ_2M;
2679                 else
2680                         *empty_cluster = SZ_64K;
2681         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2682                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
2683                 *empty_cluster = SZ_2M;
2684                 ret = &fs_info->data_alloc_cluster;
2685         }
2686
2687         return ret;
2688 }
2689
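/*
 * Unpin the byte range [start, end], walking the block groups it covers.  The
 * pinned counters are decreased, the space is optionally returned to the free
 * space cache and, where possible, used to refill the global block reserve.
 */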
2690 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2691                               u64 start, u64 end,
2692                               const bool return_free_space)
2693 {
2694         struct btrfs_block_group *cache = NULL;
2695         struct btrfs_space_info *space_info;
2696         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2697         struct btrfs_free_cluster *cluster = NULL;
2698         u64 len;
2699         u64 total_unpinned = 0;
2700         u64 empty_cluster = 0;
2701         bool readonly;
2702
2703         while (start <= end) {
2704                 readonly = false;
2705                 if (!cache ||
2706                     start >= cache->start + cache->length) {
2707                         if (cache)
2708                                 btrfs_put_block_group(cache);
2709                         total_unpinned = 0;
2710                         cache = btrfs_lookup_block_group(fs_info, start);
2711                         BUG_ON(!cache); /* Logic error */
2712
2713                         cluster = fetch_cluster_info(fs_info,
2714                                                      cache->space_info,
2715                                                      &empty_cluster);
2716                         empty_cluster <<= 1;
2717                 }
2718
2719                 len = cache->start + cache->length - start;
2720                 len = min(len, end + 1 - start);
2721
2722                 if (return_free_space)
2723                         btrfs_add_free_space(cache, start, len);
2724
2725                 start += len;
2726                 total_unpinned += len;
2727                 space_info = cache->space_info;
2728
2729                 /*
2730                  * If this space cluster has been marked as fragmented and we've
2731                  * unpinned enough in this block group to potentially allow a
2732                  * cluster to be created inside of it, go ahead and clear the
2733                  * fragmented flag.
2734                  */
2735                 if (cluster && cluster->fragmented &&
2736                     total_unpinned > empty_cluster) {
2737                         spin_lock(&cluster->lock);
2738                         cluster->fragmented = 0;
2739                         spin_unlock(&cluster->lock);
2740                 }
2741
2742                 spin_lock(&space_info->lock);
2743                 spin_lock(&cache->lock);
2744                 cache->pinned -= len;
2745                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2746                 space_info->max_extent_size = 0;
2747                 if (cache->ro) {
2748                         space_info->bytes_readonly += len;
2749                         readonly = true;
2750                 } else if (btrfs_is_zoned(fs_info)) {
2751                         /* Need reset before reusing in a zoned block group */
2752                         space_info->bytes_zone_unusable += len;
2753                         readonly = true;
2754                 }
2755                 spin_unlock(&cache->lock);
2756                 if (!readonly && return_free_space &&
2757                     global_rsv->space_info == space_info) {
2758                         spin_lock(&global_rsv->lock);
2759                         if (!global_rsv->full) {
2760                                 u64 to_add = min(len, global_rsv->size -
2761                                                       global_rsv->reserved);
2762
2763                                 global_rsv->reserved += to_add;
2764                                 btrfs_space_info_update_bytes_may_use(fs_info,
2765                                                 space_info, to_add);
2766                                 if (global_rsv->reserved >= global_rsv->size)
2767                                         global_rsv->full = 1;
2768                                 len -= to_add;
2769                         }
2770                         spin_unlock(&global_rsv->lock);
2771                 }
2772                 /* Add to any tickets we may have */
2773                 if (!readonly && return_free_space && len)
2774                         btrfs_try_granting_tickets(fs_info, space_info);
2775                 spin_unlock(&space_info->lock);
2776         }
2777
2778         if (cache)
2779                 btrfs_put_block_group(cache);
2780         return 0;
2781 }
2782
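/*
 * Called at transaction commit time to unpin all extents that were pinned
 * during the transaction: discard them if DISCARD_SYNC is enabled, return
 * the space to the free space caches, and finally discard and release the
 * block groups that were deleted in this transaction.
 */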
2783 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2784 {
2785         struct btrfs_fs_info *fs_info = trans->fs_info;
2786         struct btrfs_block_group *block_group, *tmp;
2787         struct list_head *deleted_bgs;
2788         struct extent_io_tree *unpin;
2789         u64 start;
2790         u64 end;
2791         int ret;
2792
2793         unpin = &trans->transaction->pinned_extents;
2794
2795         while (!TRANS_ABORTED(trans)) {
2796                 struct extent_state *cached_state = NULL;
2797
2798                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
2799                 if (!find_first_extent_bit(unpin, 0, &start, &end,
2800                                            EXTENT_DIRTY, &cached_state)) {
2801                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2802                         break;
2803                 }
2804
2805                 if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2806                         ret = btrfs_discard_extent(fs_info, start,
2807                                                    end + 1 - start, NULL);
2808
2809                 clear_extent_dirty(unpin, start, end, &cached_state);
2810                 unpin_extent_range(fs_info, start, end, true);
2811                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2812                 free_extent_state(cached_state);
2813                 cond_resched();
2814         }
2815
2816         if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
2817                 btrfs_discard_calc_delay(&fs_info->discard_ctl);
2818                 btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2819         }
2820
2821         /*
2822          * Transaction is finished.  We don't need the lock anymore.  We
2823          * do need to clean up the block groups in case of a transaction
2824          * abort.
2825          */
2826         deleted_bgs = &trans->transaction->deleted_bgs;
2827         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2828                 u64 trimmed = 0;
2829
2830                 ret = -EROFS;
2831                 if (!TRANS_ABORTED(trans))
2832                         ret = btrfs_discard_extent(fs_info,
2833                                                    block_group->start,
2834                                                    block_group->length,
2835                                                    &trimmed);
2836
2837                 list_del_init(&block_group->bg_list);
2838                 btrfs_unfreeze_block_group(block_group);
2839                 btrfs_put_block_group(block_group);
2840
2841                 if (ret) {
2842                         const char *errstr = btrfs_decode_error(ret);
2843                         btrfs_warn(fs_info,
2844                            "discard failed while removing blockgroup: errno=%d %s",
2845                                    ret, errstr);
2846                 }
2847         }
2848
2849         return 0;
2850 }
2851
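/*
 * Final accounting when the last reference to an extent is dropped: delete
 * its checksum items (for data extents), add the range back to the free
 * space tree and update the block group usage counters.  Any failure aborts
 * the transaction.
 */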
2852 static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
2853                                      u64 bytenr, u64 num_bytes, bool is_data)
2854 {
2855         int ret;
2856
2857         if (is_data) {
2858                 struct btrfs_root *csum_root;
2859
2860                 csum_root = btrfs_csum_root(trans->fs_info, bytenr);
2861                 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
2862                 if (ret) {
2863                         btrfs_abort_transaction(trans, ret);
2864                         return ret;
2865                 }
2866         }
2867
2868         ret = add_to_free_space_tree(trans, bytenr, num_bytes);
2869         if (ret) {
2870                 btrfs_abort_transaction(trans, ret);
2871                 return ret;
2872         }
2873
2874         ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
2875         if (ret)
2876                 btrfs_abort_transaction(trans, ret);
2877
2878         return ret;
2879 }
2880
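/*
 * Abort the transaction with -EUCLEAN, dump the current leaf and print a
 * critical message describing the corruption that was hit.
 */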
2881 #define abort_and_dump(trans, path, fmt, args...)       \
2882 ({                                                      \
2883         btrfs_abort_transaction(trans, -EUCLEAN);       \
2884         btrfs_print_leaf(path->nodes[0]);               \
2885         btrfs_crit(trans->fs_info, fmt, ##args);        \
2886 })
2887
2888 /*
2889  * Drop one or more refs of @node.
2890  *
2891  * 1. Locate the extent refs.
2892  *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
2893  *    Locate it, then reduce the refs number or remove the ref line completely.
2894  *
2895  * 2. Update the refs count in EXTENT/METADATA_ITEM
2896  *
2897  * Inline backref case:
2898  *
2899  * in extent tree we have:
2900  *
2901  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2902  *              refs 2 gen 6 flags DATA
2903  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2904  *              extent data backref root FS_TREE objectid 257 offset 0 count 1
2905  *
2906  * This function gets called with:
2907  *
2908  *    node->bytenr = 13631488
2909  *    node->num_bytes = 1048576
2910  *    root_objectid = FS_TREE
2911  *    owner_objectid = 257
2912  *    owner_offset = 0
2913  *    refs_to_drop = 1
2914  *
2915  * Then we should get something like:
2916  *
2917  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
2918  *              refs 1 gen 6 flags DATA
2919  *              extent data backref root FS_TREE objectid 258 offset 0 count 1
2920  *
2921  * Keyed backref case:
2922  *
2923  * in extent tree we have:
2924  *
2925  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2926  *              refs 754 gen 6 flags DATA
2927  *      [...]
2928  *      item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
2929  *              extent data backref root FS_TREE objectid 866 offset 0 count 1
2930  *
2931  * This function gets called with:
2932  *
2933  *    node->bytenr = 13631488
2934  *    node->num_bytes = 1048576
2935  *    root_objectid = FS_TREE
2936  *    owner_objectid = 866
2937  *    owner_offset = 0
2938  *    refs_to_drop = 1
2939  *
2940  * Then we should get something like:
2941  *
2942  *      item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
2943  *              refs 753 gen 6 flags DATA
2944  *
2945  * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
2946  */
2947 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2948                                struct btrfs_delayed_ref_node *node, u64 parent,
2949                                u64 root_objectid, u64 owner_objectid,
2950                                u64 owner_offset, int refs_to_drop,
2951                                struct btrfs_delayed_extent_op *extent_op)
2952 {
2953         struct btrfs_fs_info *info = trans->fs_info;
2954         struct btrfs_key key;
2955         struct btrfs_path *path;
2956         struct btrfs_root *extent_root;
2957         struct extent_buffer *leaf;
2958         struct btrfs_extent_item *ei;
2959         struct btrfs_extent_inline_ref *iref;
2960         int ret;
2961         int is_data;
2962         int extent_slot = 0;
2963         int found_extent = 0;
2964         int num_to_del = 1;
2965         u32 item_size;
2966         u64 refs;
2967         u64 bytenr = node->bytenr;
2968         u64 num_bytes = node->num_bytes;
2969         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
2970
2971         extent_root = btrfs_extent_root(info, bytenr);
2972         ASSERT(extent_root);
2973
2974         path = btrfs_alloc_path();
2975         if (!path)
2976                 return -ENOMEM;
2977
2978         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2979
2980         if (!is_data && refs_to_drop != 1) {
2981                 btrfs_crit(info,
2982 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
2983                            node->bytenr, refs_to_drop);
2984                 ret = -EINVAL;
2985                 btrfs_abort_transaction(trans, ret);
2986                 goto out;
2987         }
2988
2989         if (is_data)
2990                 skinny_metadata = false;
2991
2992         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
2993                                     parent, root_objectid, owner_objectid,
2994                                     owner_offset);
2995         if (ret == 0) {
2996                 /*
2997                  * Either the inline backref or the SHARED_DATA_REF/
2998                  * SHARED_BLOCK_REF is found
2999                  *
3000                  * Here is a quick path to locate EXTENT/METADATA_ITEM.
3001                  * It's possible the EXTENT/METADATA_ITEM is near the current slot.
3002                  */
3003                 extent_slot = path->slots[0];
3004                 while (extent_slot >= 0) {
3005                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3006                                               extent_slot);
3007                         if (key.objectid != bytenr)
3008                                 break;
3009                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3010                             key.offset == num_bytes) {
3011                                 found_extent = 1;
3012                                 break;
3013                         }
3014                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
3015                             key.offset == owner_objectid) {
3016                                 found_extent = 1;
3017                                 break;
3018                         }
3019
3020                         /* Quick path didn't find the EXTENT/METADATA_ITEM */
3021                         if (path->slots[0] - extent_slot > 5)
3022                                 break;
3023                         extent_slot--;
3024                 }
3025
3026                 if (!found_extent) {
3027                         if (iref) {
3028                                 abort_and_dump(trans, path,
3029 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref",
3030                                            path->slots[0]);
3031                                 ret = -EUCLEAN;
3032                                 goto out;
3033                         }
3034                         /* Must be SHARED_* item, remove the backref first */
3035                         ret = remove_extent_backref(trans, extent_root, path,
3036                                                     NULL, refs_to_drop, is_data);
3037                         if (ret) {
3038                                 btrfs_abort_transaction(trans, ret);
3039                                 goto out;
3040                         }
3041                         btrfs_release_path(path);
3042
3043                         /* Slow path to locate EXTENT/METADATA_ITEM */
3044                         key.objectid = bytenr;
3045                         key.type = BTRFS_EXTENT_ITEM_KEY;
3046                         key.offset = num_bytes;
3047
3048                         if (!is_data && skinny_metadata) {
3049                                 key.type = BTRFS_METADATA_ITEM_KEY;
3050                                 key.offset = owner_objectid;
3051                         }
3052
3053                         ret = btrfs_search_slot(trans, extent_root,
3054                                                 &key, path, -1, 1);
3055                         if (ret > 0 && skinny_metadata && path->slots[0]) {
3056                                 /*
3057                                  * Couldn't find our skinny metadata item,
3058                                  * see if we have ye olde extent item.
3059                                  */
3060                                 path->slots[0]--;
3061                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
3062                                                       path->slots[0]);
3063                                 if (key.objectid == bytenr &&
3064                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
3065                                     key.offset == num_bytes)
3066                                         ret = 0;
3067                         }
3068
3069                         if (ret > 0 && skinny_metadata) {
3070                                 skinny_metadata = false;
3071                                 key.objectid = bytenr;
3072                                 key.type = BTRFS_EXTENT_ITEM_KEY;
3073                                 key.offset = num_bytes;
3074                                 btrfs_release_path(path);
3075                                 ret = btrfs_search_slot(trans, extent_root,
3076                                                         &key, path, -1, 1);
3077                         }
3078
3079                         if (ret) {
3080                                 if (ret > 0)
3081                                         btrfs_print_leaf(path->nodes[0]);
3082                                 btrfs_err(info,
3083                         "umm, got %d back from search, was looking for %llu, slot %d",
3084                                           ret, bytenr, path->slots[0]);
3085                         }
3086                         if (ret < 0) {
3087                                 btrfs_abort_transaction(trans, ret);
3088                                 goto out;
3089                         }
3090                         extent_slot = path->slots[0];
3091                 }
3092         } else if (WARN_ON(ret == -ENOENT)) {
3093                 abort_and_dump(trans, path,
3094 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d",
3095                                bytenr, parent, root_objectid, owner_objectid,
3096                                owner_offset, path->slots[0]);
3097                 goto out;
3098         } else {
3099                 btrfs_abort_transaction(trans, ret);
3100                 goto out;
3101         }
3102
3103         leaf = path->nodes[0];
3104         item_size = btrfs_item_size(leaf, extent_slot);
3105         if (unlikely(item_size < sizeof(*ei))) {
3106                 ret = -EUCLEAN;
3107                 btrfs_err(trans->fs_info,
3108                           "unexpected extent item size, has %u expect >= %zu",
3109                           item_size, sizeof(*ei));
3110                 btrfs_abort_transaction(trans, ret);
3111                 goto out;
3112         }
3113         ei = btrfs_item_ptr(leaf, extent_slot,
3114                             struct btrfs_extent_item);
3115         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3116             key.type == BTRFS_EXTENT_ITEM_KEY) {
3117                 struct btrfs_tree_block_info *bi;
3118
3119                 if (item_size < sizeof(*ei) + sizeof(*bi)) {
3120                         abort_and_dump(trans, path,
3121 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu",
3122                                        key.objectid, key.type, key.offset,
3123                                        path->slots[0], owner_objectid, item_size,
3124                                        sizeof(*ei) + sizeof(*bi));
3125                         ret = -EUCLEAN;
3126                         goto out;
3127                 }
3128                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3129                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3130         }
3131
3132         refs = btrfs_extent_refs(leaf, ei);
3133         if (refs < refs_to_drop) {
3134                 abort_and_dump(trans, path,
3135                 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u",
3136                                refs_to_drop, refs, bytenr, path->slots[0]);
3137                 ret = -EUCLEAN;
3138                 goto out;
3139         }
3140         refs -= refs_to_drop;
3141
3142         if (refs > 0) {
3143                 if (extent_op)
3144                         __run_delayed_extent_op(extent_op, leaf, ei);
3145                 /*
3146                  * In the case of an inline back ref, the reference count will
3147                  * be updated by remove_extent_backref().
3148                  */
3149                 if (iref) {
3150                         if (!found_extent) {
3151                                 abort_and_dump(trans, path,
3152 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u",
3153                                                path->slots[0]);
3154                                 ret = -EUCLEAN;
3155                                 goto out;
3156                         }
3157                 } else {
3158                         btrfs_set_extent_refs(leaf, ei, refs);
3159                         btrfs_mark_buffer_dirty(trans, leaf);
3160                 }
3161                 if (found_extent) {
3162                         ret = remove_extent_backref(trans, extent_root, path,
3163                                                     iref, refs_to_drop, is_data);
3164                         if (ret) {
3165                                 btrfs_abort_transaction(trans, ret);
3166                                 goto out;
3167                         }
3168                 }
3169         } else {
3170                 /* In this branch refs == 1 */
3171                 if (found_extent) {
3172                         if (is_data && refs_to_drop !=
3173                             extent_data_ref_count(path, iref)) {
3174                                 abort_and_dump(trans, path,
3175                 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u",
3176                                                extent_data_ref_count(path, iref),
3177                                                refs_to_drop, path->slots[0]);
3178                                 ret = -EUCLEAN;
3179                                 goto out;
3180                         }
3181                         if (iref) {
3182                                 if (path->slots[0] != extent_slot) {
3183                                         abort_and_dump(trans, path,
3184 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref",
3185                                                        key.objectid, key.type,
3186                                                        key.offset, path->slots[0]);
3187                                         ret = -EUCLEAN;
3188                                         goto out;
3189                                 }
3190                         } else {
3191                                 /*
3192                                  * No inline ref, we must be at a SHARED_* item,
3193                                  * and since it's a single ref the layout must be:
3194                                  * |    extent_slot       ||extent_slot + 1|
3195                                  * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
3196                                  */
3197                                 if (path->slots[0] != extent_slot + 1) {
3198                                         abort_and_dump(trans, path,
3199         "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM",
3200                                                        path->slots[0]);
3201                                         ret = -EUCLEAN;
3202                                         goto out;
3203                                 }
3204                                 path->slots[0] = extent_slot;
3205                                 num_to_del = 2;
3206                         }
3207                 }
3208
3209                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3210                                       num_to_del);
3211                 if (ret) {
3212                         btrfs_abort_transaction(trans, ret);
3213                         goto out;
3214                 }
3215                 btrfs_release_path(path);
3216
3217                 ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
3218         }
3219         btrfs_release_path(path);
3220
3221 out:
3222         btrfs_free_path(path);
3223         return ret;
3224 }
3225
3226 /*
3227  * When we free a block, it is possible (and likely) that we free the last
3228  * delayed ref for that extent as well.  This searches the delayed ref tree for
3229  * a given extent, and if there are no other delayed refs to be processed, it
3230  * removes it from the tree.
3231  */
3232 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3233                                       u64 bytenr)
3234 {
3235         struct btrfs_delayed_ref_head *head;
3236         struct btrfs_delayed_ref_root *delayed_refs;
3237         int ret = 0;
3238
3239         delayed_refs = &trans->transaction->delayed_refs;
3240         spin_lock(&delayed_refs->lock);
3241         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3242         if (!head)
3243                 goto out_delayed_unlock;
3244
3245         spin_lock(&head->lock);
3246         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3247                 goto out;
3248
3249         if (cleanup_extent_op(head) != NULL)
3250                 goto out;
3251
3252         /*
3253          * Waiting for the lock here would deadlock.  If someone else has it
3254          * locked, they are already in the process of dropping it anyway.
3255          */
3256         if (!mutex_trylock(&head->mutex))
3257                 goto out;
3258
3259         btrfs_delete_ref_head(delayed_refs, head);
3260         head->processing = false;
3261
3262         spin_unlock(&head->lock);
3263         spin_unlock(&delayed_refs->lock);
3264
3265         BUG_ON(head->extent_op);
3266         if (head->must_insert_reserved)
3267                 ret = 1;
3268
3269         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3270         mutex_unlock(&head->mutex);
3271         btrfs_put_delayed_ref_head(head);
3272         return ret;
3273 out:
3274         spin_unlock(&head->lock);
3275
3276 out_delayed_unlock:
3277         spin_unlock(&delayed_refs->lock);
3278         return 0;
3279 }
3280
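/*
 * Free a tree block that is no longer referenced by @root_id.  For non-log
 * trees a delayed ref drop is queued.  If this was the last reference and
 * the block was allocated in the current transaction, the block is either
 * pinned (when it was already written, when tree mod log users exist or on
 * zoned filesystems) or returned directly to the free space cache so it can
 * be reused right away.
 */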
3281 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3282                            u64 root_id,
3283                            struct extent_buffer *buf,
3284                            u64 parent, int last_ref)
3285 {
3286         struct btrfs_fs_info *fs_info = trans->fs_info;
3287         struct btrfs_ref generic_ref = { 0 };
3288         int ret;
3289
3290         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3291                                buf->start, buf->len, parent);
3292         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3293                             root_id, 0, false);
3294
3295         if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3296                 btrfs_ref_tree_mod(fs_info, &generic_ref);
3297                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3298                 BUG_ON(ret); /* -ENOMEM */
3299         }
3300
3301         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3302                 struct btrfs_block_group *cache;
3303                 bool must_pin = false;
3304
3305                 if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3306                         ret = check_ref_cleanup(trans, buf->start);
3307                         if (!ret) {
3308                                 btrfs_redirty_list_add(trans->transaction, buf);
3309                                 goto out;
3310                         }
3311                 }
3312
3313                 cache = btrfs_lookup_block_group(fs_info, buf->start);
3314
3315                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3316                         pin_down_extent(trans, cache, buf->start, buf->len, 1);
3317                         btrfs_put_block_group(cache);
3318                         goto out;
3319                 }
3320
3321                 /*
3322                  * If there are tree mod log users we may have recorded mod log
3323                  * operations for this node.  If we re-allocate this node we
3324                  * could replay operations on this node that happened when it
3325                  * existed in a completely different root.  For example if it
3326                  * was part of root A, then was reallocated to root B, and we
3327                  * are doing a btrfs_old_search_slot(root b), we could replay
3328                  * operations that happened when the block was part of root A,
3329                  * giving us an inconsistent view of the btree.
3330                  *
3331                  * We are safe from races here because at this point no other
3332                  * node or root points to this extent buffer, so if after this
3333                  * check a new tree mod log user joins we will not have an
3334                  * existing log of operations on this node that we have to
3335                  * contend with.
3336                  */
3337                 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
3338                         must_pin = true;
3339
3340                 if (must_pin || btrfs_is_zoned(fs_info)) {
3341                         btrfs_redirty_list_add(trans->transaction, buf);
3342                         pin_down_extent(trans, cache, buf->start, buf->len, 1);
3343                         btrfs_put_block_group(cache);
3344                         goto out;
3345                 }
3346
3347                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3348
3349                 btrfs_add_free_space(cache, buf->start, buf->len);
3350                 btrfs_free_reserved_bytes(cache, buf->len, 0);
3351                 btrfs_put_block_group(cache);
3352                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3353         }
3354 out:
3355         if (last_ref) {
3356                 /*
3357                  * Deleting the buffer, clear the corrupt flag since it doesn't
3358                  * matter anymore.
3359                  */
3360                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3361         }
3362 }
3363
3364 /* Can return -ENOMEM */
3365 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3366 {
3367         struct btrfs_fs_info *fs_info = trans->fs_info;
3368         int ret;
3369
3370         if (btrfs_is_testing(fs_info))
3371                 return 0;
3372
3373         /*
3374          * tree log blocks never actually go into the extent allocation
3375          * tree, just update pinning info and exit early.
3376          */
3377         if ((ref->type == BTRFS_REF_METADATA &&
3378              ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3379             (ref->type == BTRFS_REF_DATA &&
3380              ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
3381                 /* unlocks the pinned mutex */
3382                 btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3383                 ret = 0;
3384         } else if (ref->type == BTRFS_REF_METADATA) {
3385                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3386         } else {
3387                 ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3388         }
3389
3390         if (!((ref->type == BTRFS_REF_METADATA &&
3391                ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3392               (ref->type == BTRFS_REF_DATA &&
3393                ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
3394                 btrfs_ref_tree_mod(fs_info, ref);
3395
3396         return ret;
3397 }
3398
3399 enum btrfs_loop_type {
3400         /*
3401          * Start caching block groups but do not wait for progress or for them
3402          * to be done.
3403          */
3404         LOOP_CACHING_NOWAIT,
3405
3406         /*
3407          * Wait for the block group free_space >= the space we're waiting for if
3408          * the block group isn't cached.
3409          */
3410         LOOP_CACHING_WAIT,
3411
3412         /*
3413          * Allow allocations to happen from block groups that do not yet have a
3414          * size classification.
3415          */
3416         LOOP_UNSET_SIZE_CLASS,
3417
3418         /*
3419          * Allocate a chunk and then retry the allocation.
3420          */
3421         LOOP_ALLOC_CHUNK,
3422
3423         /*
3424          * Ignore the size class restrictions for this allocation.
3425          */
3426         LOOP_WRONG_SIZE_CLASS,
3427
3428         /*
3429          * Ignore the empty size, only try to allocate the number of bytes
3430          * needed for this allocation.
3431          */
3432         LOOP_NO_EMPTY_SIZE,
3433 };
3434
3435 static inline void
3436 btrfs_lock_block_group(struct btrfs_block_group *cache,
3437                        int delalloc)
3438 {
3439         if (delalloc)
3440                 down_read(&cache->data_rwsem);
3441 }
3442
3443 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3444                        int delalloc)
3445 {
3446         btrfs_get_block_group(cache);
3447         if (delalloc)
3448                 down_read(&cache->data_rwsem);
3449 }
3450
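/*
 * Grab the block group currently backing @cluster for the clustered
 * allocator.  For delalloc allocations the block group's data_rwsem must be
 * held too; if it cannot be taken without blocking, the refill_lock is
 * temporarily dropped and the cluster is re-checked, since its block group
 * may have changed in the meantime.  Returns with cluster->refill_lock held;
 * the result is NULL if the cluster has no block group.
 */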
3451 static struct btrfs_block_group *btrfs_lock_cluster(
3452                    struct btrfs_block_group *block_group,
3453                    struct btrfs_free_cluster *cluster,
3454                    int delalloc)
3455         __acquires(&cluster->refill_lock)
3456 {
3457         struct btrfs_block_group *used_bg = NULL;
3458
3459         spin_lock(&cluster->refill_lock);
3460         while (1) {
3461                 used_bg = cluster->block_group;
3462                 if (!used_bg)
3463                         return NULL;
3464
3465                 if (used_bg == block_group)
3466                         return used_bg;
3467
3468                 btrfs_get_block_group(used_bg);
3469
3470                 if (!delalloc)
3471                         return used_bg;
3472
3473                 if (down_read_trylock(&used_bg->data_rwsem))
3474                         return used_bg;
3475
3476                 spin_unlock(&cluster->refill_lock);
3477
3478                 /* We should only have one level of nesting. */
3479                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3480
3481                 spin_lock(&cluster->refill_lock);
3482                 if (used_bg == cluster->block_group)
3483                         return used_bg;
3484
3485                 up_read(&used_bg->data_rwsem);
3486                 btrfs_put_block_group(used_bg);
3487         }
3488 }
3489
3490 static inline void
3491 btrfs_release_block_group(struct btrfs_block_group *cache,
3492                          int delalloc)
3493 {
3494         if (delalloc)
3495                 up_read(&cache->data_rwsem);
3496         btrfs_put_block_group(cache);
3497 }
3498
3499 /*
3500  * Helper function for find_free_extent().
3501  *
3502  * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
3503  * Return >0 to inform the caller that we found nothing.
3504  * Return 0 when we have found a location and set ffe_ctl->found_offset.
3505  */
3506 static int find_free_extent_clustered(struct btrfs_block_group *bg,
3507                                       struct find_free_extent_ctl *ffe_ctl,
3508                                       struct btrfs_block_group **cluster_bg_ret)
3509 {
3510         struct btrfs_block_group *cluster_bg;
3511         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3512         u64 aligned_cluster;
3513         u64 offset;
3514         int ret;
3515
3516         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3517         if (!cluster_bg)
3518                 goto refill_cluster;
3519         if (cluster_bg != bg && (cluster_bg->ro ||
3520             !block_group_bits(cluster_bg, ffe_ctl->flags)))
3521                 goto release_cluster;
3522
3523         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3524                         ffe_ctl->num_bytes, cluster_bg->start,
3525                         &ffe_ctl->max_extent_size);
3526         if (offset) {
3527                 /* We have a block, we're done */
3528                 spin_unlock(&last_ptr->refill_lock);
3529                 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl);
3530                 *cluster_bg_ret = cluster_bg;
3531                 ffe_ctl->found_offset = offset;
3532                 return 0;
3533         }
3534         WARN_ON(last_ptr->block_group != cluster_bg);
3535
3536 release_cluster:
3537         /*
3538          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3539          * let's just skip it and let the allocator find whatever block it can
3540          * find. If we reach this point, we will have tried the cluster
3541          * allocator plenty of times and not have found anything, so we are
3542          * likely way too fragmented for the clustering stuff to find anything.
3543          *
3544          * However, if the cluster is taken from the current block group,
3545          * release the cluster first, so that we stand a better chance of
3546          * succeeding in the unclustered allocation.
3547          */
3548         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3549                 spin_unlock(&last_ptr->refill_lock);
3550                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3551                 return -ENOENT;
3552         }
3553
3554         /* This cluster didn't work out, free it and start over */
3555         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3556
3557         if (cluster_bg != bg)
3558                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3559
3560 refill_cluster:
3561         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3562                 spin_unlock(&last_ptr->refill_lock);
3563                 return -ENOENT;
3564         }
3565
3566         aligned_cluster = max_t(u64,
3567                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3568                         bg->full_stripe_len);
3569         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3570                         ffe_ctl->num_bytes, aligned_cluster);
3571         if (ret == 0) {
3572                 /* Now pull our allocation out of this cluster */
3573                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
3574                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
3575                                 &ffe_ctl->max_extent_size);
3576                 if (offset) {
3577                         /* We found one, proceed */
3578                         spin_unlock(&last_ptr->refill_lock);
3579                         ffe_ctl->found_offset = offset;
3580                         trace_btrfs_reserve_extent_cluster(bg, ffe_ctl);
3581                         return 0;
3582                 }
3583         }
3584         /*
3585          * At this point we either didn't find a cluster or we weren't able to
3586          * allocate a block from our cluster.  Free the cluster we've been
3587          * trying to use, and go to the next block group.
3588          */
3589         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3590         spin_unlock(&last_ptr->refill_lock);
3591         return 1;
3592 }
3593
3594 /*
3595  * Return >0 to inform the caller that we found nothing.
3596  * Return 0 when we found a free extent and set ffe_ctl->found_offset.
3597  */
3598 static int find_free_extent_unclustered(struct btrfs_block_group *bg,
3599                                         struct find_free_extent_ctl *ffe_ctl)
3600 {
3601         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3602         u64 offset;
3603
3604         /*
3605          * We are doing an unclustered allocation; set the fragmented flag so
3606          * we don't bother trying to set up a cluster again until we get more
3607          * space.
3608          */
3609         if (unlikely(last_ptr)) {
3610                 spin_lock(&last_ptr->lock);
3611                 last_ptr->fragmented = 1;
3612                 spin_unlock(&last_ptr->lock);
3613         }
3614         if (ffe_ctl->cached) {
3615                 struct btrfs_free_space_ctl *free_space_ctl;
3616
3617                 free_space_ctl = bg->free_space_ctl;
3618                 spin_lock(&free_space_ctl->tree_lock);
3619                 if (free_space_ctl->free_space <
3620                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3621                     ffe_ctl->empty_size) {
3622                         ffe_ctl->total_free_space = max_t(u64,
3623                                         ffe_ctl->total_free_space,
3624                                         free_space_ctl->free_space);
3625                         spin_unlock(&free_space_ctl->tree_lock);
3626                         return 1;
3627                 }
3628                 spin_unlock(&free_space_ctl->tree_lock);
3629         }
3630
3631         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3632                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
3633                         &ffe_ctl->max_extent_size);
3634         if (!offset)
3635                 return 1;
3636         ffe_ctl->found_offset = offset;
3637         return 0;
3638 }
3639
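/*
 * Clustered allocation policy: try allocating from the free space cluster
 * first and fall back to a plain free space search in this block group if
 * the cluster cannot be used.
 */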
3640 static int do_allocation_clustered(struct btrfs_block_group *block_group,
3641                                    struct find_free_extent_ctl *ffe_ctl,
3642                                    struct btrfs_block_group **bg_ret)
3643 {
3644         int ret;
3645
3646         /* We want to try and use the cluster allocator, so let's look there */
3647         if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
3648                 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
3649                 if (ret >= 0)
3650                         return ret;
3651                 /* ret == -ENOENT case falls through */
3652         }
3653
3654         return find_free_extent_unclustered(block_group, ffe_ctl);
3655 }
3656
3657 /*
3658  * Tree-log block group locking
3659  * ============================
3660  *
3661  * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
3662  * indicates the starting address of a block group, which is reserved only
3663  * for tree-log metadata.
3664  *
3665  * Lock nesting
3666  * ============
3667  *
3668  * space_info::lock
3669  *   block_group::lock
3670  *     fs_info::treelog_bg_lock
3671  */
3672
3673 /*
3674  * Simple allocator for a sequential-only block group. It only allows sequential
3675  * allocation. No need to play with trees. This function also reserves the
3676  * bytes as in btrfs_add_reserved_bytes.
3677  */
3678 static int do_allocation_zoned(struct btrfs_block_group *block_group,
3679                                struct find_free_extent_ctl *ffe_ctl,
3680                                struct btrfs_block_group **bg_ret)
3681 {
3682         struct btrfs_fs_info *fs_info = block_group->fs_info;
3683         struct btrfs_space_info *space_info = block_group->space_info;
3684         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3685         u64 start = block_group->start;
3686         u64 num_bytes = ffe_ctl->num_bytes;
3687         u64 avail;
3688         u64 bytenr = block_group->start;
3689         u64 log_bytenr;
3690         u64 data_reloc_bytenr;
3691         int ret = 0;
3692         bool skip = false;
3693
3694         ASSERT(btrfs_is_zoned(block_group->fs_info));
3695
3696         /*
3697          * Do not allow non-tree-log blocks in the dedicated tree-log block
3698          * group, and vice versa.
3699          */
3700         spin_lock(&fs_info->treelog_bg_lock);
3701         log_bytenr = fs_info->treelog_bg;
3702         if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
3703                            (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
3704                 skip = true;
3705         spin_unlock(&fs_info->treelog_bg_lock);
3706         if (skip)
3707                 return 1;
3708
3709         /*
3710          * Do not allow non-relocation blocks in the dedicated relocation block
3711          * group, and vice versa.
3712          */
3713         spin_lock(&fs_info->relocation_bg_lock);
3714         data_reloc_bytenr = fs_info->data_reloc_bg;
3715         if (data_reloc_bytenr &&
3716             ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
3717              (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
3718                 skip = true;
3719         spin_unlock(&fs_info->relocation_bg_lock);
3720         if (skip)
3721                 return 1;
3722
3723         /* Check RO and no space case before trying to activate it */
3724         spin_lock(&block_group->lock);
3725         if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
3726                 ret = 1;
3727                 /*
3728                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3729                  * Return the error after taking the locks.
3730                  */
3731         }
3732         spin_unlock(&block_group->lock);
3733
3734         /* Metadata block group is activated at write time. */
3735         if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
3736             !btrfs_zone_activate(block_group)) {
3737                 ret = 1;
3738                 /*
3739                  * May need to clear fs_info->{treelog,data_reloc}_bg.
3740                  * Return the error after taking the locks.
3741                  */
3742         }
3743
3744         spin_lock(&space_info->lock);
3745         spin_lock(&block_group->lock);
3746         spin_lock(&fs_info->treelog_bg_lock);
3747         spin_lock(&fs_info->relocation_bg_lock);
3748
3749         if (ret)
3750                 goto out;
3751
3752         ASSERT(!ffe_ctl->for_treelog ||
3753                block_group->start == fs_info->treelog_bg ||
3754                fs_info->treelog_bg == 0);
3755         ASSERT(!ffe_ctl->for_data_reloc ||
3756                block_group->start == fs_info->data_reloc_bg ||
3757                fs_info->data_reloc_bg == 0);
3758
3759         if (block_group->ro ||
3760             (!ffe_ctl->for_data_reloc &&
3761              test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) {
3762                 ret = 1;
3763                 goto out;
3764         }
3765
3766         /*
3767          * Do not allow a currently used block group to be the tree-log dedicated
3768          * block group.
3769          */
3770         if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
3771             (block_group->used || block_group->reserved)) {
3772                 ret = 1;
3773                 goto out;
3774         }
3775
3776         /*
3777          * Do not allow a currently used block group to be the data relocation
3778          * dedicated block group.
3779          */
3780         if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
3781             (block_group->used || block_group->reserved)) {
3782                 ret = 1;
3783                 goto out;
3784         }
3785
3786         WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
3787         avail = block_group->zone_capacity - block_group->alloc_offset;
3788         if (avail < num_bytes) {
3789                 if (ffe_ctl->max_extent_size < avail) {
3790                         /*
3791                          * With the sequential allocator, free space is always
3792                          * contiguous.
3793                          */
3794                         ffe_ctl->max_extent_size = avail;
3795                         ffe_ctl->total_free_space = avail;
3796                 }
3797                 ret = 1;
3798                 goto out;
3799         }
3800
3801         if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
3802                 fs_info->treelog_bg = block_group->start;
3803
3804         if (ffe_ctl->for_data_reloc) {
3805                 if (!fs_info->data_reloc_bg)
3806                         fs_info->data_reloc_bg = block_group->start;
3807                 /*
3808                  * Do not allow allocations from this block group, unless it is
3809                  * for data relocation. Compared to increasing the ->ro, setting
3810                  * the ->zoned_data_reloc_ongoing flag still allows nocow
3811                  * writers to come in. See btrfs_inc_nocow_writers().
3812                  *
3813                  * We need to disable allocations here to avoid allocating a
3814                  * regular (non-relocation data) extent. With a mix of relocation
3815                  * extents and regular extents, we could dispatch WRITE commands
3816                  * (for relocation extents) and ZONE APPEND commands (for
3817                  * regular extents) at the same time to the same zone, which
3818                  * would easily break the write pointer.
3819                  *
3820                  * Also, this flag prevents this block group from being zone finished.
3821                  */
3822                 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
3823         }
3824
3825         ffe_ctl->found_offset = start + block_group->alloc_offset;
3826         block_group->alloc_offset += num_bytes;
3827         spin_lock(&ctl->tree_lock);
3828         ctl->free_space -= num_bytes;
3829         spin_unlock(&ctl->tree_lock);
3830
3831         /*
3832          * We do not check if found_offset is aligned to stripesize. The
3833                  * address is rewritten anyway when using zone append writes.
3834          */
3835
3836         ffe_ctl->search_start = ffe_ctl->found_offset;
3837
3838 out:
3839         if (ret && ffe_ctl->for_treelog)
3840                 fs_info->treelog_bg = 0;
3841         if (ret && ffe_ctl->for_data_reloc)
3842                 fs_info->data_reloc_bg = 0;
3843         spin_unlock(&fs_info->relocation_bg_lock);
3844         spin_unlock(&fs_info->treelog_bg_lock);
3845         spin_unlock(&block_group->lock);
3846         spin_unlock(&space_info->lock);
3847         return ret;
3848 }
3849
3850 static int do_allocation(struct btrfs_block_group *block_group,
3851                          struct find_free_extent_ctl *ffe_ctl,
3852                          struct btrfs_block_group **bg_ret)
3853 {
3854         switch (ffe_ctl->policy) {
3855         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3856                 return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
3857         case BTRFS_EXTENT_ALLOC_ZONED:
3858                 return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
3859         default:
3860                 BUG();
3861         }
3862 }
3863
3864 static void release_block_group(struct btrfs_block_group *block_group,
3865                                 struct find_free_extent_ctl *ffe_ctl,
3866                                 int delalloc)
3867 {
3868         switch (ffe_ctl->policy) {
3869         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3870                 ffe_ctl->retry_uncached = false;
3871                 break;
3872         case BTRFS_EXTENT_ALLOC_ZONED:
3873                 /* Nothing to do */
3874                 break;
3875         default:
3876                 BUG();
3877         }
3878
3879         BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
3880                ffe_ctl->index);
3881         btrfs_release_block_group(block_group, delalloc);
3882 }
3883
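/*
 * When the cluster was not used for this allocation, remember where the
 * allocation ended up in the cluster's window_start so later searches can
 * start from around here.
 */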
3884 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
3885                                    struct btrfs_key *ins)
3886 {
3887         struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
3888
3889         if (!ffe_ctl->use_cluster && last_ptr) {
3890                 spin_lock(&last_ptr->lock);
3891                 last_ptr->window_start = ins->objectid;
3892                 spin_unlock(&last_ptr->lock);
3893         }
3894 }
3895
3896 static void found_extent(struct find_free_extent_ctl *ffe_ctl,
3897                          struct btrfs_key *ins)
3898 {
3899         switch (ffe_ctl->policy) {
3900         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3901                 found_extent_clustered(ffe_ctl, ins);
3902                 break;
3903         case BTRFS_EXTENT_ALLOC_ZONED:
3904                 /* Nothing to do */
3905                 break;
3906         default:
3907                 BUG();
3908         }
3909 }
3910
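/*
 * For zoned filesystems, decide whether allocating a new chunk can help an
 * allocation that found no space: return 0 to allow a chunk allocation,
 * -EAGAIN to make the caller retry and write out part of the region first,
 * or -ENOSPC when a new chunk would not help.
 */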
3911 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
3912                                     struct find_free_extent_ctl *ffe_ctl)
3913 {
3914         /* Block group's activeness is not a requirement for METADATA block groups. */
3915         if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA))
3916                 return 0;
3917
3918         /* If we can activate a new zone, just allocate a chunk and use it */
3919         if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
3920                 return 0;
3921
3922         /*
3923          * We already reached the max active zones. Try to finish one block
3924          * group to make room for a new block group. This is only possible
3925          * for a data block group because btrfs_zone_finish() may need to wait
3926          * for a running transaction which can cause a deadlock for metadata
3927          * allocation.
3928          */
3929         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
3930                 int ret = btrfs_zone_finish_one_bg(fs_info);
3931
3932                 if (ret == 1)
3933                         return 0;
3934                 else if (ret < 0)
3935                         return ret;
3936         }
3937
3938         /*
3939          * If we have enough free space left in an already active block group
3940          * and we can't activate any other zone now, do not allow allocating a
3941          * new chunk and let find_free_extent() retry with a smaller size.
3942          */
3943         if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
3944                 return -ENOSPC;
3945
3946         /*
3947          * Not even min_alloc_size is left in any block group. Since we cannot
3948          * activate a new block group, allocating one may not help. Tell the
3949          * caller to try again and hope it makes progress by writing out some
3950          * parts of the region. That is only possible for data block groups,
3951          * where a part of the region can be written.
3952          */
3953         if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
3954                 return -EAGAIN;
3955
3956         /*
3957          * We cannot activate a new block group and there is not enough space
3958          * left in any block group. Allocating a new block group may not help,
3959          * but there is nothing else to do anyway, so let's go with it.
3960          */
3961         return 0;
3962 }
3963
3964 static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
3965                               struct find_free_extent_ctl *ffe_ctl)
3966 {
3967         switch (ffe_ctl->policy) {
3968         case BTRFS_EXTENT_ALLOC_CLUSTERED:
3969                 return 0;
3970         case BTRFS_EXTENT_ALLOC_ZONED:
3971                 return can_allocate_chunk_zoned(fs_info, ffe_ctl);
3972         default:
3973                 BUG();
3974         }
3975 }
3976
3977 /*
3978  * Return >0 means the caller needs to re-search for a free extent.
3979  * Return 0 means we have the needed free extent.
3980  * Return <0 means we failed to locate any free extent.
3981  */
3982 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3983                                         struct btrfs_key *ins,
3984                                         struct find_free_extent_ctl *ffe_ctl,
3985                                         bool full_search)
3986 {
3987         struct btrfs_root *root = fs_info->chunk_root;
3988         int ret;
3989
3990         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3991             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3992                 ffe_ctl->orig_have_caching_bg = true;
3993
3994         if (ins->objectid) {
3995                 found_extent(ffe_ctl, ins);
3996                 return 0;
3997         }
3998
3999         if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
4000                 return 1;
4001
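             /*
              * Try the next raid index (the next block group list of this
              * space_info) before escalating to the next loop stage.
              */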
4002         ffe_ctl->index++;
4003         if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
4004                 return 1;
4005
4006         /* See the comments for btrfs_loop_type for an explanation of the phases. */
4007         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
4008                 ffe_ctl->index = 0;
4009                 /*
4010                  * We want to skip the LOOP_CACHING_WAIT step if we don't have
4011                  * any uncached bgs and we've already done a full search
4012                  * through.
4013                  */
4014                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
4015                     (!ffe_ctl->orig_have_caching_bg && full_search))
4016                         ffe_ctl->loop++;
4017                 ffe_ctl->loop++;
4018
4019                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
4020                         struct btrfs_trans_handle *trans;
4021                         int exist = 0;
4022
4023                         /* Check if allocation policy allows to create a new chunk */
4024                         ret = can_allocate_chunk(fs_info, ffe_ctl);
4025                         if (ret)
4026                                 return ret;
4027
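                             /*
                              * If the task already has a transaction attached
                              * (current->journal_info), reuse it; otherwise join
                              * a new one, which we must end ourselves below.
                              */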
4028                         trans = current->journal_info;
4029                         if (trans)
4030                                 exist = 1;
4031                         else
4032                                 trans = btrfs_join_transaction(root);
4033
4034                         if (IS_ERR(trans)) {
4035                                 ret = PTR_ERR(trans);
4036                                 return ret;
4037                         }
4038
4039                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
4040                                                 CHUNK_ALLOC_FORCE_FOR_EXTENT);
4041
4042                         /* Do not bail out on ENOSPC since we can do more. */
4043                         if (ret == -ENOSPC) {
4044                                 ret = 0;
4045                                 ffe_ctl->loop++;
4046                         } else if (ret < 0)
4048                                 btrfs_abort_transaction(trans, ret);
4049                         else
4050                                 ret = 0;
4051                         if (!exist)
4052                                 btrfs_end_transaction(trans);
4053                         if (ret)
4054                                 return ret;
4055                 }
4056
4057                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
4058                         if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
4059                                 return -ENOSPC;
4060
4061                         /*
4062                          * Don't loop again if we already have no empty_size and
4063                          * no empty_cluster.
4064                          */
4065                         if (ffe_ctl->empty_size == 0 &&
4066                             ffe_ctl->empty_cluster == 0)
4067                                 return -ENOSPC;
4068                         ffe_ctl->empty_size = 0;
4069                         ffe_ctl->empty_cluster = 0;
4070                 }
4071                 return 1;
4072         }
4073         return -ENOSPC;
4074 }
4075
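     /*
      * Check whether @bg's size class is acceptable for this allocation.
      * The requirement is relaxed as ffe_ctl->loop advances: groups with an
      * unset size class and eventually any size class get accepted.  Zoned
      * allocation does not use size classes at all.
      */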
4076 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
4077                                               struct btrfs_block_group *bg)
4078 {
4079         if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
4080                 return true;
4081         if (!btrfs_block_group_should_use_size_class(bg))
4082                 return true;
4083         if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
4084                 return true;
4085         if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
4086             bg->size_class == BTRFS_BG_SZ_NONE)
4087                 return true;
4088         return ffe_ctl->size_class == bg->size_class;
4089 }
4090
4091 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
4092                                         struct find_free_extent_ctl *ffe_ctl,
4093                                         struct btrfs_space_info *space_info,
4094                                         struct btrfs_key *ins)
4095 {
4096         /*
4097          * If our free space is heavily fragmented we may not be able to make
4098          * big contiguous allocations, so instead of doing the expensive search
4099          * for free space, simply return ENOSPC with our max_extent_size so we
4100          * can go ahead and search for a more manageable chunk.
4101          *
4102          * If our max_extent_size is large enough for our allocation simply
4103          * disable clustering since we will likely not be able to find enough
4104          * space to create a cluster and induce latency trying.
4105          */
4106         if (space_info->max_extent_size) {
4107                 spin_lock(&space_info->lock);
4108                 if (space_info->max_extent_size &&
4109                     ffe_ctl->num_bytes > space_info->max_extent_size) {
4110                         ins->offset = space_info->max_extent_size;
4111                         spin_unlock(&space_info->lock);
4112                         return -ENOSPC;
4113                 } else if (space_info->max_extent_size) {
4114                         ffe_ctl->use_cluster = false;
4115                 }
4116                 spin_unlock(&space_info->lock);
4117         }
4118
4119         ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
4120                                                &ffe_ctl->empty_cluster);
4121         if (ffe_ctl->last_ptr) {
4122                 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
4123
4124                 spin_lock(&last_ptr->lock);
4125                 if (last_ptr->block_group)
4126                         ffe_ctl->hint_byte = last_ptr->window_start;
4127                 if (last_ptr->fragmented) {
4128                         /*
4129                          * We still set window_start so we can keep track of the
4130                          * last place we found an allocation to try and save
4131                          * some time.
4132                          */
4133                         ffe_ctl->hint_byte = last_ptr->window_start;
4134                         ffe_ctl->use_cluster = false;
4135                 }
4136                 spin_unlock(&last_ptr->lock);
4137         }
4138
4139         return 0;
4140 }
4141
4142 static int prepare_allocation(struct btrfs_fs_info *fs_info,
4143                               struct find_free_extent_ctl *ffe_ctl,
4144                               struct btrfs_space_info *space_info,
4145                               struct btrfs_key *ins)
4146 {
4147         switch (ffe_ctl->policy) {
4148         case BTRFS_EXTENT_ALLOC_CLUSTERED:
4149                 return prepare_allocation_clustered(fs_info, ffe_ctl,
4150                                                     space_info, ins);
4151         case BTRFS_EXTENT_ALLOC_ZONED:
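                     /*
                      * Zoned mode keeps dedicated block groups for the tree log
                      * and for data relocation; hint the search at them when
                      * they exist.
                      */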
4152                 if (ffe_ctl->for_treelog) {
4153                         spin_lock(&fs_info->treelog_bg_lock);
4154                         if (fs_info->treelog_bg)
4155                                 ffe_ctl->hint_byte = fs_info->treelog_bg;
4156                         spin_unlock(&fs_info->treelog_bg_lock);
4157                 }
4158                 if (ffe_ctl->for_data_reloc) {
4159                         spin_lock(&fs_info->relocation_bg_lock);
4160                         if (fs_info->data_reloc_bg)
4161                                 ffe_ctl->hint_byte = fs_info->data_reloc_bg;
4162                         spin_unlock(&fs_info->relocation_bg_lock);
4163                 }
4164                 return 0;
4165         default:
4166                 BUG();
4167         }
4168 }
4169
4170 /*
4171  * Walks the btree of allocated extents and finds a hole of a given size.
4172  * The key ins is changed to record the hole:
4173  * ins->objectid == start position
4174  * ins->type == BTRFS_EXTENT_ITEM_KEY
4175  * ins->offset == the size of the hole.
4176  * Any available blocks before search_start are skipped.
4177  *
4178  * If there is no suitable free space, we will record the max size of
4179  * the free space extent currently available.
4180  *
4181  * The overall logic and call chain:
4182  *
4183  * find_free_extent()
4184  * |- Iterate through all block groups
4185  * |  |- Get a valid block group
4186  * |  |- Try to do clustered allocation in that block group
4187  * |  |- Try to do unclustered allocation in that block group
4188  * |  |- Check if the result is valid
4189  * |  |  |- If valid, then exit
4190  * |  |- Jump to next block group
4191  * |
4192  * |- Push harder to find free extents
4193  *    |- If not found, re-iterate all block groups
4194  */
4195 static noinline int find_free_extent(struct btrfs_root *root,
4196                                      struct btrfs_key *ins,
4197                                      struct find_free_extent_ctl *ffe_ctl)
4198 {
4199         struct btrfs_fs_info *fs_info = root->fs_info;
4200         int ret = 0;
4201         int cache_block_group_error = 0;
4202         struct btrfs_block_group *block_group = NULL;
4203         struct btrfs_space_info *space_info;
4204         bool full_search = false;
4205
4206         WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
4207
4208         ffe_ctl->search_start = 0;
4209         /* For clustered allocation */
4210         ffe_ctl->empty_cluster = 0;
4211         ffe_ctl->last_ptr = NULL;
4212         ffe_ctl->use_cluster = true;
4213         ffe_ctl->have_caching_bg = false;
4214         ffe_ctl->orig_have_caching_bg = false;
4215         ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
4216         ffe_ctl->loop = 0;
4217         ffe_ctl->retry_uncached = false;
4218         ffe_ctl->cached = 0;
4219         ffe_ctl->max_extent_size = 0;
4220         ffe_ctl->total_free_space = 0;
4221         ffe_ctl->found_offset = 0;
4222         ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4223         ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes);
4224
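             /*
              * Zoned filesystems use their own allocation policy instead of
              * clustered allocation.
              */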
4225         if (btrfs_is_zoned(fs_info))
4226                 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;
4227
4228         ins->type = BTRFS_EXTENT_ITEM_KEY;
4229         ins->objectid = 0;
4230         ins->offset = 0;
4231
4232         trace_find_free_extent(root, ffe_ctl);
4233
4234         space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
4235         if (!space_info) {
4236                 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
4237                 return -ENOSPC;
4238         }
4239
4240         ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
4241         if (ret < 0)
4242                 return ret;
4243
4244         ffe_ctl->search_start = max(ffe_ctl->search_start,
4245                                     first_logical_byte(fs_info));
4246         ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
4247         if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
4248                 block_group = btrfs_lookup_block_group(fs_info,
4249                                                        ffe_ctl->search_start);
4250                 /*
4251                  * we don't want to use the block group if it doesn't match our
4252                  * allocation bits, or if it's not cached.
4253                  *
4254                  * However if we are re-searching with an ideal block group
4255                  * picked out then we don't care that the block group is cached.
4256                  */
4257                 if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
4258                     block_group->cached != BTRFS_CACHE_NO) {
4259                         down_read(&space_info->groups_sem);
4260                         if (list_empty(&block_group->list) ||
4261                             block_group->ro) {
4262                                 /*
4263                                  * Someone is removing this block group,
4264                                  * so we can't jump to the have_block_group
4265                                  * label because our list pointers are not
4266                                  * valid.
4267                                  */
4268                                 btrfs_put_block_group(block_group);
4269                                 up_read(&space_info->groups_sem);
4270                         } else {
4271                                 ffe_ctl->index = btrfs_bg_flags_to_raid_index(
4272                                                         block_group->flags);
4273                                 btrfs_lock_block_group(block_group,
4274                                                        ffe_ctl->delalloc);
4275                                 ffe_ctl->hinted = true;
4276                                 goto have_block_group;
4277                         }
4278                 } else if (block_group) {
4279                         btrfs_put_block_group(block_group);
4280                 }
4281         }
4282 search:
4283         trace_find_free_extent_search_loop(root, ffe_ctl);
4284         ffe_ctl->have_caching_bg = false;
4285         if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
4286             ffe_ctl->index == 0)
4287                 full_search = true;
4288         down_read(&space_info->groups_sem);
4289         list_for_each_entry(block_group,
4290                             &space_info->block_groups[ffe_ctl->index], list) {
4291                 struct btrfs_block_group *bg_ret;
4292
4293                 ffe_ctl->hinted = false;
4294                 /* If the block group is read-only, we can skip it entirely. */
4295                 if (unlikely(block_group->ro)) {
4296                         if (ffe_ctl->for_treelog)
4297                                 btrfs_clear_treelog_bg(block_group);
4298                         if (ffe_ctl->for_data_reloc)
4299                                 btrfs_clear_data_reloc_bg(block_group);
4300                         continue;
4301                 }
4302
4303                 btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
4304                 ffe_ctl->search_start = block_group->start;
4305
4306                 /*
4307                  * this can happen if we end up cycling through all the
4308                  * raid types, but we want to make sure we only allocate
4309                  * for the proper type.
4310                  */
4311                 if (!block_group_bits(block_group, ffe_ctl->flags)) {
4312                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
4313                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
4314                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
4315                                 BTRFS_BLOCK_GROUP_RAID10;
4316
4317                         /*
4318                          * if they asked for extra copies and this block group
4319                          * doesn't provide them, bail.  This does allow us to
4320                          * fill raid0 from raid1.
4321                          */
4322                         if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
4323                                 goto loop;
4324
4325                         /*
4326                          * This block group has different flags than we want.
4327                          * It's possible that we have the MIXED_GROUP flag but no
4328                          * block group is mixed.  Just skip such a block group.
4329                          */
4330                         btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4331                         continue;
4332                 }
4333
4334 have_block_group:
4335                 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
4336                 ffe_ctl->cached = btrfs_block_group_done(block_group);
4337                 if (unlikely(!ffe_ctl->cached)) {
4338                         ffe_ctl->have_caching_bg = true;
4339                         ret = btrfs_cache_block_group(block_group, false);
4340
4341                         /*
4342                          * If we get ENOMEM here or something else we want to
4343                          * try other block groups, because it may not be fatal.
4344                          * However if we can't find anything else we need to
4345                          * save our return here so that we return the actual
4346                          * error that caused problems, not ENOSPC.
4347                          */
4348                         if (ret < 0) {
4349                                 if (!cache_block_group_error)
4350                                         cache_block_group_error = ret;
4351                                 ret = 0;
4352                                 goto loop;
4353                         }
4354                         ret = 0;
4355                 }
4356
4357                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
4358                         if (!cache_block_group_error)
4359                                 cache_block_group_error = -EIO;
4360                         goto loop;
4361                 }
4362
4363                 if (!find_free_extent_check_size_class(ffe_ctl, block_group))
4364                         goto loop;
4365
4366                 bg_ret = NULL;
4367                 ret = do_allocation(block_group, ffe_ctl, &bg_ret);
4368                 if (ret > 0)
4369                         goto loop;
4370
4371                 if (bg_ret && bg_ret != block_group) {
4372                         btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4373                         block_group = bg_ret;
4374                 }
4375
4376                 /* Checks */
4377                 ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
4378                                                  fs_info->stripesize);
4379
4380                 /* move on to the next group */
4381                 if (ffe_ctl->search_start + ffe_ctl->num_bytes >
4382                     block_group->start + block_group->length) {
4383                         btrfs_add_free_space_unused(block_group,
4384                                             ffe_ctl->found_offset,
4385                                             ffe_ctl->num_bytes);
4386                         goto loop;
4387                 }
4388
4389                 if (ffe_ctl->found_offset < ffe_ctl->search_start)
4390                         btrfs_add_free_space_unused(block_group,
4391                                         ffe_ctl->found_offset,
4392                                         ffe_ctl->search_start - ffe_ctl->found_offset);
4393
4394                 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
4395                                                ffe_ctl->num_bytes,
4396                                                ffe_ctl->delalloc,
4397                                                ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
4398                 if (ret == -EAGAIN) {
4399                         btrfs_add_free_space_unused(block_group,
4400                                         ffe_ctl->found_offset,
4401                                         ffe_ctl->num_bytes);
4402                         goto loop;
4403                 }
4404                 btrfs_inc_block_group_reservations(block_group);
4405
4406                 /* We are all good, let's return. */
4407                 ins->objectid = ffe_ctl->search_start;
4408                 ins->offset = ffe_ctl->num_bytes;
4409
4410                 trace_btrfs_reserve_extent(block_group, ffe_ctl);
4411                 btrfs_release_block_group(block_group, ffe_ctl->delalloc);
4412                 break;
4413 loop:
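                     /*
                      * If this block group is still caching and we have not yet
                      * retried it, wait for the caching thread to make enough
                      * progress and give the group one more try before moving on.
                      */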
4414                 if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
4415                     !ffe_ctl->retry_uncached) {
4416                         ffe_ctl->retry_uncached = true;
4417                         btrfs_wait_block_group_cache_progress(block_group,
4418                                                 ffe_ctl->num_bytes +
4419                                                 ffe_ctl->empty_cluster +
4420                                                 ffe_ctl->empty_size);
4421                         goto have_block_group;
4422                 }
4423                 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
4424                 cond_resched();
4425         }
4426         up_read(&space_info->groups_sem);
4427
4428         ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
4429         if (ret > 0)
4430                 goto search;
4431
4432         if (ret == -ENOSPC && !cache_block_group_error) {
4433                 /*
4434                  * Use ffe_ctl->total_free_space as fallback if we can't find
4435                  * any contiguous hole.
4436                  */
4437                 if (!ffe_ctl->max_extent_size)
4438                         ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
4439                 spin_lock(&space_info->lock);
4440                 space_info->max_extent_size = ffe_ctl->max_extent_size;
4441                 spin_unlock(&space_info->lock);
4442                 ins->offset = ffe_ctl->max_extent_size;
4443         } else if (ret == -ENOSPC) {
4444                 ret = cache_block_group_error;
4445         }
4446         return ret;
4447 }
4448
4449 /*
4450  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4451  *                        hole that is at least as big as @num_bytes.
4452  *
4453  * @root           -    The root that will contain this extent
4454  *
4455  * @ram_bytes      -    The amount of space in ram that @num_bytes takes. This
4456  *                      is used for accounting purposes. This value differs
4457  *                      from @num_bytes only in the case of compressed extents.
4458  *
4459  * @num_bytes      -    Number of bytes to allocate on-disk.
4460  *
4461  * @min_alloc_size -    Indicates the minimum amount of space that the
4462  *                      allocator should try to satisfy. In some cases
4463  *                      @num_bytes may be larger than what is required and if
4464  *                      the filesystem is fragmented then the allocation fails.
4465  *                      However, the presence of @min_alloc_size gives a
4466  *                      chance to try to satisfy the smaller allocation.
4467  *
4468  * @empty_size     -    A hint that you plan on doing more COW. This is the
4469  *                      size in bytes the allocator should try to find free
4470  *                      next to the block it returns.  This is just a hint and
4471  *                      may be ignored by the allocator.
4472  *
4473  * @hint_byte      -    Hint to the allocator to start searching above the byte
4474  *                      address passed. It might be ignored.
4475  *
4476  * @ins            -    This key is modified to record the found hole. It will
4477  *                      have the following values:
4478  *                      ins->objectid == start position
4479  *                      ins->type == BTRFS_EXTENT_ITEM_KEY
4480  *                      ins->offset == the size of the hole.
4481  *
4482  * @is_data        -    Boolean flag indicating whether an extent is
4483  *                      allocated for data (true) or metadata (false)
4484  *
4485  * @delalloc       -    Boolean flag indicating whether this allocation is for
4486  *                      delalloc or not. If 'true' data_rwsem of block groups
4487  *                      is going to be acquired.
4488  *
4489  *
4490  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4491  * case -ENOSPC is returned then @ins->offset will contain the size of the
4492  * largest available hole the allocator managed to find.
4493  */
4494 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4495                          u64 num_bytes, u64 min_alloc_size,
4496                          u64 empty_size, u64 hint_byte,
4497                          struct btrfs_key *ins, int is_data, int delalloc)
4498 {
4499         struct btrfs_fs_info *fs_info = root->fs_info;
4500         struct find_free_extent_ctl ffe_ctl = {};
4501         bool final_tried = num_bytes == min_alloc_size;
4502         u64 flags;
4503         int ret;
4504         bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4505         bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
4506
4507         flags = get_alloc_profile_by_root(root, is_data);
4508 again:
4509         WARN_ON(num_bytes < fs_info->sectorsize);
4510
4511         ffe_ctl.ram_bytes = ram_bytes;
4512         ffe_ctl.num_bytes = num_bytes;
4513         ffe_ctl.min_alloc_size = min_alloc_size;
4514         ffe_ctl.empty_size = empty_size;
4515         ffe_ctl.flags = flags;
4516         ffe_ctl.delalloc = delalloc;
4517         ffe_ctl.hint_byte = hint_byte;
4518         ffe_ctl.for_treelog = for_treelog;
4519         ffe_ctl.for_data_reloc = for_data_reloc;
4520
4521         ret = find_free_extent(root, ins, &ffe_ctl);
4522         if (!ret && !is_data) {
4523                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4524         } else if (ret == -ENOSPC) {
4525                 if (!final_tried && ins->offset) {
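                             /*
                              * Halve the request, capped by the largest free
                              * extent the allocator reported back, but never go
                              * below min_alloc_size, and retry.
                              */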
4526                         num_bytes = min(num_bytes >> 1, ins->offset);
4527                         num_bytes = round_down(num_bytes,
4528                                                fs_info->sectorsize);
4529                         num_bytes = max(num_bytes, min_alloc_size);
4530                         ram_bytes = num_bytes;
4531                         if (num_bytes == min_alloc_size)
4532                                 final_tried = true;
4533                         goto again;
4534                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4535                         struct btrfs_space_info *sinfo;
4536
4537                         sinfo = btrfs_find_space_info(fs_info, flags);
4538                         btrfs_err(fs_info,
4539         "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
4540                                   flags, num_bytes, for_treelog, for_data_reloc);
4541                         if (sinfo)
4542                                 btrfs_dump_space_info(fs_info, sinfo,
4543                                                       num_bytes, 1);
4544                 }
4545         }
4546
4547         return ret;
4548 }
4549
4550 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4551                                u64 start, u64 len, int delalloc)
4552 {
4553         struct btrfs_block_group *cache;
4554
4555         cache = btrfs_lookup_block_group(fs_info, start);
4556         if (!cache) {
4557                 btrfs_err(fs_info, "Unable to find block group for %llu",
4558                           start);
4559                 return -ENOSPC;
4560         }
4561
4562         btrfs_add_free_space(cache, start, len);
4563         btrfs_free_reserved_bytes(cache, len, delalloc);
4564         trace_btrfs_reserved_extent_free(fs_info, start, len);
4565
4566         btrfs_put_block_group(cache);
4567         return 0;
4568 }
4569
4570 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
4571                               u64 len)
4572 {
4573         struct btrfs_block_group *cache;
4574         int ret = 0;
4575
4576         cache = btrfs_lookup_block_group(trans->fs_info, start);
4577         if (!cache) {
4578                 btrfs_err(trans->fs_info, "unable to find block group for %llu",
4579                           start);
4580                 return -ENOSPC;
4581         }
4582
4583         ret = pin_down_extent(trans, cache, start, len, 1);
4584         btrfs_put_block_group(cache);
4585         return ret;
4586 }
4587
4588 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
4589                                  u64 num_bytes)
4590 {
4591         struct btrfs_fs_info *fs_info = trans->fs_info;
4592         int ret;
4593
4594         ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
4595         if (ret)
4596                 return ret;
4597
4598         ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
4599         if (ret) {
4600                 ASSERT(!ret);
4601                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4602                           bytenr, num_bytes);
4603                 return ret;
4604         }
4605
4606         trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
4607         return 0;
4608 }
4609
4610 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4611                                       u64 parent, u64 root_objectid,
4612                                       u64 flags, u64 owner, u64 offset,
4613                                       struct btrfs_key *ins, int ref_mod)
4614 {
4615         struct btrfs_fs_info *fs_info = trans->fs_info;
4616         struct btrfs_root *extent_root;
4617         int ret;
4618         struct btrfs_extent_item *extent_item;
4619         struct btrfs_extent_inline_ref *iref;
4620         struct btrfs_path *path;
4621         struct extent_buffer *leaf;
4622         int type;
4623         u32 size;
4624
4625         if (parent > 0)
4626                 type = BTRFS_SHARED_DATA_REF_KEY;
4627         else
4628                 type = BTRFS_EXTENT_DATA_REF_KEY;
4629
4630         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4631
4632         path = btrfs_alloc_path();
4633         if (!path)
4634                 return -ENOMEM;
4635
4636         extent_root = btrfs_extent_root(fs_info, ins->objectid);
4637         ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
4638         if (ret) {
4639                 btrfs_free_path(path);
4640                 return ret;
4641         }
4642
4643         leaf = path->nodes[0];
4644         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4645                                      struct btrfs_extent_item);
4646         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4647         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4648         btrfs_set_extent_flags(leaf, extent_item,
4649                                flags | BTRFS_EXTENT_FLAG_DATA);
4650
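             /* The inline backref immediately follows the extent item in the leaf. */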
4651         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4652         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4653         if (parent > 0) {
4654                 struct btrfs_shared_data_ref *ref;
4655                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4656                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4657                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4658         } else {
4659                 struct btrfs_extent_data_ref *ref;
4660                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4661                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4662                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4663                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4664                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4665         }
4666
4667         btrfs_mark_buffer_dirty(trans, path->nodes[0]);
4668         btrfs_free_path(path);
4669
4670         return alloc_reserved_extent(trans, ins->objectid, ins->offset);
4671 }
4672
4673 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4674                                      struct btrfs_delayed_ref_node *node,
4675                                      struct btrfs_delayed_extent_op *extent_op)
4676 {
4677         struct btrfs_fs_info *fs_info = trans->fs_info;
4678         struct btrfs_root *extent_root;
4679         int ret;
4680         struct btrfs_extent_item *extent_item;
4681         struct btrfs_key extent_key;
4682         struct btrfs_tree_block_info *block_info;
4683         struct btrfs_extent_inline_ref *iref;
4684         struct btrfs_path *path;
4685         struct extent_buffer *leaf;
4686         struct btrfs_delayed_tree_ref *ref;
4687         u32 size = sizeof(*extent_item) + sizeof(*iref);
4688         u64 flags = extent_op->flags_to_set;
4689         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4690
4691         ref = btrfs_delayed_node_to_tree_ref(node);
4692
4693         extent_key.objectid = node->bytenr;
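             /*
              * With the SKINNY_METADATA feature the tree level is stored in the
              * key offset and no btrfs_tree_block_info is needed; otherwise the
              * key offset is the byte count and the item carries a tree block
              * info structure.
              */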
4694         if (skinny_metadata) {
4695                 extent_key.offset = ref->level;
4696                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4697         } else {
4698                 extent_key.offset = node->num_bytes;
4699                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4700                 size += sizeof(*block_info);
4701         }
4702
4703         path = btrfs_alloc_path();
4704         if (!path)
4705                 return -ENOMEM;
4706
4707         extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
4708         ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
4709                                       size);
4710         if (ret) {
4711                 btrfs_free_path(path);
4712                 return ret;
4713         }
4714
4715         leaf = path->nodes[0];
4716         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4717                                      struct btrfs_extent_item);
4718         btrfs_set_extent_refs(leaf, extent_item, 1);
4719         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4720         btrfs_set_extent_flags(leaf, extent_item,
4721                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4722
4723         if (skinny_metadata) {
4724                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4725         } else {
4726                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4727                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4728                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
4729                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4730         }
4731
4732         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4733                 btrfs_set_extent_inline_ref_type(leaf, iref,
4734                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4735                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4736         } else {
4737                 btrfs_set_extent_inline_ref_type(leaf, iref,
4738                                                  BTRFS_TREE_BLOCK_REF_KEY);
4739                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4740         }
4741
4742         btrfs_mark_buffer_dirty(trans, leaf);
4743         btrfs_free_path(path);
4744
4745         return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
4746 }
4747
4748 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4749                                      struct btrfs_root *root, u64 owner,
4750                                      u64 offset, u64 ram_bytes,
4751                                      struct btrfs_key *ins)
4752 {
4753         struct btrfs_ref generic_ref = { 0 };
4754
4755         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4756
4757         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4758                                ins->objectid, ins->offset, 0);
4759         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
4760                             offset, 0, false);
4761         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4762
4763         return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
4764 }
4765
4766 /*
4767  * This is used by the tree logging recovery code.  It records that
4768  * an extent has been allocated and makes sure to clear the free
4769  * space cache bits as well.
4770  */
4771 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4772                                    u64 root_objectid, u64 owner, u64 offset,
4773                                    struct btrfs_key *ins)
4774 {
4775         struct btrfs_fs_info *fs_info = trans->fs_info;
4776         int ret;
4777         struct btrfs_block_group *block_group;
4778         struct btrfs_space_info *space_info;
4779
4780         /*
4781          * Mixed block groups will exclude before processing the log so we only
4782          * need to do the exclude dance if this fs isn't mixed.
4783          */
4784         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4785                 ret = __exclude_logged_extent(fs_info, ins->objectid,
4786                                               ins->offset);
4787                 if (ret)
4788                         return ret;
4789         }
4790
4791         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4792         if (!block_group)
4793                 return -EINVAL;
4794
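             /*
              * Account the space as reserved, as if it had just come from the
              * allocator, before inserting the extent item for the logged
              * extent.
              */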
4795         space_info = block_group->space_info;
4796         spin_lock(&space_info->lock);
4797         spin_lock(&block_group->lock);
4798         space_info->bytes_reserved += ins->offset;
4799         block_group->reserved += ins->offset;
4800         spin_unlock(&block_group->lock);
4801         spin_unlock(&space_info->lock);
4802
4803         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4804                                          offset, ins, 1);
4805         if (ret)
4806                 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
4807         btrfs_put_block_group(block_group);
4808         return ret;
4809 }
4810
4811 static struct extent_buffer *
4812 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4813                       u64 bytenr, int level, u64 owner,
4814                       enum btrfs_lock_nesting nest)
4815 {
4816         struct btrfs_fs_info *fs_info = root->fs_info;
4817         struct extent_buffer *buf;
4818         u64 lockdep_owner = owner;
4819
4820         buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
4821         if (IS_ERR(buf))
4822                 return buf;
4823
4824         /*
4825          * Extra safety check in case the extent tree is corrupted and the
4826          * extent allocator chooses to use a tree block which is already used
4827          * and locked.
4828          */
4829         if (buf->lock_owner == current->pid) {
4830                 btrfs_err_rl(fs_info,
4831 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4832                         buf->start, btrfs_header_owner(buf), current->pid);
4833                 free_extent_buffer(buf);
4834                 return ERR_PTR(-EUCLEAN);
4835         }
4836
4837         /*
4838          * The reloc trees are just snapshots, so we need them to appear to be
4839          * just like any other fs tree WRT lockdep.
4840          *
4841          * The exception however is in replace_path() in relocation, where we
4842          * hold the lock on the original fs root and then search for the reloc
4843          * root.  At that point we need to make sure any reloc root buffers are
4844          * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make
4845          * lockdep happy.
4846          */
4847         if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID &&
4848             !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
4849                 lockdep_owner = BTRFS_FS_TREE_OBJECTID;
4850
4851         /* btrfs_clear_buffer_dirty() accesses generation field. */
4852         btrfs_set_header_generation(buf, trans->transid);
4853
4854         /*
4855          * This needs to stay, because we could allocate a freed block from an
4856          * old tree into a new tree, so we need to make sure this new block is
4857          * set to the appropriate level and owner.
4858          */
4859         btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level);
4860
4861         __btrfs_tree_lock(buf, nest);
4862         btrfs_clear_buffer_dirty(trans, buf);
4863         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4864         clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
4865
4866         set_extent_buffer_uptodate(buf);
4867
4868         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4869         btrfs_set_header_level(buf, level);
4870         btrfs_set_header_bytenr(buf, buf->start);
4871         btrfs_set_header_generation(buf, trans->transid);
4872         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4873         btrfs_set_header_owner(buf, owner);
4874         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4875         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4876         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4877                 buf->log_index = root->log_transid % 2;
4878                 /*
4879                  * We allow two log transactions at a time, so use different
4880                  * EXTENT bits to differentiate their dirty pages.
4881                  */
4882                 if (buf->log_index == 0)
4883                         set_extent_bit(&root->dirty_log_pages, buf->start,
4884                                        buf->start + buf->len - 1,
4885                                        EXTENT_DIRTY, NULL);
4886                 else
4887                         set_extent_bit(&root->dirty_log_pages, buf->start,
4888                                        buf->start + buf->len - 1,
4889                                        EXTENT_NEW, NULL);
4890         } else {
4891                 buf->log_index = -1;
4892                 set_extent_bit(&trans->transaction->dirty_pages, buf->start,
4893                                buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
4894         }
4895         /* this returns a buffer locked for blocking */
4896         return buf;
4897 }
4898
4899 /*
4900  * Finds a free extent and does all the dirty work required for allocation.
4901  * Returns the tree buffer or an ERR_PTR on error.
4902  */
4903 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4904                                              struct btrfs_root *root,
4905                                              u64 parent, u64 root_objectid,
4906                                              const struct btrfs_disk_key *key,
4907                                              int level, u64 hint,
4908                                              u64 empty_size,
4909                                              enum btrfs_lock_nesting nest)
4910 {
4911         struct btrfs_fs_info *fs_info = root->fs_info;
4912         struct btrfs_key ins;
4913         struct btrfs_block_rsv *block_rsv;
4914         struct extent_buffer *buf;
4915         struct btrfs_delayed_extent_op *extent_op;
4916         struct btrfs_ref generic_ref = { 0 };
4917         u64 flags = 0;
4918         int ret;
4919         u32 blocksize = fs_info->nodesize;
4920         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4921
4922 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4923         if (btrfs_is_testing(fs_info)) {
4924                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4925                                             level, root_objectid, nest);
4926                 if (!IS_ERR(buf))
4927                         root->alloc_bytenr += blocksize;
4928                 return buf;
4929         }
4930 #endif
4931
4932         block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4933         if (IS_ERR(block_rsv))
4934                 return ERR_CAST(block_rsv);
4935
4936         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4937                                    empty_size, hint, &ins, 0, 0);
4938         if (ret)
4939                 goto out_unuse;
4940
4941         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4942                                     root_objectid, nest);
4943         if (IS_ERR(buf)) {
4944                 ret = PTR_ERR(buf);
4945                 goto out_free_reserved;
4946         }
4947
4948         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4949                 if (parent == 0)
4950                         parent = ins.objectid;
4951                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4952         } else
4953                 BUG_ON(parent > 0);
4954
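             /*
              * Tree log blocks do not get a delayed ref here; for all other
              * trees queue one so the extent item is added when delayed refs
              * are run.
              */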
4955         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4956                 extent_op = btrfs_alloc_delayed_extent_op();
4957                 if (!extent_op) {
4958                         ret = -ENOMEM;
4959                         goto out_free_buf;
4960                 }
4961                 if (key)
4962                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4963                 else
4964                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4965                 extent_op->flags_to_set = flags;
4966                 extent_op->update_key = skinny_metadata ? false : true;
4967                 extent_op->update_flags = true;
4968                 extent_op->level = level;
4969
4970                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4971                                        ins.objectid, ins.offset, parent);
4972                 btrfs_init_tree_ref(&generic_ref, level, root_objectid,
4973                                     root->root_key.objectid, false);
4974                 btrfs_ref_tree_mod(fs_info, &generic_ref);
4975                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
4976                 if (ret)
4977                         goto out_free_delayed;
4978         }
4979         return buf;
4980
4981 out_free_delayed:
4982         btrfs_free_delayed_extent_op(extent_op);
4983 out_free_buf:
4984         btrfs_tree_unlock(buf);
4985         free_extent_buffer(buf);
4986 out_free_reserved:
4987         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4988 out_unuse:
4989         btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4990         return ERR_PTR(ret);
4991 }
4992
4993 struct walk_control {
4994         u64 refs[BTRFS_MAX_LEVEL];
4995         u64 flags[BTRFS_MAX_LEVEL];
4996         struct btrfs_key update_progress;
4997         struct btrfs_key drop_progress;
4998         int drop_level;
4999         int stage;
5000         int level;
5001         int shared_level;
5002         int update_ref;
5003         int keep_locks;
5004         int reada_slot;
5005         int reada_count;
5006         int restarted;
5007 };
5008
5009 #define DROP_REFERENCE  1
5010 #define UPDATE_BACKREF  2
5011
5012 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5013                                      struct btrfs_root *root,
5014                                      struct walk_control *wc,
5015                                      struct btrfs_path *path)
5016 {
5017         struct btrfs_fs_info *fs_info = root->fs_info;
5018         u64 bytenr;
5019         u64 generation;
5020         u64 refs;
5021         u64 flags;
5022         u32 nritems;
5023         struct btrfs_key key;
5024         struct extent_buffer *eb;
5025         int ret;
5026         int slot;
5027         int nread = 0;
5028
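             /*
              * Adapt the readahead batch size: shrink it while we are still
              * processing slots that were already read ahead, grow it (capped
              * at a full node) once we have moved past them.
              */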
5029         if (path->slots[wc->level] < wc->reada_slot) {
5030                 wc->reada_count = wc->reada_count * 2 / 3;
5031                 wc->reada_count = max(wc->reada_count, 2);
5032         } else {
5033                 wc->reada_count = wc->reada_count * 3 / 2;
5034                 wc->reada_count = min_t(int, wc->reada_count,
5035                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
5036         }
5037
5038         eb = path->nodes[wc->level];
5039         nritems = btrfs_header_nritems(eb);
5040
5041         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5042                 if (nread >= wc->reada_count)
5043                         break;
5044
5045                 cond_resched();
5046                 bytenr = btrfs_node_blockptr(eb, slot);
5047                 generation = btrfs_node_ptr_generation(eb, slot);
5048
5049                 if (slot == path->slots[wc->level])
5050                         goto reada;
5051
5052                 if (wc->stage == UPDATE_BACKREF &&
5053                     generation <= root->root_key.offset)
5054                         continue;
5055
5056                 /* We don't lock the tree block, it's OK to be racy here */
5057                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
5058                                                wc->level - 1, 1, &refs,
5059                                                &flags);
5060                 /* We don't care about errors in readahead. */
5061                 if (ret < 0)
5062                         continue;
5063                 BUG_ON(refs == 0);
5064
5065                 if (wc->stage == DROP_REFERENCE) {
5066                         if (refs == 1)
5067                                 goto reada;
5068
5069                         if (wc->level == 1 &&
5070                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5071                                 continue;
5072                         if (!wc->update_ref ||
5073                             generation <= root->root_key.offset)
5074                                 continue;
5075                         btrfs_node_key_to_cpu(eb, &key, slot);
5076                         ret = btrfs_comp_cpu_keys(&key,
5077                                                   &wc->update_progress);
5078                         if (ret < 0)
5079                                 continue;
5080                 } else {
5081                         if (wc->level == 1 &&
5082                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5083                                 continue;
5084                 }
5085 reada:
5086                 btrfs_readahead_node_child(eb, slot);
5087                 nread++;
5088         }
5089         wc->reada_slot = slot;
5090 }
5091
5092 /*
5093  * helper to process tree block while walking down the tree.
5094  *
5095  * when wc->stage == UPDATE_BACKREF, this function updates
5096  * back refs for pointers in the block.
5097  *
5098  * NOTE: return value 1 means we should stop walking down.
5099  */
5100 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5101                                    struct btrfs_root *root,
5102                                    struct btrfs_path *path,
5103                                    struct walk_control *wc, int lookup_info)
5104 {
5105         struct btrfs_fs_info *fs_info = root->fs_info;
5106         int level = wc->level;
5107         struct extent_buffer *eb = path->nodes[level];
5108         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5109         int ret;
5110
5111         if (wc->stage == UPDATE_BACKREF &&
5112             btrfs_header_owner(eb) != root->root_key.objectid)
5113                 return 1;
5114
5115         /*
5116          * When the reference count of a tree block is 1, it won't increase
5117          * again. Once the full backref flag is set, we never clear it.
5118          */
5119         if (lookup_info &&
5120             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5121              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5122                 BUG_ON(!path->locks[level]);
5123                 ret = btrfs_lookup_extent_info(trans, fs_info,
5124                                                eb->start, level, 1,
5125                                                &wc->refs[level],
5126                                                &wc->flags[level]);
5127                 BUG_ON(ret == -ENOMEM);
5128                 if (ret)
5129                         return ret;
5130                 BUG_ON(wc->refs[level] == 0);
5131         }
5132
5133         if (wc->stage == DROP_REFERENCE) {
5134                 if (wc->refs[level] > 1)
5135                         return 1;
5136
5137                 if (path->locks[level] && !wc->keep_locks) {
5138                         btrfs_tree_unlock_rw(eb, path->locks[level]);
5139                         path->locks[level] = 0;
5140                 }
5141                 return 0;
5142         }
5143
5144         /* wc->stage == UPDATE_BACKREF */
5145         if (!(wc->flags[level] & flag)) {
5146                 BUG_ON(!path->locks[level]);
5147                 ret = btrfs_inc_ref(trans, root, eb, 1);
5148                 BUG_ON(ret); /* -ENOMEM */
5149                 ret = btrfs_dec_ref(trans, root, eb, 0);
5150                 BUG_ON(ret); /* -ENOMEM */
5151                 ret = btrfs_set_disk_extent_flags(trans, eb, flag);
5152                 BUG_ON(ret); /* -ENOMEM */
5153                 wc->flags[level] |= flag;
5154         }
5155
5156         /*
5157          * the block is shared by multiple trees, so it's not good to
5158          * keep the tree lock
5159          */
5160         if (path->locks[level] && level > 0) {
5161                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5162                 path->locks[level] = 0;
5163         }
5164         return 0;
5165 }
5166
5167 /*
5168  * This is used to verify a ref exists for this root to deal with a bug where we
5169  * would have a drop_progress key that hadn't been updated properly.
5170  */
5171 static int check_ref_exists(struct btrfs_trans_handle *trans,
5172                             struct btrfs_root *root, u64 bytenr, u64 parent,
5173                             int level)
5174 {
5175         struct btrfs_path *path;
5176         struct btrfs_extent_inline_ref *iref;
5177         int ret;
5178
5179         path = btrfs_alloc_path();
5180         if (!path)
5181                 return -ENOMEM;
5182
5183         ret = lookup_extent_backref(trans, path, &iref, bytenr,
5184                                     root->fs_info->nodesize, parent,
5185                                     root->root_key.objectid, level, 0);
5186         btrfs_free_path(path);
5187         if (ret == -ENOENT)
5188                 return 0;
5189         if (ret < 0)
5190                 return ret;
5191         return 1;
5192 }
5193
5194 /*
5195  * helper to process tree block pointer.
5196  *
5197  * when wc->stage == DROP_REFERENCE, this function checks
5198  * reference count of the block pointed to. if the block
5199  * is shared and we need to update back refs for the subtree
5200  * rooted at the block, this function changes wc->stage to
5201  * UPDATE_BACKREF. if the block is shared and there is no
5202  * need to update back refs, this function drops the reference
5203  * to the block.
5204  *
5205  * NOTE: return value 1 means we should stop walking down.
5206  */
5207 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5208                                  struct btrfs_root *root,
5209                                  struct btrfs_path *path,
5210                                  struct walk_control *wc, int *lookup_info)
5211 {
5212         struct btrfs_fs_info *fs_info = root->fs_info;
5213         u64 bytenr;
5214         u64 generation;
5215         u64 parent;
5216         struct btrfs_tree_parent_check check = { 0 };
5217         struct btrfs_key key;
5218         struct btrfs_ref ref = { 0 };
5219         struct extent_buffer *next;
5220         int level = wc->level;
5221         int reada = 0;
5222         int ret = 0;
5223         bool need_account = false;
5224
5225         generation = btrfs_node_ptr_generation(path->nodes[level],
5226                                                path->slots[level]);
5227         /*
5228          * if the lower level block was created before the snapshot
5229          * was created, we know there is no need to update back refs
5230          * for the subtree
5231          */
5232         if (wc->stage == UPDATE_BACKREF &&
5233             generation <= root->root_key.offset) {
5234                 *lookup_info = 1;
5235                 return 1;
5236         }
5237
5238         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5239
5240         check.level = level - 1;
5241         check.transid = generation;
5242         check.owner_root = root->root_key.objectid;
5243         check.has_first_key = true;
5244         btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
5245                               path->slots[level]);
5246
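             /*
              * Use the cached extent buffer if there is one; otherwise create
              * a placeholder so the block can be locked now and only read
              * from disk further down if we really need its contents.
              */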
5247         next = find_extent_buffer(fs_info, bytenr);
5248         if (!next) {
5249                 next = btrfs_find_create_tree_block(fs_info, bytenr,
5250                                 root->root_key.objectid, level - 1);
5251                 if (IS_ERR(next))
5252                         return PTR_ERR(next);
5253                 reada = 1;
5254         }
5255         btrfs_tree_lock(next);
5256
5257         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
5258                                        &wc->refs[level - 1],
5259                                        &wc->flags[level - 1]);
5260         if (ret < 0)
5261                 goto out_unlock;
5262
5263         if (unlikely(wc->refs[level - 1] == 0)) {
5264                 btrfs_err(fs_info, "Missing references.");
5265                 ret = -EIO;
5266                 goto out_unlock;
5267         }
5268         *lookup_info = 0;
5269
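             /*
              * A lower block with a ref count above one is shared.  In the
              * DROP_REFERENCE stage we normally do not walk into it; we either
              * skip it outright or, when its back refs still need converting,
              * switch to UPDATE_BACKREF and walk the subtree once.
              */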
5270         if (wc->stage == DROP_REFERENCE) {
5271                 if (wc->refs[level - 1] > 1) {
5272                         need_account = true;
5273                         if (level == 1 &&
5274                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5275                                 goto skip;
5276
5277                         if (!wc->update_ref ||
5278                             generation <= root->root_key.offset)
5279                                 goto skip;
5280
5281                         btrfs_node_key_to_cpu(path->nodes[level], &key,
5282                                               path->slots[level]);
5283                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5284                         if (ret < 0)
5285                                 goto skip;
5286
5287                         wc->stage = UPDATE_BACKREF;
5288                         wc->shared_level = level - 1;
5289                 }
5290         } else {
5291                 if (level == 1 &&
5292                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5293                         goto skip;
5294         }
5295
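             /*
              * The cached buffer may be stale.  If its generation does not
              * match, drop it, note that the extent info has to be looked up
              * again, and fall through to a fresh read below.
              */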
5296         if (!btrfs_buffer_uptodate(next, generation, 0)) {
5297                 btrfs_tree_unlock(next);
5298                 free_extent_buffer(next);
5299                 next = NULL;
5300                 *lookup_info = 1;
5301         }
5302
5303         if (!next) {
5304                 if (reada && level == 1)
5305                         reada_walk_down(trans, root, wc, path);
5306                 next = read_tree_block(fs_info, bytenr, &check);
5307                 if (IS_ERR(next)) {
5308                         return PTR_ERR(next);
5309                 } else if (!extent_buffer_uptodate(next)) {
5310                         free_extent_buffer(next);
5311                         return -EIO;
5312                 }
5313                 btrfs_tree_lock(next);
5314         }
5315
5316         level--;
5317         ASSERT(level == btrfs_header_level(next));
5318         if (level != btrfs_header_level(next)) {
5319                 btrfs_err(root->fs_info, "mismatched level");
5320                 ret = -EIO;
5321                 goto out_unlock;
5322         }
5323         path->nodes[level] = next;
5324         path->slots[level] = 0;
5325         path->locks[level] = BTRFS_WRITE_LOCK;
5326         wc->level = level;
5327         if (wc->level == 1)
5328                 wc->reada_slot = 0;
5329         return 0;
5330 skip:
5331         wc->refs[level - 1] = 0;
5332         wc->flags[level - 1] = 0;
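             /*
              * We are not walking into this subtree.  For DROP_REFERENCE we
              * still account it for qgroups when it is shared and this is not
              * a reloc tree, record our progress, and drop the single
              * reference this root holds on it.
              */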
5333         if (wc->stage == DROP_REFERENCE) {
5334                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5335                         parent = path->nodes[level]->start;
5336                 } else {
5337                         ASSERT(root->root_key.objectid ==
5338                                btrfs_header_owner(path->nodes[level]));
5339                         if (root->root_key.objectid !=
5340                             btrfs_header_owner(path->nodes[level])) {
5341                                 btrfs_err(root->fs_info,
5342                                                 "mismatched block owner");
5343                                 ret = -EIO;
5344                                 goto out_unlock;
5345                         }
5346                         parent = 0;
5347                 }
5348
5349                 /*
5350                  * If we had a drop_progress we need to verify the refs are set
5351                  * as expected.  If we find our ref then we know that from here
5352                  * on out everything should be correct, and we can clear the
5353                  * ->restarted flag.
5354                  */
5355                 if (wc->restarted) {
5356                         ret = check_ref_exists(trans, root, bytenr, parent,
5357                                                level - 1);
5358                         if (ret < 0)
5359                                 goto out_unlock;
5360                         if (ret == 0)
5361                                 goto no_delete;
5362                         ret = 0;
5363                         wc->restarted = 0;
5364                 }
5365
5366                 /*
5367                  * Reloc tree doesn't contribute to qgroup numbers, and we have
5368                  * already accounted them at merge time (replace_path),
5369                  * thus we could skip expensive subtree trace here.
5370                  */
5371                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
5372                     need_account) {
5373                         ret = btrfs_qgroup_trace_subtree(trans, next,
5374                                                          generation, level - 1);
5375                         if (ret) {
5376                                 btrfs_err_rl(fs_info,
5377                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5378                                              ret);
5379                         }
5380                 }
5381
5382                 /*
5383                  * We need to update the next key in our walk control so we can
5384                  * update the drop_progress key accordingly.  We don't care if
5385                  * find_next_key doesn't find a key because that means we're at
5386                  * the end and are going to clean up now.
5387                  */
5388                 wc->drop_level = level;
5389                 find_next_key(path, level, &wc->drop_progress);
5390
5391                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5392                                        fs_info->nodesize, parent);
5393                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
5394                                     0, false);
5395                 ret = btrfs_free_extent(trans, &ref);
5396                 if (ret)
5397                         goto out_unlock;
5398         }
5399 no_delete:
5400         *lookup_info = 1;
5401         ret = 1;
5402
5403 out_unlock:
5404         btrfs_tree_unlock(next);
5405         free_extent_buffer(next);
5406
5407         return ret;
5408 }
5409
5410 /*
5411  * helper to process tree block while walking up the tree.
5412  *
5413  * when wc->stage == DROP_REFERENCE, this function drops the
5414  * reference count on the block.
5415  *
5416  * when wc->stage == UPDATE_BACKREF, this function changes
5417  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5418  * to UPDATE_BACKREF previously while processing the block.
5419  *
5420  * NOTE: return value 1 means we should stop walking up.
5421  */
5422 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5423                                  struct btrfs_root *root,
5424                                  struct btrfs_path *path,
5425                                  struct walk_control *wc)
5426 {
5427         struct btrfs_fs_info *fs_info = root->fs_info;
5428         int ret;
5429         int level = wc->level;
5430         struct extent_buffer *eb = path->nodes[level];
5431         u64 parent = 0;
5432
5433         if (wc->stage == UPDATE_BACKREF) {
5434                 BUG_ON(wc->shared_level < level);
5435                 if (level < wc->shared_level)
5436                         goto out;
5437
5438                 ret = find_next_key(path, level + 1, &wc->update_progress);
5439                 if (ret > 0)
5440                         wc->update_ref = 0;
5441
5442                 wc->stage = DROP_REFERENCE;
5443                 wc->shared_level = -1;
5444                 path->slots[level] = 0;
5445
5446                 /*
5447                  * check reference count again if the block isn't locked.
5448                  * we should start walking down the tree again if reference
5449                  * count is one.
5450                  */
5451                 if (!path->locks[level]) {
5452                         BUG_ON(level == 0);
5453                         btrfs_tree_lock(eb);
5454                         path->locks[level] = BTRFS_WRITE_LOCK;
5455
5456                         ret = btrfs_lookup_extent_info(trans, fs_info,
5457                                                        eb->start, level, 1,
5458                                                        &wc->refs[level],
5459                                                        &wc->flags[level]);
5460                         if (ret < 0) {
5461                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5462                                 path->locks[level] = 0;
5463                                 return ret;
5464                         }
5465                         BUG_ON(wc->refs[level] == 0);
5466                         if (wc->refs[level] == 1) {
5467                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5468                                 path->locks[level] = 0;
5469                                 return 1;
5470                         }
5471                 }
5472         }
5473
5474         /* wc->stage == DROP_REFERENCE */
5475         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5476
5477         if (wc->refs[level] == 1) {
5478                 if (level == 0) {
5479                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5480                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5481                         else
5482                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5483                         BUG_ON(ret); /* -ENOMEM */
5484                         if (is_fstree(root->root_key.objectid)) {
5485                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5486                                 if (ret) {
5487                                         btrfs_err_rl(fs_info,
5488         "error %d accounting leaf items, quota is out of sync, rescan required",
5489                                              ret);
5490                                 }
5491                         }
5492                 }
5493                 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */
5494                 if (!path->locks[level]) {
5495                         btrfs_tree_lock(eb);
5496                         path->locks[level] = BTRFS_WRITE_LOCK;
5497                 }
5498                 btrfs_clear_buffer_dirty(trans, eb);
5499         }
5500
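             /*
              * Work out the parent to pass to the free: blocks carrying the
              * FULL_BACKREF flag are referenced by their parent's bytenr,
              * everything else must be owned by this root.
              */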
5501         if (eb == root->node) {
5502                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5503                         parent = eb->start;
5504                 else if (root->root_key.objectid != btrfs_header_owner(eb))
5505                         goto owner_mismatch;
5506         } else {
5507                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5508                         parent = path->nodes[level + 1]->start;
5509                 else if (root->root_key.objectid !=
5510                          btrfs_header_owner(path->nodes[level + 1]))
5511                         goto owner_mismatch;
5512         }
5513
5514         btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
5515                               wc->refs[level] == 1);
5516 out:
5517         wc->refs[level] = 0;
5518         wc->flags[level] = 0;
5519         return 0;
5520
5521 owner_mismatch:
5522         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5523                      btrfs_header_owner(eb), root->root_key.objectid);
5524         return -EUCLEAN;
5525 }
5526
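     /*
      * Walk down the tree from wc->level, processing each block with
      * walk_down_proc() and descending through do_walk_down() until we reach a
      * leaf, run out of slots at the current level, or one of the helpers
      * tells us to stop.
      */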
5527 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5528                                    struct btrfs_root *root,
5529                                    struct btrfs_path *path,
5530                                    struct walk_control *wc)
5531 {
5532         int level = wc->level;
5533         int lookup_info = 1;
5534         int ret = 0;
5535
5536         while (level >= 0) {
5537                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5538                 if (ret)
5539                         break;
5540
5541                 if (level == 0)
5542                         break;
5543
5544                 if (path->slots[level] >=
5545                     btrfs_header_nritems(path->nodes[level]))
5546                         break;
5547
5548                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5549                 if (ret > 0) {
5550                         path->slots[level]++;
5551                         continue;
5552                 } else if (ret < 0)
5553                         break;
5554                 level = wc->level;
5555         }
5556         return (ret == 1) ? 0 : ret;
5557 }
5558
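     /*
      * Walk back up the tree, dropping or releasing the block at each level.
      * Returns 0 when there is another slot to walk back down into, a negative
      * error, or 1 once every level below max_level has been processed.
      */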
5559 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5560                                  struct btrfs_root *root,
5561                                  struct btrfs_path *path,
5562                                  struct walk_control *wc, int max_level)
5563 {
5564         int level = wc->level;
5565         int ret;
5566
5567         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5568         while (level < max_level && path->nodes[level]) {
5569                 wc->level = level;
5570                 if (path->slots[level] + 1 <
5571                     btrfs_header_nritems(path->nodes[level])) {
5572                         path->slots[level]++;
5573                         return 0;
5574                 } else {
5575                         ret = walk_up_proc(trans, root, path, wc);
5576                         if (ret > 0)
5577                                 return 0;
5578                         if (ret < 0)
5579                                 return ret;
5580
5581                         if (path->locks[level]) {
5582                                 btrfs_tree_unlock_rw(path->nodes[level],
5583                                                      path->locks[level]);
5584                                 path->locks[level] = 0;
5585                         }
5586                         free_extent_buffer(path->nodes[level]);
5587                         path->nodes[level] = NULL;
5588                         level++;
5589                 }
5590         }
5591         return 1;
5592 }
5593
5594 /*
5595  * drop a subvolume tree.
5596  *
5597  * this function traverses the tree freeing any blocks that are only
5598  * referenced by the tree.
5599  *
5600  * when a shared tree block is found, this function decreases its
5601  * reference count by one. if update_ref is true, this function
5602  * also makes sure backrefs for the shared block and all lower level
5603  * blocks are properly updated.
5604  *
5605  * If called with for_reloc == 0, may exit early with -EAGAIN
5606  */
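     /*
      * Illustrative only (the real call sites live outside this file): a
      * background cleaner deleting a dead subvolume would do something like
      *
      *         ret = btrfs_drop_snapshot(root, 0, 0);
      *
      * i.e. update_ref == 0 for a plain deletion and for_reloc == 0 so the
      * drop may bail out with -EAGAIN when the cleaner needs to sleep; in that
      * case the root is put back on the dead roots list below and the drop is
      * retried later.
      */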
5607 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
5608 {
5609         const bool is_reloc_root = (root->root_key.objectid ==
5610                                     BTRFS_TREE_RELOC_OBJECTID);
5611         struct btrfs_fs_info *fs_info = root->fs_info;
5612         struct btrfs_path *path;
5613         struct btrfs_trans_handle *trans;
5614         struct btrfs_root *tree_root = fs_info->tree_root;
5615         struct btrfs_root_item *root_item = &root->root_item;
5616         struct walk_control *wc;
5617         struct btrfs_key key;
5618         int err = 0;
5619         int ret;
5620         int level;
5621         bool root_dropped = false;
5622         bool unfinished_drop = false;
5623
5624         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5625
5626         path = btrfs_alloc_path();
5627         if (!path) {
5628                 err = -ENOMEM;
5629                 goto out;
5630         }
5631
5632         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5633         if (!wc) {
5634                 btrfs_free_path(path);
5635                 err = -ENOMEM;
5636                 goto out;
5637         }
5638
5639         /*
5640          * Use join to avoid potential EINTR from transaction start. See
5641          * wait_reserve_ticket and the whole reservation callchain.
5642          */
5643         if (for_reloc)
5644                 trans = btrfs_join_transaction(tree_root);
5645         else
5646                 trans = btrfs_start_transaction(tree_root, 0);
5647         if (IS_ERR(trans)) {
5648                 err = PTR_ERR(trans);
5649                 goto out_free;
5650         }
5651
5652         err = btrfs_run_delayed_items(trans);
5653         if (err)
5654                 goto out_end_trans;
5655
5656         /*
5657          * This will help us catch people modifying the fs tree while we're
5658          * dropping it.  It is unsafe to mess with the fs tree while it's being
5659          * dropped as we unlock the root node and parent nodes as we walk down
5660          * the tree, assuming nothing will change.  If something does change
5661          * then we'll have stale information and drop references to blocks we've
5662          * already dropped.
5663          */
5664         set_bit(BTRFS_ROOT_DELETING, &root->state);
5665         unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
5666
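             /*
              * A zero drop_progress key means this is a fresh drop starting at
              * the root node.  Otherwise resume at the recorded key and level
              * and re-read the ref counts of the levels above the resume point.
              */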
5667         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5668                 level = btrfs_header_level(root->node);
5669                 path->nodes[level] = btrfs_lock_root_node(root);
5670                 path->slots[level] = 0;
5671                 path->locks[level] = BTRFS_WRITE_LOCK;
5672                 memset(&wc->update_progress, 0,
5673                        sizeof(wc->update_progress));
5674         } else {
5675                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5676                 memcpy(&wc->update_progress, &key,
5677                        sizeof(wc->update_progress));
5678
5679                 level = btrfs_root_drop_level(root_item);
5680                 BUG_ON(level == 0);
5681                 path->lowest_level = level;
5682                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5683                 path->lowest_level = 0;
5684                 if (ret < 0) {
5685                         err = ret;
5686                         goto out_end_trans;
5687                 }
5688                 WARN_ON(ret > 0);
5689
5690                 /*
5691                  * unlock our path, this is safe because only this
5692                  * function is allowed to delete this snapshot
5693                  */
5694                 btrfs_unlock_up_safe(path, 0);
5695
5696                 level = btrfs_header_level(root->node);
5697                 while (1) {
5698                         btrfs_tree_lock(path->nodes[level]);
5699                         path->locks[level] = BTRFS_WRITE_LOCK;
5700
5701                         ret = btrfs_lookup_extent_info(trans, fs_info,
5702                                                 path->nodes[level]->start,
5703                                                 level, 1, &wc->refs[level],
5704                                                 &wc->flags[level]);
5705                         if (ret < 0) {
5706                                 err = ret;
5707                                 goto out_end_trans;
5708                         }
5709                         BUG_ON(wc->refs[level] == 0);
5710
5711                         if (level == btrfs_root_drop_level(root_item))
5712                                 break;
5713
5714                         btrfs_tree_unlock(path->nodes[level]);
5715                         path->locks[level] = 0;
5716                         WARN_ON(wc->refs[level] != 1);
5717                         level--;
5718                 }
5719         }
5720
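             /*
              * Seed the walk control: start in DROP_REFERENCE stage, flag the
              * walk as restarted if a previous drop of this root was
              * interrupted (so check_ref_exists() can validate refs), and size
              * readahead to a full node's worth of pointers.
              */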
5721         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5722         wc->level = level;
5723         wc->shared_level = -1;
5724         wc->stage = DROP_REFERENCE;
5725         wc->update_ref = update_ref;
5726         wc->keep_locks = 0;
5727         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5728
5729         while (1) {
5731                 ret = walk_down_tree(trans, root, path, wc);
5732                 if (ret < 0) {
5733                         btrfs_abort_transaction(trans, ret);
5734                         err = ret;
5735                         break;
5736                 }
5737
5738                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5739                 if (ret < 0) {
5740                         btrfs_abort_transaction(trans, ret);
5741                         err = ret;
5742                         break;
5743                 }
5744
5745                 if (ret > 0) {
5746                         BUG_ON(wc->stage != DROP_REFERENCE);
5747                         break;
5748                 }
5749
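                     /*
                      * Record how far we got in drop_progress/drop_level so
                      * the drop can be resumed from this point if the
                      * transaction is ended below or the mount goes away.
                      */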
5750                 if (wc->stage == DROP_REFERENCE) {
5751                         wc->drop_level = wc->level;
5752                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5753                                               &wc->drop_progress,
5754                                               path->slots[wc->drop_level]);
5755                 }
5756                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
5757                                       &wc->drop_progress);
5758                 btrfs_set_root_drop_level(root_item, wc->drop_level);
5759
5760                 BUG_ON(wc->level == 0);
5761                 if (btrfs_should_end_transaction(trans) ||
5762                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5763                         ret = btrfs_update_root(trans, tree_root,
5764                                                 &root->root_key,
5765                                                 root_item);
5766                         if (ret) {
5767                                 btrfs_abort_transaction(trans, ret);
5768                                 err = ret;
5769                                 goto out_end_trans;
5770                         }
5771
5772                         if (!is_reloc_root)
5773                                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5774
5775                         btrfs_end_transaction_throttle(trans);
5776                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5777                                 btrfs_debug(fs_info,
5778                                             "drop snapshot early exit");
5779                                 err = -EAGAIN;
5780                                 goto out_free;
5781                         }
5782
5783                         /*
5784                          * Use join to avoid potential EINTR from transaction
5785                          * start. See wait_reserve_ticket and the whole
5786                          * reservation callchain.
5787                          */
5788                         if (for_reloc)
5789                                 trans = btrfs_join_transaction(tree_root);
5790                         else
5791                                 trans = btrfs_start_transaction(tree_root, 0);
5792                         if (IS_ERR(trans)) {
5793                                 err = PTR_ERR(trans);
5794                                 goto out_free;
5795                         }
5796                 }
5797         }
5798         btrfs_release_path(path);
5799         if (err)
5800                 goto out_end_trans;
5801
5802         ret = btrfs_del_root(trans, &root->root_key);
5803         if (ret) {
5804                 btrfs_abort_transaction(trans, ret);
5805                 err = ret;
5806                 goto out_end_trans;
5807         }
5808
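             /*
              * The root item has been deleted; if it is really gone (not found
              * by btrfs_find_root()), also remove the orphan item that marked
              * this subvolume as half-deleted.  Reloc trees are not tracked by
              * orphan items.
              */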
5809         if (!is_reloc_root) {
5810                 ret = btrfs_find_root(tree_root, &root->root_key, path,
5811                                       NULL, NULL);
5812                 if (ret < 0) {
5813                         btrfs_abort_transaction(trans, ret);
5814                         err = ret;
5815                         goto out_end_trans;
5816                 } else if (ret > 0) {
5817                         /* if we fail to delete the orphan item this time
5818                          * around, it'll get picked up the next time.
5819                          *
5820                          * The most common failure here is just -ENOENT.
5821                          */
5822                         btrfs_del_orphan_item(trans, tree_root,
5823                                               root->root_key.objectid);
5824                 }
5825         }
5826
5827         /*
5828          * This subvolume is going to be completely dropped, and won't be
5829          * recorded as a dirty root, thus its pertrans meta rsv will not be
5830          * freed at transaction commit time.  So free it here manually.
5831          */
5832         btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5833         btrfs_qgroup_free_meta_all_pertrans(root);
5834
5835         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5836                 btrfs_add_dropped_root(trans, root);
5837         else
5838                 btrfs_put_root(root);
5839         root_dropped = true;
5840 out_end_trans:
5841         if (!is_reloc_root)
5842                 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5843
5844         btrfs_end_transaction_throttle(trans);
5845 out_free:
5846         kfree(wc);
5847         btrfs_free_path(path);
5848 out:
5849         /*
5850          * If we were an unfinished drop root, check whether any others are
5851          * still pending, and if not, clear the state and wake up any waiters.
5852          */
5853         if (!err && unfinished_drop)
5854                 btrfs_maybe_wake_unfinished_drop(fs_info);
5855
5856         /*
5857          * So if we need to stop dropping the snapshot for whatever reason we
5858          * need to make sure to add it back to the dead root list so that we
5859          * keep trying to do the work later.  This also cleans up the root if
5860          * we don't have it in the radix tree (like when we recover after a
5861          * power failure or unmount) so we don't leak memory.
5862          */
5863         if (!for_reloc && !root_dropped)
5864                 btrfs_add_dead_root(root);
5865         return err;
5866 }
5867
5868 /*
5869  * drop subtree rooted at tree block 'node'.
5870  *
5871  * NOTE: this function will unlock and release tree block 'node'.
5872  * Only used by the relocation code.
5873  */
5874 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5875                         struct btrfs_root *root,
5876                         struct extent_buffer *node,
5877                         struct extent_buffer *parent)
5878 {
5879         struct btrfs_fs_info *fs_info = root->fs_info;
5880         struct btrfs_path *path;
5881         struct walk_control *wc;
5882         int level;
5883         int parent_level;
5884         int ret = 0;
5885         int wret;
5886
5887         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5888
5889         path = btrfs_alloc_path();
5890         if (!path)
5891                 return -ENOMEM;
5892
5893         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5894         if (!wc) {
5895                 btrfs_free_path(path);
5896                 return -ENOMEM;
5897         }
5898
5899         btrfs_assert_tree_write_locked(parent);
5900         parent_level = btrfs_header_level(parent);
5901         atomic_inc(&parent->refs);
5902         path->nodes[parent_level] = parent;
5903         path->slots[parent_level] = btrfs_header_nritems(parent);
5904
5905         btrfs_assert_tree_write_locked(node);
5906         level = btrfs_header_level(node);
5907         path->nodes[level] = node;
5908         path->slots[level] = 0;
5909         path->locks[level] = BTRFS_WRITE_LOCK;
5910
5911         wc->refs[parent_level] = 1;
5912         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5913         wc->level = level;
5914         wc->shared_level = -1;
5915         wc->stage = DROP_REFERENCE;
5916         wc->update_ref = 0;
5917         wc->keep_locks = 1;
5918         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5919
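             /*
              * The parent stays pinned as a FULL_BACKREF block with a single
              * ref, so walk_up_tree() stops at parent_level, and keep_locks is
              * set so the walk does not drop the path's locks, which the
              * relocation caller still needs.
              */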
5920         while (1) {
5921                 wret = walk_down_tree(trans, root, path, wc);
5922                 if (wret < 0) {
5923                         ret = wret;
5924                         break;
5925                 }
5926
5927                 wret = walk_up_tree(trans, root, path, wc, parent_level);
5928                 if (wret < 0)
5929                         ret = wret;
5930                 if (wret != 0)
5931                         break;
5932         }
5933
5934         kfree(wc);
5935         btrfs_free_path(path);
5936         return ret;
5937 }
5938
5939 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5940                                    u64 start, u64 end)
5941 {
5942         return unpin_extent_range(fs_info, start, end, false);
5943 }
5944
5945 /*
5946  * It used to be that old block groups would be left around forever.
5947  * Iterating over them would be enough to trim unused space.  Since we
5948  * now automatically remove them, we also need to iterate over unallocated
5949  * space.
5950  *
5951  * We don't want a transaction for this since the discard may take a
5952  * substantial amount of time.  We don't require that a transaction be
5953  * running, but we do need to take a running transaction into account
5954  * to ensure that we're not discarding chunks that were released or
5955  * allocated in the current transaction.
5956  *
5957  * Holding the chunks lock will prevent other threads from allocating
5958  * or releasing chunks, but it won't prevent a running transaction
5959  * from committing and releasing the memory that the pending chunks
5960  * list head uses.  For that, we need to take a reference to the
5961  * transaction and hold the commit root sem.  We only need to hold
5962  * it while performing the free space search since we have already
5963  * held back allocations.
5964  */
5965 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5966 {
5967         u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
5968         int ret;
5969
5970         *trimmed = 0;
5971
5972         /* Discard not supported = nothing to do. */
5973         if (!bdev_max_discard_sectors(device->bdev))
5974                 return 0;
5975
5976         /* Not writable = nothing to do. */
5977         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
5978                 return 0;
5979
5980         /* No free space = nothing to do. */
5981         if (device->total_bytes <= device->bytes_used)
5982                 return 0;
5983
5984         ret = 0;
5985
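             /*
              * Each pass takes chunk_mutex, finds the next range that is
              * neither allocated nor already trimmed, discards it and marks it
              * CHUNK_TRIMMED so a later trim does not discard it again.
              */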
5986         while (1) {
5987                 struct btrfs_fs_info *fs_info = device->fs_info;
5988                 u64 bytes;
5989
5990                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5991                 if (ret)
5992                         break;
5993
5994                 find_first_clear_extent_bit(&device->alloc_state, start,
5995                                             &start, &end,
5996                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
5997
5998                 /* Check if there are any CHUNK_* bits left */
5999                 if (start > device->total_bytes) {
6000                         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6001                         btrfs_warn_in_rcu(fs_info,
6002 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
6003                                           start, end - start + 1,
6004                                           btrfs_dev_name(device),
6005                                           device->total_bytes);
6006                         mutex_unlock(&fs_info->chunk_mutex);
6007                         ret = 0;
6008                         break;
6009                 }
6010
6011                 /* Ensure we skip the reserved space on each device. */
6012                 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6013
6014                 /*
6015                  * If find_first_clear_extent_bit finds a range that spans the
6016                  * end of the device it will set end to -1; in that case it's up
6017                  * to the caller to trim the value to the size of the device.
6018                  */
6019                 end = min(end, device->total_bytes - 1);
6020
6021                 len = end - start + 1;
6022
6023                 /* We didn't find any extents */
6024                 if (!len) {
6025                         mutex_unlock(&fs_info->chunk_mutex);
6026                         ret = 0;
6027                         break;
6028                 }
6029
6030                 ret = btrfs_issue_discard(device->bdev, start, len,
6031                                           &bytes);
6032                 if (!ret)
6033                         set_extent_bit(&device->alloc_state, start,
6034                                        start + bytes - 1, CHUNK_TRIMMED, NULL);
6035                 mutex_unlock(&fs_info->chunk_mutex);
6036
6037                 if (ret)
6038                         break;
6039
6040                 start += len;
6041                 *trimmed += bytes;
6042
6043                 if (fatal_signal_pending(current)) {
6044                         ret = -ERESTARTSYS;
6045                         break;
6046                 }
6047
6048                 cond_resched();
6049         }
6050
6051         return ret;
6052 }
6053
6054 /*
6055  * Trim the whole filesystem by:
6056  * 1) trimming the free space in each block group
6057  * 2) trimming the unallocated space on each device
6058  *
6059  * This will also continue trimming even if a block group or device encounters
6060  * an error.  The return value will be the last error, or 0 if nothing bad
6061  * happens.
6062  */
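     /*
      * Illustrative only: this is reached from the FITRIM ioctl, so a
      * whole-filesystem trim corresponds to roughly
      *
      *         struct fstrim_range range = { .start = 0, .len = U64_MAX, .minlen = 0 };
      *         ret = btrfs_trim_fs(fs_info, &range);
      *
      * with range.len set on return to the number of bytes actually trimmed.
      */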
6063 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6064 {
6065         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6066         struct btrfs_block_group *cache = NULL;
6067         struct btrfs_device *device;
6068         u64 group_trimmed;
6069         u64 range_end = U64_MAX;
6070         u64 start;
6071         u64 end;
6072         u64 trimmed = 0;
6073         u64 bg_failed = 0;
6074         u64 dev_failed = 0;
6075         int bg_ret = 0;
6076         int dev_ret = 0;
6077         int ret = 0;
6078
6079         if (range->start == U64_MAX)
6080                 return -EINVAL;
6081
6082         /*
6083          * Check range overflow if range->len is set.
6084          * The default range->len is U64_MAX.
6085          */
6086         if (range->len != U64_MAX &&
6087             check_add_overflow(range->start, range->len, &range_end))
6088                 return -EINVAL;
6089
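             /*
              * First pass: trim the free space inside every block group that
              * overlaps the requested range, caching each block group first if
              * it has not been loaded yet.
              */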
6090         cache = btrfs_lookup_first_block_group(fs_info, range->start);
6091         for (; cache; cache = btrfs_next_block_group(cache)) {
6092                 if (cache->start >= range_end) {
6093                         btrfs_put_block_group(cache);
6094                         break;
6095                 }
6096
6097                 start = max(range->start, cache->start);
6098                 end = min(range_end, cache->start + cache->length);
6099
6100                 if (end - start >= range->minlen) {
6101                         if (!btrfs_block_group_done(cache)) {
6102                                 ret = btrfs_cache_block_group(cache, true);
6103                                 if (ret) {
6104                                         bg_failed++;
6105                                         bg_ret = ret;
6106                                         continue;
6107                                 }
6108                         }
6109                         ret = btrfs_trim_block_group(cache,
6110                                                      &group_trimmed,
6111                                                      start,
6112                                                      end,
6113                                                      range->minlen);
6114
6115                         trimmed += group_trimmed;
6116                         if (ret) {
6117                                 bg_failed++;
6118                                 bg_ret = ret;
6119                                 continue;
6120                         }
6121                 }
6122         }
6123
6124         if (bg_failed)
6125                 btrfs_warn(fs_info,
6126                         "failed to trim %llu block group(s), last error %d",
6127                         bg_failed, bg_ret);
6128
6129         mutex_lock(&fs_devices->device_list_mutex);
6130         list_for_each_entry(device, &fs_devices->devices, dev_list) {
6131                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
6132                         continue;
6133
6134                 ret = btrfs_trim_free_extents(device, &group_trimmed);
6135                 if (ret) {
6136                         dev_failed++;
6137                         dev_ret = ret;
6138                         break;
6139                 }
6140
6141                 trimmed += group_trimmed;
6142         }
6143         mutex_unlock(&fs_devices->device_list_mutex);
6144
6145         if (dev_failed)
6146                 btrfs_warn(fs_info,
6147                         "failed to trim %llu device(s), last error %d",
6148                         dev_failed, dev_ret);
6149         range->len = trimmed;
6150         if (bg_ret)
6151                 return bg_ret;
6152         return dev_ret;
6153 }