fs/btrfs/free-space-cache.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Red Hat.  All rights reserved.
4  */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "misc.h"
15 #include "ctree.h"
16 #include "free-space-cache.h"
17 #include "transaction.h"
18 #include "disk-io.h"
19 #include "extent_io.h"
20 #include "volumes.h"
21 #include "space-info.h"
22 #include "delalloc-space.h"
23 #include "block-group.h"
24 #include "discard.h"
25
26 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
27 #define MAX_CACHE_BYTES_PER_GIG SZ_64K
28 #define FORCE_EXTENT_THRESHOLD  SZ_1M
29
30 struct btrfs_trim_range {
31         u64 start;
32         u64 bytes;
33         struct list_head list;
34 };
35
36 static int link_free_space(struct btrfs_free_space_ctl *ctl,
37                            struct btrfs_free_space *info);
38 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
39                               struct btrfs_free_space *info);
40 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
41                          struct btrfs_free_space *bitmap_info, u64 *offset,
42                          u64 *bytes, bool for_alloc);
43 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
44                         struct btrfs_free_space *bitmap_info);
45 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
46                               struct btrfs_free_space *info, u64 offset,
47                               u64 bytes);
48
49 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
50                                                struct btrfs_path *path,
51                                                u64 offset)
52 {
53         struct btrfs_fs_info *fs_info = root->fs_info;
54         struct btrfs_key key;
55         struct btrfs_key location;
56         struct btrfs_disk_key disk_key;
57         struct btrfs_free_space_header *header;
58         struct extent_buffer *leaf;
59         struct inode *inode = NULL;
60         unsigned nofs_flag;
61         int ret;
62
63         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
64         key.offset = offset;
65         key.type = 0;
66
67         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
68         if (ret < 0)
69                 return ERR_PTR(ret);
70         if (ret > 0) {
71                 btrfs_release_path(path);
72                 return ERR_PTR(-ENOENT);
73         }
74
75         leaf = path->nodes[0];
76         header = btrfs_item_ptr(leaf, path->slots[0],
77                                 struct btrfs_free_space_header);
78         btrfs_free_space_key(leaf, header, &disk_key);
79         btrfs_disk_key_to_cpu(&location, &disk_key);
80         btrfs_release_path(path);
81
82         /*
83          * We are often under a trans handle at this point, so we need to make
84          * sure NOFS is set to keep us from deadlocking.
85          */
86         nofs_flag = memalloc_nofs_save();
87         inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
88         btrfs_release_path(path);
89         memalloc_nofs_restore(nofs_flag);
90         if (IS_ERR(inode))
91                 return inode;
92
93         mapping_set_gfp_mask(inode->i_mapping,
94                         mapping_gfp_constraint(inode->i_mapping,
95                         ~(__GFP_FS | __GFP_HIGHMEM)));
96
97         return inode;
98 }
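
/*
 * Editorial note, not part of the original file: a minimal sketch of the
 * GFP_NOFS scoping pattern used in __lookup_free_space_inode() above.  Any
 * allocation made between save and restore behaves as if GFP_NOFS had been
 * requested, so it cannot recurse back into the filesystem while we may be
 * holding a transaction handle.  The surrounding variables are hypothetical:
 *
 *        unsigned int nofs_flag;
 *
 *        nofs_flag = memalloc_nofs_save();
 *        inode = btrfs_iget_path(...);    (allocations here act as GFP_NOFS)
 *        memalloc_nofs_restore(nofs_flag);
 */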
99
100 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
101                 struct btrfs_path *path)
102 {
103         struct btrfs_fs_info *fs_info = block_group->fs_info;
104         struct inode *inode = NULL;
105         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
106
107         spin_lock(&block_group->lock);
108         if (block_group->inode)
109                 inode = igrab(block_group->inode);
110         spin_unlock(&block_group->lock);
111         if (inode)
112                 return inode;
113
114         inode = __lookup_free_space_inode(fs_info->tree_root, path,
115                                           block_group->start);
116         if (IS_ERR(inode))
117                 return inode;
118
119         spin_lock(&block_group->lock);
120         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
121                 btrfs_info(fs_info, "Old style space inode found, converting.");
122                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
123                         BTRFS_INODE_NODATACOW;
124                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
125         }
126
127         if (!block_group->iref) {
128                 block_group->inode = igrab(inode);
129                 block_group->iref = 1;
130         }
131         spin_unlock(&block_group->lock);
132
133         return inode;
134 }
135
136 static int __create_free_space_inode(struct btrfs_root *root,
137                                      struct btrfs_trans_handle *trans,
138                                      struct btrfs_path *path,
139                                      u64 ino, u64 offset)
140 {
141         struct btrfs_key key;
142         struct btrfs_disk_key disk_key;
143         struct btrfs_free_space_header *header;
144         struct btrfs_inode_item *inode_item;
145         struct extent_buffer *leaf;
146         /* We inline CRCs for the free disk space cache */
147         const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
148                           BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
149         int ret;
150
151         ret = btrfs_insert_empty_inode(trans, root, path, ino);
152         if (ret)
153                 return ret;
154
155         leaf = path->nodes[0];
156         inode_item = btrfs_item_ptr(leaf, path->slots[0],
157                                     struct btrfs_inode_item);
158         btrfs_item_key(leaf, &disk_key, path->slots[0]);
159         memzero_extent_buffer(leaf, (unsigned long)inode_item,
160                              sizeof(*inode_item));
161         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
162         btrfs_set_inode_size(leaf, inode_item, 0);
163         btrfs_set_inode_nbytes(leaf, inode_item, 0);
164         btrfs_set_inode_uid(leaf, inode_item, 0);
165         btrfs_set_inode_gid(leaf, inode_item, 0);
166         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
167         btrfs_set_inode_flags(leaf, inode_item, flags);
168         btrfs_set_inode_nlink(leaf, inode_item, 1);
169         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
170         btrfs_set_inode_block_group(leaf, inode_item, offset);
171         btrfs_mark_buffer_dirty(leaf);
172         btrfs_release_path(path);
173
174         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
175         key.offset = offset;
176         key.type = 0;
177         ret = btrfs_insert_empty_item(trans, root, path, &key,
178                                       sizeof(struct btrfs_free_space_header));
179         if (ret < 0) {
180                 btrfs_release_path(path);
181                 return ret;
182         }
183
184         leaf = path->nodes[0];
185         header = btrfs_item_ptr(leaf, path->slots[0],
186                                 struct btrfs_free_space_header);
187         memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
188         btrfs_set_free_space_key(leaf, header, &disk_key);
189         btrfs_mark_buffer_dirty(leaf);
190         btrfs_release_path(path);
191
192         return 0;
193 }
194
195 int create_free_space_inode(struct btrfs_trans_handle *trans,
196                             struct btrfs_block_group *block_group,
197                             struct btrfs_path *path)
198 {
199         int ret;
200         u64 ino;
201
202         ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
203         if (ret < 0)
204                 return ret;
205
206         return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
207                                          ino, block_group->start);
208 }
209
210 /*
211  * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
212  * handles lookup, otherwise it takes ownership and iputs the inode.
213  * Don't reuse an inode pointer after passing it into this function.
214  */
215 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
216                                   struct inode *inode,
217                                   struct btrfs_block_group *block_group)
218 {
219         struct btrfs_path *path;
220         struct btrfs_key key;
221         int ret = 0;
222
223         path = btrfs_alloc_path();
224         if (!path)
225                 return -ENOMEM;
226
227         if (!inode)
228                 inode = lookup_free_space_inode(block_group, path);
229         if (IS_ERR(inode)) {
230                 if (PTR_ERR(inode) != -ENOENT)
231                         ret = PTR_ERR(inode);
232                 goto out;
233         }
234         ret = btrfs_orphan_add(trans, BTRFS_I(inode));
235         if (ret) {
236                 btrfs_add_delayed_iput(inode);
237                 goto out;
238         }
239         clear_nlink(inode);
240         /* One for the block group's ref */
241         spin_lock(&block_group->lock);
242         if (block_group->iref) {
243                 block_group->iref = 0;
244                 block_group->inode = NULL;
245                 spin_unlock(&block_group->lock);
246                 iput(inode);
247         } else {
248                 spin_unlock(&block_group->lock);
249         }
250         /* One for the lookup ref */
251         btrfs_add_delayed_iput(inode);
252
253         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
254         key.type = 0;
255         key.offset = block_group->start;
256         ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
257                                 -1, 1);
258         if (ret) {
259                 if (ret > 0)
260                         ret = 0;
261                 goto out;
262         }
263         ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
264 out:
265         btrfs_free_path(path);
266         return ret;
267 }
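
/*
 * Editorial note, not part of the original file: a sketch of the two calling
 * conventions described in the comment above btrfs_remove_free_space_inode().
 * Error handling is omitted and the surrounding context is hypothetical:
 *
 *        (caller already looked the inode up; ownership passes to the callee)
 *        ret = btrfs_remove_free_space_inode(trans, inode, block_group);
 *        inode = NULL;    (must not be reused afterwards)
 *
 *        (or let the function do the lookup itself)
 *        ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
 */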
268
269 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
270                                        struct btrfs_block_rsv *rsv)
271 {
272         u64 needed_bytes;
273         int ret;
274
275         /* 1 for slack space, 1 for updating the inode */
276         needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
277                 btrfs_calc_metadata_size(fs_info, 1);
278
279         spin_lock(&rsv->lock);
280         if (rsv->reserved < needed_bytes)
281                 ret = -ENOSPC;
282         else
283                 ret = 0;
284         spin_unlock(&rsv->lock);
285         return ret;
286 }
287
288 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
289                                     struct btrfs_block_group *block_group,
290                                     struct inode *inode)
291 {
292         struct btrfs_root *root = BTRFS_I(inode)->root;
293         int ret = 0;
294         bool locked = false;
295
296         if (block_group) {
297                 struct btrfs_path *path = btrfs_alloc_path();
298
299                 if (!path) {
300                         ret = -ENOMEM;
301                         goto fail;
302                 }
303                 locked = true;
304                 mutex_lock(&trans->transaction->cache_write_mutex);
305                 if (!list_empty(&block_group->io_list)) {
306                         list_del_init(&block_group->io_list);
307
308                         btrfs_wait_cache_io(trans, block_group, path);
309                         btrfs_put_block_group(block_group);
310                 }
311
312                 /*
313                  * Now that we've truncated the cache away, it's no longer
314                  * set up or written.
315                  */
316                 spin_lock(&block_group->lock);
317                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
318                 spin_unlock(&block_group->lock);
319                 btrfs_free_path(path);
320         }
321
322         btrfs_i_size_write(BTRFS_I(inode), 0);
323         truncate_pagecache(inode, 0);
324
325         /*
326          * We skip the throttling logic for free space cache inodes, so we don't
327          * need to check for -EAGAIN.
328          */
329         ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
330                                          0, BTRFS_EXTENT_DATA_KEY, NULL);
331         if (ret)
332                 goto fail;
333
334         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
335
336 fail:
337         if (locked)
338                 mutex_unlock(&trans->transaction->cache_write_mutex);
339         if (ret)
340                 btrfs_abort_transaction(trans, ret);
341
342         return ret;
343 }
344
345 static void readahead_cache(struct inode *inode)
346 {
347         struct file_ra_state ra;
348         unsigned long last_index;
349
350         file_ra_state_init(&ra, inode->i_mapping);
351         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
352
353         page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
354 }
355
356 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
357                        int write)
358 {
359         int num_pages;
360
361         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
362
363         /* Make sure we can fit our crcs and generation into the first page */
364         if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
365                 return -ENOSPC;
366
367         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
368
369         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
370         if (!io_ctl->pages)
371                 return -ENOMEM;
372
373         io_ctl->num_pages = num_pages;
374         io_ctl->fs_info = btrfs_sb(inode->i_sb);
375         io_ctl->inode = inode;
376
377         return 0;
378 }
379 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
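
/*
 * Editorial note, not part of the original file: the on-disk layout that the
 * size check in io_ctl_init() enforces.  The per-page crc array and the
 * generation must both fit in page 0, with free space entries packed after
 * them and each bitmap occupying a whole page of its own:
 *
 *        page 0:    [ u32 crc[num_pages] ][ le64 generation ][ entries ... ]
 *        page 1..N: [ entries ... ]  or  [ one full bitmap ]
 *
 * which is why num_pages * sizeof(u32) + sizeof(u64) > PAGE_SIZE is -ENOSPC.
 */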
380
381 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
382 {
383         kfree(io_ctl->pages);
384         io_ctl->pages = NULL;
385 }
386
387 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
388 {
389         if (io_ctl->cur) {
390                 io_ctl->cur = NULL;
391                 io_ctl->orig = NULL;
392         }
393 }
394
395 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
396 {
397         ASSERT(io_ctl->index < io_ctl->num_pages);
398         io_ctl->page = io_ctl->pages[io_ctl->index++];
399         io_ctl->cur = page_address(io_ctl->page);
400         io_ctl->orig = io_ctl->cur;
401         io_ctl->size = PAGE_SIZE;
402         if (clear)
403                 clear_page(io_ctl->cur);
404 }
405
406 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
407 {
408         int i;
409
410         io_ctl_unmap_page(io_ctl);
411
412         for (i = 0; i < io_ctl->num_pages; i++) {
413                 if (io_ctl->pages[i]) {
414                         ClearPageChecked(io_ctl->pages[i]);
415                         unlock_page(io_ctl->pages[i]);
416                         put_page(io_ctl->pages[i]);
417                 }
418         }
419 }
420
421 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
422 {
423         struct page *page;
424         struct inode *inode = io_ctl->inode;
425         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
426         int i;
427
428         for (i = 0; i < io_ctl->num_pages; i++) {
429                 int ret;
430
431                 page = find_or_create_page(inode->i_mapping, i, mask);
432                 if (!page) {
433                         io_ctl_drop_pages(io_ctl);
434                         return -ENOMEM;
435                 }
436
437                 ret = set_page_extent_mapped(page);
438                 if (ret < 0) {
439                         unlock_page(page);
440                         put_page(page);
441                         io_ctl_drop_pages(io_ctl);
442                         return ret;
443                 }
444
445                 io_ctl->pages[i] = page;
446                 if (uptodate && !PageUptodate(page)) {
447                         btrfs_readpage(NULL, page);
448                         lock_page(page);
449                         if (page->mapping != inode->i_mapping) {
450                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
451                                           "free space cache page truncated");
452                                 io_ctl_drop_pages(io_ctl);
453                                 return -EIO;
454                         }
455                         if (!PageUptodate(page)) {
456                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
457                                            "error reading free space cache");
458                                 io_ctl_drop_pages(io_ctl);
459                                 return -EIO;
460                         }
461                 }
462         }
463
464         for (i = 0; i < io_ctl->num_pages; i++)
465                 clear_page_dirty_for_io(io_ctl->pages[i]);
466
467         return 0;
468 }
469
470 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
471 {
472         io_ctl_map_page(io_ctl, 1);
473
474         /*
475          * Skip the csum area: the first page starts with one u32 crc per
476          * page, followed by a 64bit generation chunk.
477          */
478         io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
479         io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
480
481         put_unaligned_le64(generation, io_ctl->cur);
482         io_ctl->cur += sizeof(u64);
483 }
484
485 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
486 {
487         u64 cache_gen;
488
489         /*
490          * Skip the crc area.  The generation is stored right after the
491          * per-page crc array at the front of the first page.
492          */
493         io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
494         io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
495
496         cache_gen = get_unaligned_le64(io_ctl->cur);
497         if (cache_gen != generation) {
498                 btrfs_err_rl(io_ctl->fs_info,
499                         "space cache generation (%llu) does not match inode (%llu)",
500                                 cache_gen, generation);
501                 io_ctl_unmap_page(io_ctl);
502                 return -EIO;
503         }
504         io_ctl->cur += sizeof(u64);
505         return 0;
506 }
507
508 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
509 {
510         u32 *tmp;
511         u32 crc = ~(u32)0;
512         unsigned offset = 0;
513
514         if (index == 0)
515                 offset = sizeof(u32) * io_ctl->num_pages;
516
517         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
518         btrfs_crc32c_final(crc, (u8 *)&crc);
519         io_ctl_unmap_page(io_ctl);
520         tmp = page_address(io_ctl->pages[0]);
521         tmp += index;
522         *tmp = crc;
523 }
524
525 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
526 {
527         u32 *tmp, val;
528         u32 crc = ~(u32)0;
529         unsigned offset = 0;
530
531         if (index == 0)
532                 offset = sizeof(u32) * io_ctl->num_pages;
533
534         tmp = page_address(io_ctl->pages[0]);
535         tmp += index;
536         val = *tmp;
537
538         io_ctl_map_page(io_ctl, 0);
539         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
540         btrfs_crc32c_final(crc, (u8 *)&crc);
541         if (val != crc) {
542                 btrfs_err_rl(io_ctl->fs_info,
543                         "csum mismatch on free space cache");
544                 io_ctl_unmap_page(io_ctl);
545                 return -EIO;
546         }
547
548         return 0;
549 }
550
551 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
552                             void *bitmap)
553 {
554         struct btrfs_free_space_entry *entry;
555
556         if (!io_ctl->cur)
557                 return -ENOSPC;
558
559         entry = io_ctl->cur;
560         put_unaligned_le64(offset, &entry->offset);
561         put_unaligned_le64(bytes, &entry->bytes);
562         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
563                 BTRFS_FREE_SPACE_EXTENT;
564         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
565         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
566
567         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
568                 return 0;
569
570         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
571
572         /* No more pages to map */
573         if (io_ctl->index >= io_ctl->num_pages)
574                 return 0;
575
576         /* map the next page */
577         io_ctl_map_page(io_ctl, 1);
578         return 0;
579 }
580
581 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
582 {
583         if (!io_ctl->cur)
584                 return -ENOSPC;
585
586         /*
587          * If we aren't at the start of the current page, unmap this one and
588          * map the next one if there is any left.
589          */
590         if (io_ctl->cur != io_ctl->orig) {
591                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
592                 if (io_ctl->index >= io_ctl->num_pages)
593                         return -ENOSPC;
594                 io_ctl_map_page(io_ctl, 0);
595         }
596
597         copy_page(io_ctl->cur, bitmap);
598         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
599         if (io_ctl->index < io_ctl->num_pages)
600                 io_ctl_map_page(io_ctl, 0);
601         return 0;
602 }
603
604 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
605 {
606         /*
607          * If we're not on the boundary we know we've modified the page and we
608          * need to crc the page.
609          */
610         if (io_ctl->cur != io_ctl->orig)
611                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
612         else
613                 io_ctl_unmap_page(io_ctl);
614
615         while (io_ctl->index < io_ctl->num_pages) {
616                 io_ctl_map_page(io_ctl, 1);
617                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
618         }
619 }
620
621 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
622                             struct btrfs_free_space *entry, u8 *type)
623 {
624         struct btrfs_free_space_entry *e;
625         int ret;
626
627         if (!io_ctl->cur) {
628                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
629                 if (ret)
630                         return ret;
631         }
632
633         e = io_ctl->cur;
634         entry->offset = get_unaligned_le64(&e->offset);
635         entry->bytes = get_unaligned_le64(&e->bytes);
636         *type = e->type;
637         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
638         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
639
640         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
641                 return 0;
642
643         io_ctl_unmap_page(io_ctl);
644
645         return 0;
646 }
647
648 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
649                               struct btrfs_free_space *entry)
650 {
651         int ret;
652
653         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
654         if (ret)
655                 return ret;
656
657         copy_page(entry->bitmap, io_ctl->cur);
658         io_ctl_unmap_page(io_ctl);
659
660         return 0;
661 }
662
663 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
664 {
665         struct btrfs_block_group *block_group = ctl->private;
666         u64 max_bytes;
667         u64 bitmap_bytes;
668         u64 extent_bytes;
669         u64 size = block_group->length;
670         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
671         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
672
673         max_bitmaps = max_t(u64, max_bitmaps, 1);
674
675         ASSERT(ctl->total_bitmaps <= max_bitmaps);
676
677         /*
678          * We are trying to keep the total amount of memory used per 1GiB of
679          * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
680          * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
681          * bitmaps, we may end up using more memory than this.
682          */
683         if (size < SZ_1G)
684                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
685         else
686                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
687
688         bitmap_bytes = ctl->total_bitmaps * ctl->unit;
689
690         /*
691          * We want the bytes available for extent entries to be whatever is
692          * left over after the bitmaps, but never more than half of max_bytes.
693          */
694         extent_bytes = max_bytes - bitmap_bytes;
695         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
696
697         ctl->extents_thresh =
698                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
699 }
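
/*
 * Editorial worked example, not part of the original file, assuming 4 KiB
 * pages, ctl->unit == 4096, a 1 GiB block group and one bitmap in use:
 *
 *        bytes_per_bg   = BITS_PER_BITMAP * 4096 = 32768 * 4096 = 128 MiB
 *        max_bitmaps    = ceil(1 GiB / 128 MiB)  = 8
 *        max_bytes      = MAX_CACHE_BYTES_PER_GIG = 64 KiB
 *        bitmap_bytes   = 1 * 4096 = 4 KiB
 *        extent_bytes   = min(64 KiB - 4 KiB, 64 KiB / 2) = 32 KiB
 *        extents_thresh = 32 KiB / sizeof(struct btrfs_free_space)
 */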
700
701 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
702                                    struct btrfs_free_space_ctl *ctl,
703                                    struct btrfs_path *path, u64 offset)
704 {
705         struct btrfs_fs_info *fs_info = root->fs_info;
706         struct btrfs_free_space_header *header;
707         struct extent_buffer *leaf;
708         struct btrfs_io_ctl io_ctl;
709         struct btrfs_key key;
710         struct btrfs_free_space *e, *n;
711         LIST_HEAD(bitmaps);
712         u64 num_entries;
713         u64 num_bitmaps;
714         u64 generation;
715         u8 type;
716         int ret = 0;
717
718         /* Nothing in the space cache, goodbye */
719         if (!i_size_read(inode))
720                 return 0;
721
722         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
723         key.offset = offset;
724         key.type = 0;
725
726         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
727         if (ret < 0)
728                 return 0;
729         else if (ret > 0) {
730                 btrfs_release_path(path);
731                 return 0;
732         }
733
734         ret = -1;
735
736         leaf = path->nodes[0];
737         header = btrfs_item_ptr(leaf, path->slots[0],
738                                 struct btrfs_free_space_header);
739         num_entries = btrfs_free_space_entries(leaf, header);
740         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
741         generation = btrfs_free_space_generation(leaf, header);
742         btrfs_release_path(path);
743
744         if (!BTRFS_I(inode)->generation) {
745                 btrfs_info(fs_info,
746                            "the free space cache file (%llu) is invalid, skip it",
747                            offset);
748                 return 0;
749         }
750
751         if (BTRFS_I(inode)->generation != generation) {
752                 btrfs_err(fs_info,
753                           "free space inode generation (%llu) did not match free space cache generation (%llu)",
754                           BTRFS_I(inode)->generation, generation);
755                 return 0;
756         }
757
758         if (!num_entries)
759                 return 0;
760
761         ret = io_ctl_init(&io_ctl, inode, 0);
762         if (ret)
763                 return ret;
764
765         readahead_cache(inode);
766
767         ret = io_ctl_prepare_pages(&io_ctl, true);
768         if (ret)
769                 goto out;
770
771         ret = io_ctl_check_crc(&io_ctl, 0);
772         if (ret)
773                 goto free_cache;
774
775         ret = io_ctl_check_generation(&io_ctl, generation);
776         if (ret)
777                 goto free_cache;
778
779         while (num_entries) {
780                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
781                                       GFP_NOFS);
782                 if (!e) {
783                         ret = -ENOMEM;
784                         goto free_cache;
785                 }
786
787                 ret = io_ctl_read_entry(&io_ctl, e, &type);
788                 if (ret) {
789                         kmem_cache_free(btrfs_free_space_cachep, e);
790                         goto free_cache;
791                 }
792
793                 if (!e->bytes) {
794                         ret = -1;
795                         kmem_cache_free(btrfs_free_space_cachep, e);
796                         goto free_cache;
797                 }
798
799                 if (type == BTRFS_FREE_SPACE_EXTENT) {
800                         spin_lock(&ctl->tree_lock);
801                         ret = link_free_space(ctl, e);
802                         spin_unlock(&ctl->tree_lock);
803                         if (ret) {
804                                 btrfs_err(fs_info,
805                                         "Duplicate entries in free space cache, dumping");
806                                 kmem_cache_free(btrfs_free_space_cachep, e);
807                                 goto free_cache;
808                         }
809                 } else {
810                         ASSERT(num_bitmaps);
811                         num_bitmaps--;
812                         e->bitmap = kmem_cache_zalloc(
813                                         btrfs_free_space_bitmap_cachep, GFP_NOFS);
814                         if (!e->bitmap) {
815                                 ret = -ENOMEM;
816                                 kmem_cache_free(
817                                         btrfs_free_space_cachep, e);
818                                 goto free_cache;
819                         }
820                         spin_lock(&ctl->tree_lock);
821                         ret = link_free_space(ctl, e);
822                         ctl->total_bitmaps++;
823                         recalculate_thresholds(ctl);
824                         spin_unlock(&ctl->tree_lock);
825                         if (ret) {
826                                 btrfs_err(fs_info,
827                                         "Duplicate entries in free space cache, dumping");
828                                 kmem_cache_free(btrfs_free_space_cachep, e);
829                                 goto free_cache;
830                         }
831                         list_add_tail(&e->list, &bitmaps);
832                 }
833
834                 num_entries--;
835         }
836
837         io_ctl_unmap_page(&io_ctl);
838
839         /*
840          * The bitmaps are stored after all the regular entries in the cache
841          * file, so read them in only after every entry has been linked in.
842          */
843         list_for_each_entry_safe(e, n, &bitmaps, list) {
844                 list_del_init(&e->list);
845                 ret = io_ctl_read_bitmap(&io_ctl, e);
846                 if (ret)
847                         goto free_cache;
848         }
849
850         io_ctl_drop_pages(&io_ctl);
851         ret = 1;
852 out:
853         io_ctl_free(&io_ctl);
854         return ret;
855 free_cache:
856         io_ctl_drop_pages(&io_ctl);
857         __btrfs_remove_free_space_cache(ctl);
858         goto out;
859 }
860
861 static int copy_free_space_cache(struct btrfs_block_group *block_group,
862                                  struct btrfs_free_space_ctl *ctl)
863 {
864         struct btrfs_free_space *info;
865         struct rb_node *n;
866         int ret = 0;
867
868         while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
869                 info = rb_entry(n, struct btrfs_free_space, offset_index);
870                 if (!info->bitmap) {
871                         unlink_free_space(ctl, info);
872                         ret = btrfs_add_free_space(block_group, info->offset,
873                                                    info->bytes);
874                         kmem_cache_free(btrfs_free_space_cachep, info);
875                 } else {
876                         u64 offset = info->offset;
877                         u64 bytes = ctl->unit;
878
879                         while (search_bitmap(ctl, info, &offset, &bytes,
880                                              false) == 0) {
881                                 ret = btrfs_add_free_space(block_group, offset,
882                                                            bytes);
883                                 if (ret)
884                                         break;
885                                 bitmap_clear_bits(ctl, info, offset, bytes);
886                                 offset = info->offset;
887                                 bytes = ctl->unit;
888                         }
889                         free_bitmap(ctl, info);
890                 }
891                 cond_resched();
892         }
893         return ret;
894 }
895
896 int load_free_space_cache(struct btrfs_block_group *block_group)
897 {
898         struct btrfs_fs_info *fs_info = block_group->fs_info;
899         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
900         struct btrfs_free_space_ctl tmp_ctl = {};
901         struct inode *inode;
902         struct btrfs_path *path;
903         int ret = 0;
904         bool matched;
905         u64 used = block_group->used;
906
907         /*
908          * Because we could potentially discard our loaded free space, we want
909          * to load everything into a temporary structure first, and then if it's
910          * valid copy it all into the actual free space ctl.
911          */
912         btrfs_init_free_space_ctl(block_group, &tmp_ctl);
913
914         /*
915          * If this block group has been marked to be cleared for one reason or
916          * another then we can't trust the on disk cache, so just return.
917          */
918         spin_lock(&block_group->lock);
919         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
920                 spin_unlock(&block_group->lock);
921                 return 0;
922         }
923         spin_unlock(&block_group->lock);
924
925         path = btrfs_alloc_path();
926         if (!path)
927                 return 0;
928         path->search_commit_root = 1;
929         path->skip_locking = 1;
930
931         /*
932          * We must pass a path with search_commit_root set to btrfs_iget in
933          * order to avoid a deadlock when allocating extents for the tree root.
934          *
935          * When we are COWing an extent buffer from the tree root, when looking
936          * for a free extent, at extent-tree.c:find_free_extent(), we can find
937          * a block group without its free space cache loaded. When we find one,
938          * we must load its space cache, which requires reading its free space
939          * cache's inode item from the root tree. If this inode item is located
940          * in the same leaf that we started COWing before, then we end up in a
941          * deadlock on the extent buffer (trying to read lock it when we
942          * previously write locked it).
943          *
944          * It's safe to read the inode item using the commit root because
945          * block groups, once loaded, stay in memory forever (until they are
946          * removed) as well as their space caches once loaded. New block groups
947          * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
948          * we will never try to read their inode item while the fs is mounted.
949          */
950         inode = lookup_free_space_inode(block_group, path);
951         if (IS_ERR(inode)) {
952                 btrfs_free_path(path);
953                 return 0;
954         }
955
956         /* We may have converted the inode and made the cache invalid. */
957         spin_lock(&block_group->lock);
958         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
959                 spin_unlock(&block_group->lock);
960                 btrfs_free_path(path);
961                 goto out;
962         }
963         spin_unlock(&block_group->lock);
964
965         ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
966                                       path, block_group->start);
967         btrfs_free_path(path);
968         if (ret <= 0)
969                 goto out;
970
971         matched = (tmp_ctl.free_space == (block_group->length - used -
972                                           block_group->bytes_super));
973
974         if (matched) {
975                 ret = copy_free_space_cache(block_group, &tmp_ctl);
976                 /*
977                  * ret == 1 means we successfully loaded the free space cache,
978                  * so we need to re-set it here.
979                  */
980                 if (ret == 0)
981                         ret = 1;
982         } else {
983                 __btrfs_remove_free_space_cache(&tmp_ctl);
984                 btrfs_warn(fs_info,
985                            "block group %llu has wrong amount of free space",
986                            block_group->start);
987                 ret = -1;
988         }
989 out:
990         if (ret < 0) {
991                 /* This cache is bogus, make sure it gets cleared */
992                 spin_lock(&block_group->lock);
993                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
994                 spin_unlock(&block_group->lock);
995                 ret = 0;
996
997                 btrfs_warn(fs_info,
998                            "failed to load free space cache for block group %llu, rebuilding it now",
999                            block_group->start);
1000         }
1001
1002         spin_lock(&ctl->tree_lock);
1003         btrfs_discard_update_discardable(block_group);
1004         spin_unlock(&ctl->tree_lock);
1005         iput(inode);
1006         return ret;
1007 }
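
/*
 * Editorial note, not part of the original file: the "matched" check above
 * only trusts the loaded cache when it agrees with the block group's own
 * accounting:
 *
 *        expected = block_group->length - used - block_group->bytes_super;
 *        matched  = (tmp_ctl.free_space == expected);
 *
 * On any mismatch or load error the temporary cache is thrown away and the
 * block group is marked BTRFS_DC_CLEAR so the cache gets rebuilt.
 */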
1008
1009 static noinline_for_stack
1010 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
1011                               struct btrfs_free_space_ctl *ctl,
1012                               struct btrfs_block_group *block_group,
1013                               int *entries, int *bitmaps,
1014                               struct list_head *bitmap_list)
1015 {
1016         int ret;
1017         struct btrfs_free_cluster *cluster = NULL;
1018         struct btrfs_free_cluster *cluster_locked = NULL;
1019         struct rb_node *node = rb_first(&ctl->free_space_offset);
1020         struct btrfs_trim_range *trim_entry;
1021
1022         /* Get the cluster for this block_group if it exists */
1023         if (block_group && !list_empty(&block_group->cluster_list)) {
1024                 cluster = list_entry(block_group->cluster_list.next,
1025                                      struct btrfs_free_cluster,
1026                                      block_group_list);
1027         }
1028
1029         if (!node && cluster) {
1030                 cluster_locked = cluster;
1031                 spin_lock(&cluster_locked->lock);
1032                 node = rb_first(&cluster->root);
1033                 cluster = NULL;
1034         }
1035
1036         /* Write out the extent entries */
1037         while (node) {
1038                 struct btrfs_free_space *e;
1039
1040                 e = rb_entry(node, struct btrfs_free_space, offset_index);
1041                 *entries += 1;
1042
1043                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
1044                                        e->bitmap);
1045                 if (ret)
1046                         goto fail;
1047
1048                 if (e->bitmap) {
1049                         list_add_tail(&e->list, bitmap_list);
1050                         *bitmaps += 1;
1051                 }
1052                 node = rb_next(node);
1053                 if (!node && cluster) {
1054                         node = rb_first(&cluster->root);
1055                         cluster_locked = cluster;
1056                         spin_lock(&cluster_locked->lock);
1057                         cluster = NULL;
1058                 }
1059         }
1060         if (cluster_locked) {
1061                 spin_unlock(&cluster_locked->lock);
1062                 cluster_locked = NULL;
1063         }
1064
1065         /*
1066          * Make sure we don't miss any range that was removed from our rbtree
1067          * because trimming is running. Otherwise after a umount+mount (or crash
1068          * after committing the transaction) we would leak free space and get
1069          * an inconsistent free space cache report from fsck.
1070          */
1071         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1072                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1073                                        trim_entry->bytes, NULL);
1074                 if (ret)
1075                         goto fail;
1076                 *entries += 1;
1077         }
1078
1079         return 0;
1080 fail:
1081         if (cluster_locked)
1082                 spin_unlock(&cluster_locked->lock);
1083         return -ENOSPC;
1084 }
1085
1086 static noinline_for_stack int
1087 update_cache_item(struct btrfs_trans_handle *trans,
1088                   struct btrfs_root *root,
1089                   struct inode *inode,
1090                   struct btrfs_path *path, u64 offset,
1091                   int entries, int bitmaps)
1092 {
1093         struct btrfs_key key;
1094         struct btrfs_free_space_header *header;
1095         struct extent_buffer *leaf;
1096         int ret;
1097
1098         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1099         key.offset = offset;
1100         key.type = 0;
1101
1102         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1103         if (ret < 0) {
1104                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1105                                  EXTENT_DELALLOC, 0, 0, NULL);
1106                 goto fail;
1107         }
1108         leaf = path->nodes[0];
1109         if (ret > 0) {
1110                 struct btrfs_key found_key;
1111                 ASSERT(path->slots[0]);
1112                 path->slots[0]--;
1113                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1114                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1115                     found_key.offset != offset) {
1116                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1117                                          inode->i_size - 1, EXTENT_DELALLOC, 0,
1118                                          0, NULL);
1119                         btrfs_release_path(path);
1120                         goto fail;
1121                 }
1122         }
1123
1124         BTRFS_I(inode)->generation = trans->transid;
1125         header = btrfs_item_ptr(leaf, path->slots[0],
1126                                 struct btrfs_free_space_header);
1127         btrfs_set_free_space_entries(leaf, header, entries);
1128         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1129         btrfs_set_free_space_generation(leaf, header, trans->transid);
1130         btrfs_mark_buffer_dirty(leaf);
1131         btrfs_release_path(path);
1132
1133         return 0;
1134
1135 fail:
1136         return -1;
1137 }
1138
1139 static noinline_for_stack int write_pinned_extent_entries(
1140                             struct btrfs_trans_handle *trans,
1141                             struct btrfs_block_group *block_group,
1142                             struct btrfs_io_ctl *io_ctl,
1143                             int *entries)
1144 {
1145         u64 start, extent_start, extent_end, len;
1146         struct extent_io_tree *unpin = NULL;
1147         int ret;
1148
1149         if (!block_group)
1150                 return 0;
1151
1152         /*
1153          * We want to add any pinned extents to our free space cache
1154          * so we don't leak the space.
1155          *
1156          * We shouldn't have switched the pinned extents yet, so this is the
1157          * right one.
1158          */
1159         unpin = &trans->transaction->pinned_extents;
1160
1161         start = block_group->start;
1162
1163         while (start < block_group->start + block_group->length) {
1164                 ret = find_first_extent_bit(unpin, start,
1165                                             &extent_start, &extent_end,
1166                                             EXTENT_DIRTY, NULL);
1167                 if (ret)
1168                         return 0;
1169
1170                 /* This pinned extent is out of our range */
1171                 if (extent_start >= block_group->start + block_group->length)
1172                         return 0;
1173
1174                 extent_start = max(extent_start, start);
1175                 extent_end = min(block_group->start + block_group->length,
1176                                  extent_end + 1);
1177                 len = extent_end - extent_start;
1178
1179                 *entries += 1;
1180                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1181                 if (ret)
1182                         return -ENOSPC;
1183
1184                 start = extent_end;
1185         }
1186
1187         return 0;
1188 }
1189
1190 static noinline_for_stack int
1191 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1192 {
1193         struct btrfs_free_space *entry, *next;
1194         int ret;
1195
1196         /* Write out the bitmaps */
1197         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1198                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1199                 if (ret)
1200                         return -ENOSPC;
1201                 list_del_init(&entry->list);
1202         }
1203
1204         return 0;
1205 }
1206
1207 static int flush_dirty_cache(struct inode *inode)
1208 {
1209         int ret;
1210
1211         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1212         if (ret)
1213                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1214                                  EXTENT_DELALLOC, 0, 0, NULL);
1215
1216         return ret;
1217 }
1218
1219 static void noinline_for_stack
1220 cleanup_bitmap_list(struct list_head *bitmap_list)
1221 {
1222         struct btrfs_free_space *entry, *next;
1223
1224         list_for_each_entry_safe(entry, next, bitmap_list, list)
1225                 list_del_init(&entry->list);
1226 }
1227
1228 static void noinline_for_stack
1229 cleanup_write_cache_enospc(struct inode *inode,
1230                            struct btrfs_io_ctl *io_ctl,
1231                            struct extent_state **cached_state)
1232 {
1233         io_ctl_drop_pages(io_ctl);
1234         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1235                              i_size_read(inode) - 1, cached_state);
1236 }
1237
1238 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1239                                  struct btrfs_trans_handle *trans,
1240                                  struct btrfs_block_group *block_group,
1241                                  struct btrfs_io_ctl *io_ctl,
1242                                  struct btrfs_path *path, u64 offset)
1243 {
1244         int ret;
1245         struct inode *inode = io_ctl->inode;
1246
1247         if (!inode)
1248                 return 0;
1249
1250         /* Flush the dirty pages in the cache file. */
1251         ret = flush_dirty_cache(inode);
1252         if (ret)
1253                 goto out;
1254
1255         /* Update the cache item to tell everyone this cache file is valid. */
1256         ret = update_cache_item(trans, root, inode, path, offset,
1257                                 io_ctl->entries, io_ctl->bitmaps);
1258 out:
1259         if (ret) {
1260                 invalidate_inode_pages2(inode->i_mapping);
1261                 BTRFS_I(inode)->generation = 0;
1262                 if (block_group)
1263                         btrfs_debug(root->fs_info,
1264           "failed to write free space cache for block group %llu error %d",
1265                                   block_group->start, ret);
1266         }
1267         btrfs_update_inode(trans, root, BTRFS_I(inode));
1268
1269         if (block_group) {
1270                 /* the dirty list is protected by the dirty_bgs_lock */
1271                 spin_lock(&trans->transaction->dirty_bgs_lock);
1272
1273                 /* the disk_cache_state is protected by the block group lock */
1274                 spin_lock(&block_group->lock);
1275
1276                 /*
1277                  * Only mark this as written if we didn't get put back on
1278                  * the dirty list while waiting for IO.  Otherwise our
1279                  * cache state won't be right, and we won't get written again.
1280                  */
1281                 if (!ret && list_empty(&block_group->dirty_list))
1282                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1283                 else if (ret)
1284                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1285
1286                 spin_unlock(&block_group->lock);
1287                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1288                 io_ctl->inode = NULL;
1289                 iput(inode);
1290         }
1291
1292         return ret;
1293
1294 }
1295
1296 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1297                         struct btrfs_block_group *block_group,
1298                         struct btrfs_path *path)
1299 {
1300         return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1301                                      block_group, &block_group->io_ctl,
1302                                      path, block_group->start);
1303 }
1304
1305 /**
1306  * Write out cached info to an inode
1307  *
1308  * @root:        root the inode belongs to
1309  * @inode:       free space inode we are writing out
1310  * @ctl:         free space cache we are going to write out
1311  * @block_group: block_group for this cache if it belongs to a block_group
1312  * @io_ctl:      holds context for the io
1313  * @trans:       the trans handle
1314  *
1315  * This function writes out a free space cache struct to disk for quick recovery
1316  * on mount.  This will return 0 if it was successful in writing the cache out,
1317  * or an errno if it was not.
1318  */
1319 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1320                                    struct btrfs_free_space_ctl *ctl,
1321                                    struct btrfs_block_group *block_group,
1322                                    struct btrfs_io_ctl *io_ctl,
1323                                    struct btrfs_trans_handle *trans)
1324 {
1325         struct extent_state *cached_state = NULL;
1326         LIST_HEAD(bitmap_list);
1327         int entries = 0;
1328         int bitmaps = 0;
1329         int ret;
1330         int must_iput = 0;
1331
1332         if (!i_size_read(inode))
1333                 return -EIO;
1334
1335         WARN_ON(io_ctl->pages);
1336         ret = io_ctl_init(io_ctl, inode, 1);
1337         if (ret)
1338                 return ret;
1339
1340         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1341                 down_write(&block_group->data_rwsem);
1342                 spin_lock(&block_group->lock);
1343                 if (block_group->delalloc_bytes) {
1344                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1345                         spin_unlock(&block_group->lock);
1346                         up_write(&block_group->data_rwsem);
1347                         BTRFS_I(inode)->generation = 0;
1348                         ret = 0;
1349                         must_iput = 1;
1350                         goto out;
1351                 }
1352                 spin_unlock(&block_group->lock);
1353         }
1354
1355         /* Lock all pages first so we can lock the extent safely. */
1356         ret = io_ctl_prepare_pages(io_ctl, false);
1357         if (ret)
1358                 goto out_unlock;
1359
1360         lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1361                          &cached_state);
1362
1363         io_ctl_set_generation(io_ctl, trans->transid);
1364
1365         mutex_lock(&ctl->cache_writeout_mutex);
1366         /* Write out the extent entries in the free space cache */
1367         spin_lock(&ctl->tree_lock);
1368         ret = write_cache_extent_entries(io_ctl, ctl,
1369                                          block_group, &entries, &bitmaps,
1370                                          &bitmap_list);
1371         if (ret)
1372                 goto out_nospc_locked;
1373
1374         /*
1375          * Some space that is freed in the current transaction is pinned;
1376          * it will be added to the free space cache after the transaction is
1377          * committed, so we must not lose it.
1378          *
1379          * If this changes while we are working we'll get added back to
1380          * the dirty list and redo it.  No locking needed.
1381          */
1382         ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1383         if (ret)
1384                 goto out_nospc_locked;
1385
1386         /*
1387          * Finally, we write out all the bitmaps and keep cache_writeout_mutex
1388          * locked while doing it because a concurrent trim can be manipulating
1389          * or freeing the bitmap.
1390          */
1391         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1392         spin_unlock(&ctl->tree_lock);
1393         mutex_unlock(&ctl->cache_writeout_mutex);
1394         if (ret)
1395                 goto out_nospc;
1396
1397         /* Zero out the rest of the pages just to make sure */
1398         io_ctl_zero_remaining_pages(io_ctl);
1399
1400         /* Everything is written out, now we dirty the pages in the file. */
1401         ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1402                                 io_ctl->num_pages, 0, i_size_read(inode),
1403                                 &cached_state, false);
1404         if (ret)
1405                 goto out_nospc;
1406
1407         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1408                 up_write(&block_group->data_rwsem);
1409         /*
1410          * Release the pages and unlock the extent; we will flush
1411          * them out later.
1412          */
1413         io_ctl_drop_pages(io_ctl);
1414         io_ctl_free(io_ctl);
1415
1416         unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1417                              i_size_read(inode) - 1, &cached_state);
1418
1419         /*
1420          * At this point the pages are under IO and we're happy.
1421          * The caller is responsible for waiting on them and updating
1422          * the cache and the inode.
1423          */
1424         io_ctl->entries = entries;
1425         io_ctl->bitmaps = bitmaps;
1426
1427         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1428         if (ret)
1429                 goto out;
1430
1431         return 0;
1432
1433 out_nospc_locked:
1434         cleanup_bitmap_list(&bitmap_list);
1435         spin_unlock(&ctl->tree_lock);
1436         mutex_unlock(&ctl->cache_writeout_mutex);
1437
1438 out_nospc:
1439         cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1440
1441 out_unlock:
1442         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1443                 up_write(&block_group->data_rwsem);
1444
1445 out:
1446         io_ctl->inode = NULL;
1447         io_ctl_free(io_ctl);
1448         if (ret) {
1449                 invalidate_inode_pages2(inode->i_mapping);
1450                 BTRFS_I(inode)->generation = 0;
1451         }
1452         btrfs_update_inode(trans, root, BTRFS_I(inode));
1453         if (must_iput)
1454                 iput(inode);
1455         return ret;
1456 }
1457
1458 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1459                           struct btrfs_block_group *block_group,
1460                           struct btrfs_path *path)
1461 {
1462         struct btrfs_fs_info *fs_info = trans->fs_info;
1463         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1464         struct inode *inode;
1465         int ret = 0;
1466
1467         spin_lock(&block_group->lock);
1468         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1469                 spin_unlock(&block_group->lock);
1470                 return 0;
1471         }
1472         spin_unlock(&block_group->lock);
1473
1474         inode = lookup_free_space_inode(block_group, path);
1475         if (IS_ERR(inode))
1476                 return 0;
1477
1478         ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1479                                 block_group, &block_group->io_ctl, trans);
1480         if (ret) {
1481                 btrfs_debug(fs_info,
1482           "failed to write free space cache for block group %llu error %d",
1483                           block_group->start, ret);
1484                 spin_lock(&block_group->lock);
1485                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1486                 spin_unlock(&block_group->lock);
1487
1488                 block_group->io_ctl.inode = NULL;
1489                 iput(inode);
1490         }
1491
1492         /*
1493          * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1494          * to wait for IO and put the inode
1495          */
1496
1497         return ret;
1498 }
1499
1500 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1501                                           u64 offset)
1502 {
1503         ASSERT(offset >= bitmap_start);
1504         offset -= bitmap_start;
1505         return (unsigned long)(div_u64(offset, unit));
1506 }
1507
1508 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1509 {
1510         return (unsigned long)(div_u64(bytes, unit));
1511 }
1512
1513 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1514                                    u64 offset)
1515 {
1516         u64 bitmap_start;
1517         u64 bytes_per_bitmap;
1518
1519         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1520         bitmap_start = offset - ctl->start;
1521         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1522         bitmap_start *= bytes_per_bitmap;
1523         bitmap_start += ctl->start;
1524
1525         return bitmap_start;
1526 }
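/*
 * Editorial note: an illustrative user-space sketch, not part of this file.
 * It mirrors the arithmetic of offset_to_bitmap() and offset_to_bit() above,
 * assuming 4 KiB pages (BITS_PER_BITMAP == 32768) and a 4 KiB ctl->unit, so
 * one bitmap covers 128 MiB.  All ex_* names and the numbers in main() are
 * invented for the example.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EX_BITS_PER_BITMAP      (4096 * 8UL)    /* PAGE_SIZE * 8 */

static uint64_t ex_offset_to_bitmap(uint64_t start, uint32_t unit, uint64_t offset)
{
        uint64_t bytes_per_bitmap = EX_BITS_PER_BITMAP * unit;  /* 128 MiB */

        return (offset - start) / bytes_per_bitmap * bytes_per_bitmap + start;
}

static unsigned long ex_offset_to_bit(uint64_t bitmap_start, uint32_t unit,
                                      uint64_t offset)
{
        return (unsigned long)((offset - bitmap_start) / unit);
}

int main(void)
{
        uint64_t start = 1ULL << 30;                    /* block group at 1 GiB */
        uint32_t unit = 4096;
        uint64_t offset = start + 200ULL * 1024 * 1024; /* 200 MiB into the group */
        uint64_t bm = ex_offset_to_bitmap(start, unit, offset);

        /* 200 MiB falls into the second 128 MiB bitmap of this block group. */
        assert(bm == start + 128ULL * 1024 * 1024);
        /* Bit index inside that bitmap: (200 MiB - 128 MiB) / 4 KiB. */
        assert(ex_offset_to_bit(bm, unit, offset) == 18432);
        printf("bitmap start %llu, bit %lu\n", (unsigned long long)bm,
               ex_offset_to_bit(bm, unit, offset));
        return 0;
}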
1527
1528 static int tree_insert_offset(struct rb_root *root, u64 offset,
1529                               struct rb_node *node, int bitmap)
1530 {
1531         struct rb_node **p = &root->rb_node;
1532         struct rb_node *parent = NULL;
1533         struct btrfs_free_space *info;
1534
1535         while (*p) {
1536                 parent = *p;
1537                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1538
1539                 if (offset < info->offset) {
1540                         p = &(*p)->rb_left;
1541                 } else if (offset > info->offset) {
1542                         p = &(*p)->rb_right;
1543                 } else {
1544                         /*
1545                          * we could have a bitmap entry and an extent entry
1546                          * share the same offset.  If this is the case, we want
1547                          * the extent entry to always be found first if we do a
1548                          * linear search through the tree, since we want to have
1549                          * the quickest allocation time, and allocating from an
1550                          * extent is faster than allocating from a bitmap.  So
1551                          * if we're inserting a bitmap and we find an entry at
1552                          * this offset, we want to go right, or after this entry
1553                          * logically.  If we are inserting an extent and we've
1554                          * found a bitmap, we want to go left, or before
1555                          * logically.
1556                          */
1557                         if (bitmap) {
1558                                 if (info->bitmap) {
1559                                         WARN_ON_ONCE(1);
1560                                         return -EEXIST;
1561                                 }
1562                                 p = &(*p)->rb_right;
1563                         } else {
1564                                 if (!info->bitmap) {
1565                                         WARN_ON_ONCE(1);
1566                                         return -EEXIST;
1567                                 }
1568                                 p = &(*p)->rb_left;
1569                         }
1570                 }
1571         }
1572
1573         rb_link_node(node, parent, p);
1574         rb_insert_color(node, root);
1575
1576         return 0;
1577 }
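/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows the ordering rule that tree_insert_offset() above encodes through
 * its left/right choices, written as a plain comparator: at equal offsets an
 * extent entry sorts before a bitmap entry, so a linear walk finds the
 * cheaper extent entry first.  The struct and function names are invented
 * for the example.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct ex_entry {
        uint64_t offset;
        bool is_bitmap;
};

/* <0: a sorts before b, >0: a sorts after b, 0: duplicate (-EEXIST above) */
static int ex_cmp(const struct ex_entry *a, const struct ex_entry *b)
{
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        if (a->is_bitmap == b->is_bitmap)
                return 0;
        return a->is_bitmap ? 1 : -1;
}

int main(void)
{
        struct ex_entry extent = { .offset = 1024, .is_bitmap = false };
        struct ex_entry bitmap = { .offset = 1024, .is_bitmap = true };

        assert(ex_cmp(&extent, &bitmap) < 0);   /* extent entry comes first */
        assert(ex_cmp(&bitmap, &extent) > 0);
        assert(ex_cmp(&extent, &extent) == 0);  /* would be rejected as -EEXIST */
        return 0;
}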
1578
1579 /*
1580  * searches the tree for the given offset.
1581  *
1582  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1583  * want a section that is at least 'bytes' in size and comes at or after the given
1584  * offset.
1585  */
1586 static struct btrfs_free_space *
1587 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1588                    u64 offset, int bitmap_only, int fuzzy)
1589 {
1590         struct rb_node *n = ctl->free_space_offset.rb_node;
1591         struct btrfs_free_space *entry, *prev = NULL;
1592
1593         /* find entry that is closest to the 'offset' */
1594         while (1) {
1595                 if (!n) {
1596                         entry = NULL;
1597                         break;
1598                 }
1599
1600                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1601                 prev = entry;
1602
1603                 if (offset < entry->offset)
1604                         n = n->rb_left;
1605                 else if (offset > entry->offset)
1606                         n = n->rb_right;
1607                 else
1608                         break;
1609         }
1610
1611         if (bitmap_only) {
1612                 if (!entry)
1613                         return NULL;
1614                 if (entry->bitmap)
1615                         return entry;
1616
1617                 /*
1618                  * bitmap entry and extent entry may share same offset,
1619                  * in that case, bitmap entry comes after extent entry.
1620                  */
1621                 n = rb_next(n);
1622                 if (!n)
1623                         return NULL;
1624                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1625                 if (entry->offset != offset)
1626                         return NULL;
1627
1628                 WARN_ON(!entry->bitmap);
1629                 return entry;
1630         } else if (entry) {
1631                 if (entry->bitmap) {
1632                         /*
1633                          * if previous extent entry covers the offset,
1634                          * we should return it instead of the bitmap entry
1635                          */
1636                         n = rb_prev(&entry->offset_index);
1637                         if (n) {
1638                                 prev = rb_entry(n, struct btrfs_free_space,
1639                                                 offset_index);
1640                                 if (!prev->bitmap &&
1641                                     prev->offset + prev->bytes > offset)
1642                                         entry = prev;
1643                         }
1644                 }
1645                 return entry;
1646         }
1647
1648         if (!prev)
1649                 return NULL;
1650
1651         /* find last entry before the 'offset' */
1652         entry = prev;
1653         if (entry->offset > offset) {
1654                 n = rb_prev(&entry->offset_index);
1655                 if (n) {
1656                         entry = rb_entry(n, struct btrfs_free_space,
1657                                         offset_index);
1658                         ASSERT(entry->offset <= offset);
1659                 } else {
1660                         if (fuzzy)
1661                                 return entry;
1662                         else
1663                                 return NULL;
1664                 }
1665         }
1666
1667         if (entry->bitmap) {
1668                 n = rb_prev(&entry->offset_index);
1669                 if (n) {
1670                         prev = rb_entry(n, struct btrfs_free_space,
1671                                         offset_index);
1672                         if (!prev->bitmap &&
1673                             prev->offset + prev->bytes > offset)
1674                                 return prev;
1675                 }
1676                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1677                         return entry;
1678         } else if (entry->offset + entry->bytes > offset)
1679                 return entry;
1680
1681         if (!fuzzy)
1682                 return NULL;
1683
1684         while (1) {
1685                 if (entry->bitmap) {
1686                         if (entry->offset + BITS_PER_BITMAP *
1687                             ctl->unit > offset)
1688                                 break;
1689                 } else {
1690                         if (entry->offset + entry->bytes > offset)
1691                                 break;
1692                 }
1693
1694                 n = rb_next(&entry->offset_index);
1695                 if (!n)
1696                         return NULL;
1697                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1698         }
1699         return entry;
1700 }
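/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * reduces the "fuzzy" semantics documented for tree_search_offset() above to
 * a sorted array of plain extent entries (bitmaps and the rb-tree are left
 * out).  A non-fuzzy lookup only returns an entry that covers the offset;
 * a fuzzy lookup also accepts the first entry at or after it.  All ex_*
 * names are invented for the example.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct ex_extent {
        uint64_t offset;
        uint64_t bytes;
};

static const struct ex_extent *ex_search(const struct ex_extent *e, size_t n,
                                         uint64_t offset, int fuzzy)
{
        for (size_t i = 0; i < n; i++) {
                if (e[i].offset <= offset && offset < e[i].offset + e[i].bytes)
                        return &e[i];                   /* covers the offset */
                if (e[i].offset > offset)
                        return fuzzy ? &e[i] : NULL;    /* first entry after it */
        }
        return NULL;
}

int main(void)
{
        const struct ex_extent e[] = {
                { .offset = 0,    .bytes = 4096 },
                { .offset = 8192, .bytes = 4096 },
        };

        assert(ex_search(e, 2, 1024, 0) == &e[0]);      /* covered */
        assert(ex_search(e, 2, 5000, 0) == NULL);       /* hole, non-fuzzy */
        assert(ex_search(e, 2, 5000, 1) == &e[1]);      /* hole, fuzzy */
        return 0;
}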
1701
1702 static inline void
1703 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1704                     struct btrfs_free_space *info)
1705 {
1706         rb_erase(&info->offset_index, &ctl->free_space_offset);
1707         ctl->free_extents--;
1708
1709         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1710                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
1711                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1712         }
1713 }
1714
1715 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1716                               struct btrfs_free_space *info)
1717 {
1718         __unlink_free_space(ctl, info);
1719         ctl->free_space -= info->bytes;
1720 }
1721
1722 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1723                            struct btrfs_free_space *info)
1724 {
1725         int ret = 0;
1726
1727         ASSERT(info->bytes || info->bitmap);
1728         ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1729                                  &info->offset_index, (info->bitmap != NULL));
1730         if (ret)
1731                 return ret;
1732
1733         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1734                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
1735                 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1736         }
1737
1738         ctl->free_space += info->bytes;
1739         ctl->free_extents++;
1740         return ret;
1741 }
1742
1743 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1744                                        struct btrfs_free_space *info,
1745                                        u64 offset, u64 bytes)
1746 {
1747         unsigned long start, count, end;
1748         int extent_delta = -1;
1749
1750         start = offset_to_bit(info->offset, ctl->unit, offset);
1751         count = bytes_to_bits(bytes, ctl->unit);
1752         end = start + count;
1753         ASSERT(end <= BITS_PER_BITMAP);
1754
1755         bitmap_clear(info->bitmap, start, count);
1756
1757         info->bytes -= bytes;
1758         if (info->max_extent_size > ctl->unit)
1759                 info->max_extent_size = 0;
1760
1761         if (start && test_bit(start - 1, info->bitmap))
1762                 extent_delta++;
1763
1764         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1765                 extent_delta++;
1766
1767         info->bitmap_extents += extent_delta;
1768         if (!btrfs_free_space_trimmed(info)) {
1769                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1770                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1771         }
1772 }
1773
1774 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1775                               struct btrfs_free_space *info, u64 offset,
1776                               u64 bytes)
1777 {
1778         __bitmap_clear_bits(ctl, info, offset, bytes);
1779         ctl->free_space -= bytes;
1780 }
1781
1782 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1783                             struct btrfs_free_space *info, u64 offset,
1784                             u64 bytes)
1785 {
1786         unsigned long start, count, end;
1787         int extent_delta = 1;
1788
1789         start = offset_to_bit(info->offset, ctl->unit, offset);
1790         count = bytes_to_bits(bytes, ctl->unit);
1791         end = start + count;
1792         ASSERT(end <= BITS_PER_BITMAP);
1793
1794         bitmap_set(info->bitmap, start, count);
1795
1796         info->bytes += bytes;
1797         ctl->free_space += bytes;
1798
1799         if (start && test_bit(start - 1, info->bitmap))
1800                 extent_delta--;
1801
1802         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1803                 extent_delta--;
1804
1805         info->bitmap_extents += extent_delta;
1806         if (!btrfs_free_space_trimmed(info)) {
1807                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1808                 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1809         }
1810 }
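/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows the extent_delta bookkeeping used by __bitmap_clear_bits() and
 * bitmap_set_bits() above: clearing a run removes one extent unless a
 * neighbouring set bit keeps a piece alive on that side, and the symmetric
 * logic applies when setting bits.  A bool array stands in for the real
 * bitmap and all ex_* names are invented for the example.
 */
#include <assert.h>
#include <stdbool.h>

#define EX_BITS 16

static int ex_clear_delta(const bool *bm, unsigned int start, unsigned int count)
{
        int delta = -1;                         /* by default one extent goes away */

        if (start && bm[start - 1])
                delta++;                        /* a left piece survives */
        if (start + count < EX_BITS && bm[start + count])
                delta++;                        /* a right piece survives */
        return delta;
}

int main(void)
{
        bool bm[EX_BITS] = { 0 };

        /* Bits 2..9 set: a single extent of eight bits. */
        for (unsigned int i = 2; i <= 9; i++)
                bm[i] = true;

        /* Clearing the middle (bits 4..7) splits it in two: delta == +1. */
        assert(ex_clear_delta(bm, 4, 4) == 1);
        /* Clearing the left edge (bits 2..3) only shortens it: delta == 0. */
        assert(ex_clear_delta(bm, 2, 2) == 0);
        /* Clearing the whole run removes the extent: delta == -1. */
        assert(ex_clear_delta(bm, 2, 8) == -1);
        return 0;
}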
1811
1812 /*
1813  * If we cannot find a suitable extent, we will use 'bytes' to record
1814  * the size of the largest extent found.
1815  */
1816 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1817                          struct btrfs_free_space *bitmap_info, u64 *offset,
1818                          u64 *bytes, bool for_alloc)
1819 {
1820         unsigned long found_bits = 0;
1821         unsigned long max_bits = 0;
1822         unsigned long bits, i;
1823         unsigned long next_zero;
1824         unsigned long extent_bits;
1825
1826         /*
1827          * Skip searching the bitmap if we don't have a contiguous section that
1828          * is large enough for this allocation.
1829          */
1830         if (for_alloc &&
1831             bitmap_info->max_extent_size &&
1832             bitmap_info->max_extent_size < *bytes) {
1833                 *bytes = bitmap_info->max_extent_size;
1834                 return -1;
1835         }
1836
1837         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1838                           max_t(u64, *offset, bitmap_info->offset));
1839         bits = bytes_to_bits(*bytes, ctl->unit);
1840
1841         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1842                 if (for_alloc && bits == 1) {
1843                         found_bits = 1;
1844                         break;
1845                 }
1846                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1847                                                BITS_PER_BITMAP, i);
1848                 extent_bits = next_zero - i;
1849                 if (extent_bits >= bits) {
1850                         found_bits = extent_bits;
1851                         break;
1852                 } else if (extent_bits > max_bits) {
1853                         max_bits = extent_bits;
1854                 }
1855                 i = next_zero;
1856         }
1857
1858         if (found_bits) {
1859                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1860                 *bytes = (u64)(found_bits) * ctl->unit;
1861                 return 0;
1862         }
1863
1864         *bytes = (u64)(max_bits) * ctl->unit;
1865         bitmap_info->max_extent_size = *bytes;
1866         return -1;
1867 }
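/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows the contract of search_bitmap() above, in bits rather than bytes:
 * on success it returns 0 and reports where a run of at least 'want' set
 * bits starts; on failure it returns -1 and reports the largest run it saw,
 * which the caller caches as max_extent_size.  All ex_* names are invented
 * for the example.
 */
#include <assert.h>
#include <stdbool.h>

static int ex_search_bits(const bool *bm, unsigned int nbits, unsigned int want,
                          unsigned int *pos, unsigned int *found)
{
        unsigned int best = 0;

        for (unsigned int i = 0; i < nbits; ) {
                if (!bm[i]) {
                        i++;
                        continue;
                }
                unsigned int j = i;

                while (j < nbits && bm[j])
                        j++;
                if (j - i >= want) {
                        *pos = i;
                        *found = j - i;
                        return 0;
                }
                if (j - i > best)
                        best = j - i;
                i = j;
        }
        *found = best;          /* largest run seen, reported on failure */
        return -1;
}

int main(void)
{
        /* Runs of 2 and 5 set bits. */
        bool bm[16] = { 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0 };
        unsigned int pos, found;

        assert(ex_search_bits(bm, 16, 4, &pos, &found) == 0 && pos == 5 && found == 5);
        assert(ex_search_bits(bm, 16, 6, &pos, &found) == -1 && found == 5);
        return 0;
}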
1868
1869 static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1870 {
1871         if (entry->bitmap)
1872                 return entry->max_extent_size;
1873         return entry->bytes;
1874 }
1875
1876 /* Cache the size of the max extent in bytes */
1877 static struct btrfs_free_space *
1878 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1879                 unsigned long align, u64 *max_extent_size)
1880 {
1881         struct btrfs_free_space *entry;
1882         struct rb_node *node;
1883         u64 tmp;
1884         u64 align_off;
1885         int ret;
1886
1887         if (!ctl->free_space_offset.rb_node)
1888                 goto out;
1889
1890         entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1891         if (!entry)
1892                 goto out;
1893
1894         for (node = &entry->offset_index; node; node = rb_next(node)) {
1895                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1896                 if (entry->bytes < *bytes) {
1897                         *max_extent_size = max(get_max_extent_size(entry),
1898                                                *max_extent_size);
1899                         continue;
1900                 }
1901
1902                 /*
1903                  * Make sure the space returned is big enough to match our requested alignment.
1904                  */
1905                 if (*bytes >= align) {
1906                         tmp = entry->offset - ctl->start + align - 1;
1907                         tmp = div64_u64(tmp, align);
1908                         tmp = tmp * align + ctl->start;
1909                         align_off = tmp - entry->offset;
1910                 } else {
1911                         align_off = 0;
1912                         tmp = entry->offset;
1913                 }
1914
1915                 if (entry->bytes < *bytes + align_off) {
1916                         *max_extent_size = max(get_max_extent_size(entry),
1917                                                *max_extent_size);
1918                         continue;
1919                 }
1920
1921                 if (entry->bitmap) {
1922                         u64 size = *bytes;
1923
1924                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
1925                         if (!ret) {
1926                                 *offset = tmp;
1927                                 *bytes = size;
1928                                 return entry;
1929                         } else {
1930                                 *max_extent_size =
1931                                         max(get_max_extent_size(entry),
1932                                             *max_extent_size);
1933                         }
1934                         continue;
1935                 }
1936
1937                 *offset = tmp;
1938                 *bytes = entry->bytes - align_off;
1939                 return entry;
1940         }
1941 out:
1942         return NULL;
1943 }
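/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows the alignment rounding that find_free_space() above applies to an
 * extent entry: the candidate offset is rounded up to the next 'align'
 * boundary measured from ctl->start, and align_off is how many leading bytes
 * of the entry get skipped.  The numbers in main() are arbitrary.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t ex_align_up(uint64_t start, uint64_t align, uint64_t entry_offset,
                            uint64_t *align_off)
{
        uint64_t tmp = (entry_offset - start + align - 1) / align;

        tmp = tmp * align + start;
        *align_off = tmp - entry_offset;
        return tmp;
}

int main(void)
{
        uint64_t start = 1ULL << 30;            /* block group start */
        uint64_t align = 65536;                 /* 64 KiB allocation alignment */
        uint64_t entry = start + 100000;        /* unaligned free extent */
        uint64_t align_off;
        uint64_t aligned = ex_align_up(start, align, entry, &align_off);

        assert(aligned == start + 131072);      /* next 64 KiB boundary from start */
        assert(align_off == 31072);             /* leading bytes skipped */
        /* The entry is only usable if entry->bytes >= requested bytes + align_off. */
        return 0;
}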
1944
1945 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1946                            struct btrfs_free_space *info, u64 offset)
1947 {
1948         info->offset = offset_to_bitmap(ctl, offset);
1949         info->bytes = 0;
1950         info->bitmap_extents = 0;
1951         INIT_LIST_HEAD(&info->list);
1952         link_free_space(ctl, info);
1953         ctl->total_bitmaps++;
1954         recalculate_thresholds(ctl);
1955 }
1956
1957 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1958                         struct btrfs_free_space *bitmap_info)
1959 {
1960         /*
1961          * Normally when this is called, the bitmap is completely empty. However,
1962          * if we are blowing up the free space cache for one reason or another
1963          * via __btrfs_remove_free_space_cache(), then the bitmap may not be
1964          * empty and we would leave stale discard stats behind.
1965          */
1966         if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
1967                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
1968                         bitmap_info->bitmap_extents;
1969                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
1970
1971         }
1972         unlink_free_space(ctl, bitmap_info);
1973         kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1974         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1975         ctl->total_bitmaps--;
1976         recalculate_thresholds(ctl);
1977 }
1978
1979 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1980                               struct btrfs_free_space *bitmap_info,
1981                               u64 *offset, u64 *bytes)
1982 {
1983         u64 end;
1984         u64 search_start, search_bytes;
1985         int ret;
1986
1987 again:
1988         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1989
1990         /*
1991          * We need to search for bits in this bitmap.  We could only cover some
1992          * of the extent in this bitmap thanks to how we add space, so we need
1993          * to search for as much of it as we can and clear that amount, and then
1994          * go searching for the next bit.
1995          */
1996         search_start = *offset;
1997         search_bytes = ctl->unit;
1998         search_bytes = min(search_bytes, end - search_start + 1);
1999         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2000                             false);
2001         if (ret < 0 || search_start != *offset)
2002                 return -EINVAL;
2003
2004         /* We may have found more bits than what we need */
2005         search_bytes = min(search_bytes, *bytes);
2006
2007         /* Cannot clear past the end of the bitmap */
2008         search_bytes = min(search_bytes, end - search_start + 1);
2009
2010         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
2011         *offset += search_bytes;
2012         *bytes -= search_bytes;
2013
2014         if (*bytes) {
2015                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
2016                 if (!bitmap_info->bytes)
2017                         free_bitmap(ctl, bitmap_info);
2018
2019                 /*
2020                  * no entry after this bitmap, but we still have bytes to
2021                  * remove, so something has gone wrong.
2022                  */
2023                 if (!next)
2024                         return -EINVAL;
2025
2026                 bitmap_info = rb_entry(next, struct btrfs_free_space,
2027                                        offset_index);
2028
2029                 /*
2030                  * if the next entry isn't a bitmap we need to return to let the
2031                  * extent stuff do its work.
2032                  */
2033                 if (!bitmap_info->bitmap)
2034                         return -EAGAIN;
2035
2036                 /*
2037                  * OK, the next item is a bitmap, but it may not actually hold
2038                  * the information for the rest of this free space, so
2039                  * look for it there; if we don't find it, return so we can try
2040                  * everything over again.
2041                  */
2042                 search_start = *offset;
2043                 search_bytes = ctl->unit;
2044                 ret = search_bitmap(ctl, bitmap_info, &search_start,
2045                                     &search_bytes, false);
2046                 if (ret < 0 || search_start != *offset)
2047                         return -EAGAIN;
2048
2049                 goto again;
2050         } else if (!bitmap_info->bytes)
2051                 free_bitmap(ctl, bitmap_info);
2052
2053         return 0;
2054 }
2055
2056 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2057                                struct btrfs_free_space *info, u64 offset,
2058                                u64 bytes, enum btrfs_trim_state trim_state)
2059 {
2060         u64 bytes_to_set = 0;
2061         u64 end;
2062
2063         /*
2064          * This is a tradeoff to make bitmap trim state minimal.  We mark the
2065          * whole bitmap untrimmed if at any point we add untrimmed regions.
2066          */
2067         if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2068                 if (btrfs_free_space_trimmed(info)) {
2069                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
2070                                 info->bitmap_extents;
2071                         ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2072                 }
2073                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2074         }
2075
2076         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2077
2078         bytes_to_set = min(end - offset, bytes);
2079
2080         bitmap_set_bits(ctl, info, offset, bytes_to_set);
2081
2082         /*
2083          * We set some bytes, we have no idea what the max extent size is
2084          * anymore.
2085          */
2086         info->max_extent_size = 0;
2087
2088         return bytes_to_set;
2089
2090 }
2091
2092 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2093                       struct btrfs_free_space *info)
2094 {
2095         struct btrfs_block_group *block_group = ctl->private;
2096         struct btrfs_fs_info *fs_info = block_group->fs_info;
2097         bool forced = false;
2098
2099 #ifdef CONFIG_BTRFS_DEBUG
2100         if (btrfs_should_fragment_free_space(block_group))
2101                 forced = true;
2102 #endif
2103
2104         /* This is a way to reclaim large regions from the bitmaps. */
2105         if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2106                 return false;
2107
2108         /*
2109          * If we are below the extents threshold then we can add this as an
2110          * extent, and don't have to deal with the bitmap
2111          */
2112         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2113                 /*
2114                  * If this block group has some small extents we don't want to
2115                  * use up all of our free slots in the cache with them; we want
2116                  * to reserve those slots for larger extents.  However, if we have
2117                  * plenty of cache left then go ahead and add them; no sense in
2118                  * adding the overhead of a bitmap if we don't have to.
2119                  */
2120                 if (info->bytes <= fs_info->sectorsize * 8) {
2121                         if (ctl->free_extents * 3 <= ctl->extents_thresh)
2122                                 return false;
2123                 } else {
2124                         return false;
2125                 }
2126         }
2127
2128         /*
2129          * The original block groups from mkfs can be really small, like 8
2130          * megabytes, so don't bother with a bitmap for those entries.  However
2131          * some block groups can be smaller than what a bitmap would cover but
2132          * are still large enough that they could overflow the 32k memory limit,
2133          * so still allow those block groups to have a bitmap
2134          * entry.
2135          */
2136         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2137                 return false;
2138
2139         return true;
2140 }
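/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows the decision made by use_bitmap() above, with the CONFIG_BTRFS_DEBUG
 * fragmentation override dropped.  It assumes a 4 KiB sectorsize and 4 KiB
 * pages, so FORCE_EXTENT_THRESHOLD is 1 MiB, a "small" extent is at most
 * 32 KiB and one bitmap covers 128 MiB.  All ex_* names are invented for
 * the example.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_SECTORSIZE           4096ULL
#define EX_FORCE_EXTENT         (1024ULL * 1024)                /* SZ_1M */
#define EX_BITMAP_COVERAGE      (4096 * 8ULL * EX_SECTORSIZE)   /* 128 MiB */

static bool ex_use_bitmap(uint64_t bytes, unsigned int free_extents,
                          unsigned int extents_thresh, uint64_t bg_length)
{
        if (bytes >= EX_FORCE_EXTENT)
                return false;                   /* keep large regions as extents */
        if (free_extents < extents_thresh) {
                if (bytes <= EX_SECTORSIZE * 8) {
                        if (free_extents * 3 <= extents_thresh)
                                return false;   /* plenty of room for small extents */
                } else {
                        return false;
                }
        }
        if ((EX_BITMAP_COVERAGE >> 1) > bg_length)
                return false;                   /* block group too small for a bitmap */
        return true;
}

int main(void)
{
        /* A 2 MiB region is always kept as an extent entry. */
        assert(!ex_use_bitmap(2 << 20, 50, 100, 1ULL << 30));
        /* A 16 KiB region with the extent cache nearly full goes to a bitmap. */
        assert(ex_use_bitmap(16 << 10, 90, 100, 1ULL << 30));
        /* The same region in a tiny 8 MiB block group stays an extent. */
        assert(!ex_use_bitmap(16 << 10, 90, 100, 8 << 20));
        return 0;
}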
2141
2142 static const struct btrfs_free_space_op free_space_op = {
2143         .use_bitmap             = use_bitmap,
2144 };
2145
2146 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2147                               struct btrfs_free_space *info)
2148 {
2149         struct btrfs_free_space *bitmap_info;
2150         struct btrfs_block_group *block_group = NULL;
2151         int added = 0;
2152         u64 bytes, offset, bytes_added;
2153         enum btrfs_trim_state trim_state;
2154         int ret;
2155
2156         bytes = info->bytes;
2157         offset = info->offset;
2158         trim_state = info->trim_state;
2159
2160         if (!ctl->op->use_bitmap(ctl, info))
2161                 return 0;
2162
2163         if (ctl->op == &free_space_op)
2164                 block_group = ctl->private;
2165 again:
2166         /*
2167          * Since we link bitmaps right into the cluster we need to see if we
2168          * have a cluster here, and if so and it has our bitmap we need to add
2169          * the free space to that bitmap.
2170          */
2171         if (block_group && !list_empty(&block_group->cluster_list)) {
2172                 struct btrfs_free_cluster *cluster;
2173                 struct rb_node *node;
2174                 struct btrfs_free_space *entry;
2175
2176                 cluster = list_entry(block_group->cluster_list.next,
2177                                      struct btrfs_free_cluster,
2178                                      block_group_list);
2179                 spin_lock(&cluster->lock);
2180                 node = rb_first(&cluster->root);
2181                 if (!node) {
2182                         spin_unlock(&cluster->lock);
2183                         goto no_cluster_bitmap;
2184                 }
2185
2186                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2187                 if (!entry->bitmap) {
2188                         spin_unlock(&cluster->lock);
2189                         goto no_cluster_bitmap;
2190                 }
2191
2192                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2193                         bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2194                                                           bytes, trim_state);
2195                         bytes -= bytes_added;
2196                         offset += bytes_added;
2197                 }
2198                 spin_unlock(&cluster->lock);
2199                 if (!bytes) {
2200                         ret = 1;
2201                         goto out;
2202                 }
2203         }
2204
2205 no_cluster_bitmap:
2206         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2207                                          1, 0);
2208         if (!bitmap_info) {
2209                 ASSERT(added == 0);
2210                 goto new_bitmap;
2211         }
2212
2213         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2214                                           trim_state);
2215         bytes -= bytes_added;
2216         offset += bytes_added;
2217         added = 0;
2218
2219         if (!bytes) {
2220                 ret = 1;
2221                 goto out;
2222         } else
2223                 goto again;
2224
2225 new_bitmap:
2226         if (info && info->bitmap) {
2227                 add_new_bitmap(ctl, info, offset);
2228                 added = 1;
2229                 info = NULL;
2230                 goto again;
2231         } else {
2232                 spin_unlock(&ctl->tree_lock);
2233
2234                 /* no pre-allocated info, allocate a new one */
2235                 if (!info) {
2236                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2237                                                  GFP_NOFS);
2238                         if (!info) {
2239                                 spin_lock(&ctl->tree_lock);
2240                                 ret = -ENOMEM;
2241                                 goto out;
2242                         }
2243                 }
2244
2245                 /* allocate the bitmap */
2246                 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2247                                                  GFP_NOFS);
2248                 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2249                 spin_lock(&ctl->tree_lock);
2250                 if (!info->bitmap) {
2251                         ret = -ENOMEM;
2252                         goto out;
2253                 }
2254                 goto again;
2255         }
2256
2257 out:
2258         if (info) {
2259                 if (info->bitmap)
2260                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
2261                                         info->bitmap);
2262                 kmem_cache_free(btrfs_free_space_cachep, info);
2263         }
2264
2265         return ret;
2266 }
2267
2268 /*
2269  * Free space merging rules:
2270  *  1) Merge trimmed areas together
2271  *  2) Let untrimmed areas coalesce with trimmed areas
2272  *  3) Always pull neighboring regions from bitmaps
2273  *
2274  * The above rules are for when we merge free space based on btrfs_trim_state.
2275  * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2276  * same reason: to promote larger extent regions which makes life easier for
2277  * find_free_extent().  Rule 2 enables coalescing because the common path is
2278  * returning free space from btrfs_finish_extent_commit().  So when the free
2279  * space being added is already trimmed, it is prevented from aggregating with
2280  * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
2281  * and provide find_free_extent() with the largest extents possible hoping for
2282  * the reuse path.
2283  */
2284 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2285                           struct btrfs_free_space *info, bool update_stat)
2286 {
2287         struct btrfs_free_space *left_info = NULL;
2288         struct btrfs_free_space *right_info;
2289         bool merged = false;
2290         u64 offset = info->offset;
2291         u64 bytes = info->bytes;
2292         const bool is_trimmed = btrfs_free_space_trimmed(info);
2293
2294         /*
2295          * First we want to see if there is free space adjacent to the range we
2296          * are adding; if there is, remove that struct and add a new one to
2297          * cover the entire range.
2298          */
2299         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2300         if (right_info && rb_prev(&right_info->offset_index))
2301                 left_info = rb_entry(rb_prev(&right_info->offset_index),
2302                                      struct btrfs_free_space, offset_index);
2303         else if (!right_info)
2304                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2305
2306         /* See try_merge_free_space() comment. */
2307         if (right_info && !right_info->bitmap &&
2308             (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2309                 if (update_stat)
2310                         unlink_free_space(ctl, right_info);
2311                 else
2312                         __unlink_free_space(ctl, right_info);
2313                 info->bytes += right_info->bytes;
2314                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2315                 merged = true;
2316         }
2317
2318         /* See try_merge_free_space() comment. */
2319         if (left_info && !left_info->bitmap &&
2320             left_info->offset + left_info->bytes == offset &&
2321             (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2322                 if (update_stat)
2323                         unlink_free_space(ctl, left_info);
2324                 else
2325                         __unlink_free_space(ctl, left_info);
2326                 info->offset = left_info->offset;
2327                 info->bytes += left_info->bytes;
2328                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2329                 merged = true;
2330         }
2331
2332         return merged;
2333 }
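/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows the trim-state check that try_merge_free_space() above applies
 * before merging with a neighbour: an untrimmed entry may coalesce with
 * anything (rule 2 above), while a trimmed entry only merges with trimmed
 * neighbours so it does not silently become untrimmed.  The function name
 * is invented for the example.
 */
#include <assert.h>
#include <stdbool.h>

static bool ex_can_merge(bool info_trimmed, bool neighbour_trimmed)
{
        return !info_trimmed || neighbour_trimmed;
}

int main(void)
{
        assert(ex_can_merge(false, false));     /* untrimmed + untrimmed */
        assert(ex_can_merge(false, true));      /* untrimmed new space absorbs trimmed */
        assert(ex_can_merge(true, true));       /* trimmed + trimmed */
        assert(!ex_can_merge(true, false));     /* a trimmed entry keeps its state */
        return 0;
}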
2334
2335 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2336                                      struct btrfs_free_space *info,
2337                                      bool update_stat)
2338 {
2339         struct btrfs_free_space *bitmap;
2340         unsigned long i;
2341         unsigned long j;
2342         const u64 end = info->offset + info->bytes;
2343         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2344         u64 bytes;
2345
2346         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2347         if (!bitmap)
2348                 return false;
2349
2350         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2351         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2352         if (j == i)
2353                 return false;
2354         bytes = (j - i) * ctl->unit;
2355         info->bytes += bytes;
2356
2357         /* See try_merge_free_space() comment. */
2358         if (!btrfs_free_space_trimmed(bitmap))
2359                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2360
2361         if (update_stat)
2362                 bitmap_clear_bits(ctl, bitmap, end, bytes);
2363         else
2364                 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2365
2366         if (!bitmap->bytes)
2367                 free_bitmap(ctl, bitmap);
2368
2369         return true;
2370 }
2371
2372 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2373                                        struct btrfs_free_space *info,
2374                                        bool update_stat)
2375 {
2376         struct btrfs_free_space *bitmap;
2377         u64 bitmap_offset;
2378         unsigned long i;
2379         unsigned long j;
2380         unsigned long prev_j;
2381         u64 bytes;
2382
2383         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2384         /* If we're on a boundary, try the previous logical bitmap. */
2385         if (bitmap_offset == info->offset) {
2386                 if (info->offset == 0)
2387                         return false;
2388                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2389         }
2390
2391         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2392         if (!bitmap)
2393                 return false;
2394
2395         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2396         j = 0;
2397         prev_j = (unsigned long)-1;
2398         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2399                 if (j > i)
2400                         break;
2401                 prev_j = j;
2402         }
2403         if (prev_j == i)
2404                 return false;
2405
2406         if (prev_j == (unsigned long)-1)
2407                 bytes = (i + 1) * ctl->unit;
2408         else
2409                 bytes = (i - prev_j) * ctl->unit;
2410
2411         info->offset -= bytes;
2412         info->bytes += bytes;
2413
2414         /* See try_merge_free_space() comment. */
2415         if (!btrfs_free_space_trimmed(bitmap))
2416                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2417
2418         if (update_stat)
2419                 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2420         else
2421                 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2422
2423         if (!bitmap->bytes)
2424                 free_bitmap(ctl, bitmap);
2425
2426         return true;
2427 }
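/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows the run-length calculation in steal_from_bitmap_to_front() above:
 * given the bit just before the extent (bit i), it measures how many
 * consecutive set bits end at i, i.e. how much of the bitmap can be glued
 * onto the front of the extent.  A bool array stands in for the real bitmap
 * and all ex_* names are invented for the example.
 */
#include <assert.h>
#include <stdbool.h>

static unsigned int ex_steal_front_bits(const bool *bm, unsigned int i)
{
        unsigned int prev_clear = (unsigned int)-1;

        for (unsigned int j = 0; j <= i; j++)
                if (!bm[j])
                        prev_clear = j;

        if (prev_clear == i)
                return 0;                       /* bit i itself is clear */
        if (prev_clear == (unsigned int)-1)
                return i + 1;                   /* bits 0..i are all set */
        return i - prev_clear;                  /* bits prev_clear+1..i are set */
}

int main(void)
{
        bool bm[8] = { 1, 1, 0, 1, 1, 1, 0, 0 };

        assert(ex_steal_front_bits(bm, 5) == 3);        /* bits 3..5 */
        assert(ex_steal_front_bits(bm, 2) == 0);        /* bit 2 is clear */
        assert(ex_steal_front_bits(bm, 1) == 2);        /* bits 0..1 */
        return 0;
}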
2428
2429 /*
2430  * We prefer always to allocate from extent entries, both for clustered and
2431  * non-clustered allocation requests. So when attempting to add a new extent
2432  * entry, try to see if there's adjacent free space in bitmap entries, and if
2433  * there is, migrate that space from the bitmaps to the extent.
2434  * Like this we get better chances of satisfying space allocation requests
2435  * because we attempt to satisfy them based on a single cache entry, and never
2436  * on 2 or more entries - even if the entries represent a contiguous free space
2437  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2438  * ends).
2439  */
2440 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2441                               struct btrfs_free_space *info,
2442                               bool update_stat)
2443 {
2444         /*
2445          * Only work with disconnected entries, as we can change their offset,
2446          * and they must be extent entries.
2447          */
2448         ASSERT(!info->bitmap);
2449         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2450
2451         if (ctl->total_bitmaps > 0) {
2452                 bool stole_end;
2453                 bool stole_front = false;
2454
2455                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2456                 if (ctl->total_bitmaps > 0)
2457                         stole_front = steal_from_bitmap_to_front(ctl, info,
2458                                                                  update_stat);
2459
2460                 if (stole_end || stole_front)
2461                         try_merge_free_space(ctl, info, update_stat);
2462         }
2463 }
2464
2465 int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2466                            struct btrfs_free_space_ctl *ctl,
2467                            u64 offset, u64 bytes,
2468                            enum btrfs_trim_state trim_state)
2469 {
2470         struct btrfs_block_group *block_group = ctl->private;
2471         struct btrfs_free_space *info;
2472         int ret = 0;
2473         u64 filter_bytes = bytes;
2474
2475         ASSERT(!btrfs_is_zoned(fs_info));
2476
2477         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2478         if (!info)
2479                 return -ENOMEM;
2480
2481         info->offset = offset;
2482         info->bytes = bytes;
2483         info->trim_state = trim_state;
2484         RB_CLEAR_NODE(&info->offset_index);
2485
2486         spin_lock(&ctl->tree_lock);
2487
2488         if (try_merge_free_space(ctl, info, true))
2489                 goto link;
2490
2491         /*
2492          * If there was no extent directly to the left or right of this new
2493          * extent, then we know we're going to have to allocate a new extent,
2494          * so before we do that, see if we need to drop this into a bitmap.
2495          */
2496         ret = insert_into_bitmap(ctl, info);
2497         if (ret < 0) {
2498                 goto out;
2499         } else if (ret) {
2500                 ret = 0;
2501                 goto out;
2502         }
2503 link:
2504         /*
2505          * Only steal free space from adjacent bitmaps if we're sure we're not
2506          * going to add the new free space to existing bitmap entries - because
2507          * that would mean unnecessary work that would be reverted. Therefore
2508          * attempt to steal space from bitmaps if we're adding an extent entry.
2509          */
2510         steal_from_bitmap(ctl, info, true);
2511
2512         filter_bytes = max(filter_bytes, info->bytes);
2513
2514         ret = link_free_space(ctl, info);
2515         if (ret)
2516                 kmem_cache_free(btrfs_free_space_cachep, info);
2517 out:
2518         btrfs_discard_update_discardable(block_group);
2519         spin_unlock(&ctl->tree_lock);
2520
2521         if (ret) {
2522                 btrfs_crit(fs_info, "unable to add free space :%d", ret);
2523                 ASSERT(ret != -EEXIST);
2524         }
2525
2526         if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2527                 btrfs_discard_check_filter(block_group, filter_bytes);
2528                 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2529         }
2530
2531         return ret;
2532 }
2533
2534 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2535                                         u64 bytenr, u64 size, bool used)
2536 {
2537         struct btrfs_fs_info *fs_info = block_group->fs_info;
2538         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2539         u64 offset = bytenr - block_group->start;
2540         u64 to_free, to_unusable;
2541         const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
2542
2543         spin_lock(&ctl->tree_lock);
2544         if (!used)
2545                 to_free = size;
2546         else if (offset >= block_group->alloc_offset)
2547                 to_free = size;
2548         else if (offset + size <= block_group->alloc_offset)
2549                 to_free = 0;
2550         else
2551                 to_free = offset + size - block_group->alloc_offset;
2552         to_unusable = size - to_free;
2553
2554         ctl->free_space += to_free;
2555         /*
2556          * If the block group is read-only, we should account freed space into
2557          * bytes_readonly.
2558          */
2559         if (!block_group->ro)
2560                 block_group->zone_unusable += to_unusable;
2561         spin_unlock(&ctl->tree_lock);
2562         if (!used) {
2563                 spin_lock(&block_group->lock);
2564                 block_group->alloc_offset -= size;
2565                 spin_unlock(&block_group->lock);
2566         }
2567
2568         /* All the region is now unusable. Mark it as unused and reclaim */
2569         if (block_group->zone_unusable == block_group->length) {
2570                 btrfs_mark_bg_unused(block_group);
2571         } else if (bg_reclaim_threshold &&
2572                    block_group->zone_unusable >=
2573                    div_factor_fine(block_group->length, bg_reclaim_threshold)) {
2574                 btrfs_mark_bg_to_reclaim(block_group);
2575         }
2576
2577         return 0;
2578 }
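/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  It
 * shows how __btrfs_add_free_space_zoned() above splits a freed range into
 * 'to_free' (at or beyond the zone allocation offset, hence reusable) and
 * 'to_unusable' (behind the write pointer, only reclaimable by resetting
 * the zone).  Offsets are relative to the block group start, as in the
 * function above; the numbers in main() are arbitrary.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static uint64_t ex_to_free(uint64_t offset, uint64_t size, uint64_t alloc_offset,
                           bool used)
{
        if (!used)
                return size;                    /* never written, fully reusable */
        if (offset >= alloc_offset)
                return size;                    /* entirely beyond the write pointer */
        if (offset + size <= alloc_offset)
                return 0;                       /* entirely behind it */
        return offset + size - alloc_offset;    /* straddles the write pointer */
}

int main(void)
{
        uint64_t alloc_offset = 10ULL << 20;    /* write pointer at 10 MiB */

        /* Freeing 8..12 MiB: 2 MiB reusable, the other 2 MiB is zone_unusable. */
        assert(ex_to_free(8 << 20, 4 << 20, alloc_offset, true) == (2ULL << 20));
        /* Freeing 2..6 MiB: all of it sits behind the write pointer. */
        assert(ex_to_free(2 << 20, 4 << 20, alloc_offset, true) == 0);
        /* In both cases to_unusable is simply size - to_free. */
        return 0;
}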
2579
2580 int btrfs_add_free_space(struct btrfs_block_group *block_group,
2581                          u64 bytenr, u64 size)
2582 {
2583         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2584
2585         if (btrfs_is_zoned(block_group->fs_info))
2586                 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2587                                                     true);
2588
2589         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2590                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2591
2592         return __btrfs_add_free_space(block_group->fs_info,
2593                                       block_group->free_space_ctl,
2594                                       bytenr, size, trim_state);
2595 }
2596
2597 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2598                                 u64 bytenr, u64 size)
2599 {
2600         if (btrfs_is_zoned(block_group->fs_info))
2601                 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2602                                                     false);
2603
2604         return btrfs_add_free_space(block_group, bytenr, size);
2605 }
2606
2607 /*
2608  * This is a subtle distinction because when adding free space back in general,
2609  * we want it to be added as untrimmed for async discard. But in the case
2610  * where we add it on loading of a block group, we want to consider it trimmed.
2611  */
2612 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2613                                        u64 bytenr, u64 size)
2614 {
2615         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2616
2617         if (btrfs_is_zoned(block_group->fs_info))
2618                 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2619                                                     true);
2620
2621         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2622             btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2623                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2624
2625         return __btrfs_add_free_space(block_group->fs_info,
2626                                       block_group->free_space_ctl,
2627                                       bytenr, size, trim_state);
2628 }
2629
2630 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2631                             u64 offset, u64 bytes)
2632 {
2633         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2634         struct btrfs_free_space *info;
2635         int ret;
2636         bool re_search = false;
2637
2638         if (btrfs_is_zoned(block_group->fs_info)) {
2639                 /*
2640                  * This can happen with conventional zones when replaying log.
2641                  * Since the allocation info of tree-log nodes are not recorded
2642                  * to the extent-tree, calculate_alloc_pointer() failed to
2643                  * advance the allocation pointer after last allocated tree log
2644                  * node blocks.
2645                  *
2646                  * This function is called from
2647                  * btrfs_pin_extent_for_log_replay() when replaying the log.
2648                  * Advance the pointer not to overwrite the tree-log nodes.
2649                  */
2650                 if (block_group->start + block_group->alloc_offset <
2651                     offset + bytes) {
2652                         block_group->alloc_offset =
2653                                 offset + bytes - block_group->start;
2654                 }
2655                 return 0;
2656         }
2657
2658         spin_lock(&ctl->tree_lock);
2659
2660 again:
2661         ret = 0;
2662         if (!bytes)
2663                 goto out_lock;
2664
2665         info = tree_search_offset(ctl, offset, 0, 0);
2666         if (!info) {
2667                 /*
2668                  * oops didn't find an extent that matched the space we wanted
2669                  * to remove, look for a bitmap instead
2670                  */
2671                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2672                                           1, 0);
2673                 if (!info) {
2674                         /*
2675                          * If we found a partial bit of our free space in a
2676                          * bitmap but then couldn't find the other part this may
2677                          * be a problem, so WARN about it.
2678                          */
2679                         WARN_ON(re_search);
2680                         goto out_lock;
2681                 }
2682         }
2683
2684         re_search = false;
2685         if (!info->bitmap) {
2686                 unlink_free_space(ctl, info);
2687                 if (offset == info->offset) {
2688                         u64 to_free = min(bytes, info->bytes);
2689
2690                         info->bytes -= to_free;
2691                         info->offset += to_free;
2692                         if (info->bytes) {
2693                                 ret = link_free_space(ctl, info);
2694                                 WARN_ON(ret);
2695                         } else {
2696                                 kmem_cache_free(btrfs_free_space_cachep, info);
2697                         }
2698
2699                         offset += to_free;
2700                         bytes -= to_free;
2701                         goto again;
2702                 } else {
2703                         u64 old_end = info->bytes + info->offset;
2704
2705                         info->bytes = offset - info->offset;
2706                         ret = link_free_space(ctl, info);
2707                         WARN_ON(ret);
2708                         if (ret)
2709                                 goto out_lock;
2710
2711                         /* Not enough bytes in this entry to satisfy us */
2712                         if (old_end < offset + bytes) {
2713                                 bytes -= old_end - offset;
2714                                 offset = old_end;
2715                                 goto again;
2716                         } else if (old_end == offset + bytes) {
2717                                 /* all done */
2718                                 goto out_lock;
2719                         }
2720                         spin_unlock(&ctl->tree_lock);
2721
2722                         ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2723                                                      offset + bytes,
2724                                                      old_end - (offset + bytes),
2725                                                      info->trim_state);
2726                         WARN_ON(ret);
2727                         goto out;
2728                 }
2729         }
2730
2731         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2732         if (ret == -EAGAIN) {
2733                 re_search = true;
2734                 goto again;
2735         }
2736 out_lock:
2737         btrfs_discard_update_discardable(block_group);
2738         spin_unlock(&ctl->tree_lock);
2739 out:
2740         return ret;
2741 }
2742
2743 void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2744                            u64 bytes)
2745 {
2746         struct btrfs_fs_info *fs_info = block_group->fs_info;
2747         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2748         struct btrfs_free_space *info;
2749         struct rb_node *n;
2750         int count = 0;
2751
2752         /*
2753          * Zoned btrfs does not use the free space tree or clusters. Just print
2754          * out the free space after the allocation offset.
2755          */
2756         if (btrfs_is_zoned(fs_info)) {
2757                 btrfs_info(fs_info, "free space %llu",
2758                            block_group->length - block_group->alloc_offset);
2759                 return;
2760         }
2761
2762         spin_lock(&ctl->tree_lock);
2763         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2764                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2765                 if (info->bytes >= bytes && !block_group->ro)
2766                         count++;
2767                 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2768                            info->offset, info->bytes,
2769                        (info->bitmap) ? "yes" : "no");
2770         }
2771         spin_unlock(&ctl->tree_lock);
2772         btrfs_info(fs_info, "block group has cluster?: %s",
2773                list_empty(&block_group->cluster_list) ? "no" : "yes");
2774         btrfs_info(fs_info,
2775                    "%d blocks of free space at or bigger than bytes is", count);
2776 }
2777
2778 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2779                                struct btrfs_free_space_ctl *ctl)
2780 {
2781         struct btrfs_fs_info *fs_info = block_group->fs_info;
2782
2783         spin_lock_init(&ctl->tree_lock);
2784         ctl->unit = fs_info->sectorsize;
2785         ctl->start = block_group->start;
2786         ctl->private = block_group;
2787         ctl->op = &free_space_op;
2788         INIT_LIST_HEAD(&ctl->trimming_ranges);
2789         mutex_init(&ctl->cache_writeout_mutex);
2790
2791         /*
2792          * we only want to have 32k of ram per block group for keeping
2793          * track of free space, and if we pass 1/2 of that we want to
2794          * start converting things over to using bitmaps
2795          */
2796         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2797 }
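/*
 * Editorial note: an illustrative user-space sketch, not kernel code.  The
 * initial extents_thresh computed above is just how many free-space entries
 * fit in half of the 32 KiB per-block-group budget.  The 64-byte entry size
 * below is a hypothetical stand-in for sizeof(struct btrfs_free_space),
 * which depends on the architecture and kernel version.
 */
#include <assert.h>
#include <stddef.h>

#define EX_SZ_32K       32768
#define EX_ENTRY_SIZE   64      /* hypothetical sizeof(struct btrfs_free_space) */

int main(void)
{
        size_t extents_thresh = (EX_SZ_32K / 2) / EX_ENTRY_SIZE;

        /* With 64-byte entries the cache starts converting to bitmaps at 256. */
        assert(extents_thresh == 256);
        return 0;
}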
2798
2799 /*
2800  * for a given cluster, put all of its extents back into the free
2801  * space cache.  If the block group passed doesn't match the block group
2802  * pointed to by the cluster, someone else raced in and freed the
2803  * cluster already.  In that case, we just return without changing anything
2804  */
2805 static void __btrfs_return_cluster_to_free_space(
2806                              struct btrfs_block_group *block_group,
2807                              struct btrfs_free_cluster *cluster)
2808 {
2809         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2810         struct btrfs_free_space *entry;
2811         struct rb_node *node;
2812
2813         spin_lock(&cluster->lock);
2814         if (cluster->block_group != block_group) {
2815                 spin_unlock(&cluster->lock);
2816                 return;
2817         }
2818
2819         cluster->block_group = NULL;
2820         cluster->window_start = 0;
2821         list_del_init(&cluster->block_group_list);
2822
2823         node = rb_first(&cluster->root);
2824         while (node) {
2825                 bool bitmap;
2826
2827                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2828                 node = rb_next(&entry->offset_index);
2829                 rb_erase(&entry->offset_index, &cluster->root);
2830                 RB_CLEAR_NODE(&entry->offset_index);
2831
2832                 bitmap = (entry->bitmap != NULL);
2833                 if (!bitmap) {
2834                         /* Merging treats extents as if they were new */
2835                         if (!btrfs_free_space_trimmed(entry)) {
2836                                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
2837                                 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2838                                         entry->bytes;
2839                         }
2840
2841                         try_merge_free_space(ctl, entry, false);
2842                         steal_from_bitmap(ctl, entry, false);
2843
2844                         /* As we insert directly, update these statistics */
2845                         if (!btrfs_free_space_trimmed(entry)) {
2846                                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
2847                                 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2848                                         entry->bytes;
2849                         }
2850                 }
2851                 tree_insert_offset(&ctl->free_space_offset,
2852                                    entry->offset, &entry->offset_index, bitmap);
2853         }
2854         cluster->root = RB_ROOT;
2855         spin_unlock(&cluster->lock);
2856         btrfs_put_block_group(block_group);
2857 }
2858
2859 static void __btrfs_remove_free_space_cache_locked(
2860                                 struct btrfs_free_space_ctl *ctl)
2861 {
2862         struct btrfs_free_space *info;
2863         struct rb_node *node;
2864
2865         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2866                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2867                 if (!info->bitmap) {
2868                         unlink_free_space(ctl, info);
2869                         kmem_cache_free(btrfs_free_space_cachep, info);
2870                 } else {
2871                         free_bitmap(ctl, info);
2872                 }
2873
2874                 cond_resched_lock(&ctl->tree_lock);
2875         }
2876 }
2877
2878 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2879 {
2880         spin_lock(&ctl->tree_lock);
2881         __btrfs_remove_free_space_cache_locked(ctl);
2882         if (ctl->private)
2883                 btrfs_discard_update_discardable(ctl->private);
2884         spin_unlock(&ctl->tree_lock);
2885 }
2886
2887 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2888 {
2889         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2890         struct btrfs_free_cluster *cluster;
2891         struct list_head *head;
2892
2893         spin_lock(&ctl->tree_lock);
2894         while ((head = block_group->cluster_list.next) !=
2895                &block_group->cluster_list) {
2896                 cluster = list_entry(head, struct btrfs_free_cluster,
2897                                      block_group_list);
2898
2899                 WARN_ON(cluster->block_group != block_group);
2900                 __btrfs_return_cluster_to_free_space(block_group, cluster);
2901
2902                 cond_resched_lock(&ctl->tree_lock);
2903         }
2904         __btrfs_remove_free_space_cache_locked(ctl);
2905         btrfs_discard_update_discardable(block_group);
2906         spin_unlock(&ctl->tree_lock);
2908 }
2909
2910 /**
2911  * btrfs_is_free_space_trimmed - see if everything is trimmed
2912  * @block_group: block_group of interest
2913  *
2914  * Walk @block_group's free space rb_tree to determine if everything is trimmed.
2915  */
2916 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
2917 {
2918         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2919         struct btrfs_free_space *info;
2920         struct rb_node *node;
2921         bool ret = true;
2922
2923         spin_lock(&ctl->tree_lock);
2924         node = rb_first(&ctl->free_space_offset);
2925
2926         while (node) {
2927                 info = rb_entry(node, struct btrfs_free_space, offset_index);
2928
2929                 if (!btrfs_free_space_trimmed(info)) {
2930                         ret = false;
2931                         break;
2932                 }
2933
2934                 node = rb_next(node);
2935         }
2936
2937         spin_unlock(&ctl->tree_lock);
2938         return ret;
2939 }
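
/*
 * A minimal usage sketch, assuming a hypothetical caller in the reclaim /
 * discard path (example_needs_final_trim() is not a helper in this file):
 * the walk above lets such a caller skip a trim pass entirely when every
 * entry is already marked trimmed.
 */
static bool example_needs_final_trim(struct btrfs_block_group *block_group)
{
	/* A fully trimmed block group would make another trim pass a no-op. */
	return !btrfs_is_free_space_trimmed(block_group);
}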
2940
2941 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
2942                                u64 offset, u64 bytes, u64 empty_size,
2943                                u64 *max_extent_size)
2944 {
2945         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2946         struct btrfs_discard_ctl *discard_ctl =
2947                                         &block_group->fs_info->discard_ctl;
2948         struct btrfs_free_space *entry = NULL;
2949         u64 bytes_search = bytes + empty_size;
2950         u64 ret = 0;
2951         u64 align_gap = 0;
2952         u64 align_gap_len = 0;
2953         enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2954
2955         ASSERT(!btrfs_is_zoned(block_group->fs_info));
2956
2957         spin_lock(&ctl->tree_lock);
2958         entry = find_free_space(ctl, &offset, &bytes_search,
2959                                 block_group->full_stripe_len, max_extent_size);
2960         if (!entry)
2961                 goto out;
2962
2963         ret = offset;
2964         if (entry->bitmap) {
2965                 bitmap_clear_bits(ctl, entry, offset, bytes);
2966
2967                 if (!btrfs_free_space_trimmed(entry))
2968                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2969
2970                 if (!entry->bytes)
2971                         free_bitmap(ctl, entry);
2972         } else {
2973                 unlink_free_space(ctl, entry);
2974                 align_gap_len = offset - entry->offset;
2975                 align_gap = entry->offset;
2976                 align_gap_trim_state = entry->trim_state;
2977
2978                 if (!btrfs_free_space_trimmed(entry))
2979                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2980
2981                 entry->offset = offset + bytes;
2982                 WARN_ON(entry->bytes < bytes + align_gap_len);
2983
2984                 entry->bytes -= bytes + align_gap_len;
2985                 if (!entry->bytes)
2986                         kmem_cache_free(btrfs_free_space_cachep, entry);
2987                 else
2988                         link_free_space(ctl, entry);
2989         }
2990 out:
2991         btrfs_discard_update_discardable(block_group);
2992         spin_unlock(&ctl->tree_lock);
2993
2994         if (align_gap_len)
2995                 __btrfs_add_free_space(block_group->fs_info, ctl,
2996                                        align_gap, align_gap_len,
2997                                        align_gap_trim_state);
2998         return ret;
2999 }
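
/*
 * Hedged sketch of how an allocator-side caller might use this (the helper
 * name and the use of the block group start as the search hint are
 * assumptions, not code from this file).  The ASSERT above means this path
 * is only valid for non-zoned filesystems; a return of 0 means no suitable
 * space, with *max_extent_size reporting the largest chunk that was seen.
 */
static u64 example_alloc_from_group(struct btrfs_block_group *block_group,
				    u64 num_bytes)
{
	u64 max_extent_size = 0;

	return btrfs_find_space_for_alloc(block_group, block_group->start,
					  num_bytes, 0, &max_extent_size);
}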
3000
3001 /*
3002  * Given a cluster, put all of its extents back into the free space
3003  * cache.  If a block group is passed, this function will only free
3004  * a cluster that belongs to the passed block group.
3005  *
3006  * Otherwise, it'll get a reference on the block group pointed to by the
3007  * cluster and remove the cluster from it.
3008  */
3009 void btrfs_return_cluster_to_free_space(
3010                                struct btrfs_block_group *block_group,
3011                                struct btrfs_free_cluster *cluster)
3012 {
3013         struct btrfs_free_space_ctl *ctl;
3014
3015         /* first, get a safe pointer to the block group */
3016         spin_lock(&cluster->lock);
3017         if (!block_group) {
3018                 block_group = cluster->block_group;
3019                 if (!block_group) {
3020                         spin_unlock(&cluster->lock);
3021                         return;
3022                 }
3023         } else if (cluster->block_group != block_group) {
3024                 /* Someone else has already freed it, don't redo their work. */
3025                 spin_unlock(&cluster->lock);
3026                 return;
3027         }
3028         btrfs_get_block_group(block_group);
3029         spin_unlock(&cluster->lock);
3030
3031         ctl = block_group->free_space_ctl;
3032
3033         /* now return any extents the cluster had on it */
3034         spin_lock(&ctl->tree_lock);
3035         __btrfs_return_cluster_to_free_space(block_group, cluster);
3036         spin_unlock(&ctl->tree_lock);
3037
3038         btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3039
3040         /* finally drop our ref */
3041         btrfs_put_block_group(block_group);
3042 }
3043
3044 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
3045                                    struct btrfs_free_cluster *cluster,
3046                                    struct btrfs_free_space *entry,
3047                                    u64 bytes, u64 min_start,
3048                                    u64 *max_extent_size)
3049 {
3050         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3051         int err;
3052         u64 search_start = min_start;
3053         u64 search_bytes = bytes;
3054         u64 ret = 0;
3058
3059         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3060         if (err) {
3061                 *max_extent_size = max(get_max_extent_size(entry),
3062                                        *max_extent_size);
3063                 return 0;
3064         }
3065
3066         ret = search_start;
3067         __bitmap_clear_bits(ctl, entry, ret, bytes);
3068
3069         return ret;
3070 }
3071
3072 /*
3073  * Given a cluster, try to allocate 'bytes' from it.  Returns 0 if it
3074  * couldn't find anything suitably large, or a logical disk offset if
3075  * things worked out.
3076  */
3077 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3078                              struct btrfs_free_cluster *cluster, u64 bytes,
3079                              u64 min_start, u64 *max_extent_size)
3080 {
3081         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3082         struct btrfs_discard_ctl *discard_ctl =
3083                                         &block_group->fs_info->discard_ctl;
3084         struct btrfs_free_space *entry = NULL;
3085         struct rb_node *node;
3086         u64 ret = 0;
3087
3088         ASSERT(!btrfs_is_zoned(block_group->fs_info));
3089
3090         spin_lock(&cluster->lock);
3091         if (bytes > cluster->max_size)
3092                 goto out;
3093
3094         if (cluster->block_group != block_group)
3095                 goto out;
3096
3097         node = rb_first(&cluster->root);
3098         if (!node)
3099                 goto out;
3100
3101         entry = rb_entry(node, struct btrfs_free_space, offset_index);
3102         while (1) {
3103                 if (entry->bytes < bytes)
3104                         *max_extent_size = max(get_max_extent_size(entry),
3105                                                *max_extent_size);
3106
3107                 if (entry->bytes < bytes ||
3108                     (!entry->bitmap && entry->offset < min_start)) {
3109                         node = rb_next(&entry->offset_index);
3110                         if (!node)
3111                                 break;
3112                         entry = rb_entry(node, struct btrfs_free_space,
3113                                          offset_index);
3114                         continue;
3115                 }
3116
3117                 if (entry->bitmap) {
3118                         ret = btrfs_alloc_from_bitmap(block_group,
3119                                                       cluster, entry, bytes,
3120                                                       cluster->window_start,
3121                                                       max_extent_size);
3122                         if (ret == 0) {
3123                                 node = rb_next(&entry->offset_index);
3124                                 if (!node)
3125                                         break;
3126                                 entry = rb_entry(node, struct btrfs_free_space,
3127                                                  offset_index);
3128                                 continue;
3129                         }
3130                         cluster->window_start += bytes;
3131                 } else {
3132                         ret = entry->offset;
3133
3134                         entry->offset += bytes;
3135                         entry->bytes -= bytes;
3136                 }
3137
3138                 break;
3139         }
3140 out:
3141         spin_unlock(&cluster->lock);
3142
3143         if (!ret)
3144                 return 0;
3145
3146         spin_lock(&ctl->tree_lock);
3147
3148         if (!btrfs_free_space_trimmed(entry))
3149                 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3150
3151         ctl->free_space -= bytes;
3152         if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3153                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3154
3155         spin_lock(&cluster->lock);
3156         if (entry->bytes == 0) {
3157                 rb_erase(&entry->offset_index, &cluster->root);
3158                 ctl->free_extents--;
3159                 if (entry->bitmap) {
3160                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
3161                                         entry->bitmap);
3162                         ctl->total_bitmaps--;
3163                         recalculate_thresholds(ctl);
3164                 } else if (!btrfs_free_space_trimmed(entry)) {
3165                         ctl->discardable_extents[BTRFS_STAT_CURR]--;
3166                 }
3167                 kmem_cache_free(btrfs_free_space_cachep, entry);
3168         }
3169
3170         spin_unlock(&cluster->lock);
3171         spin_unlock(&ctl->tree_lock);
3172
3173         return ret;
3174 }
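
/*
 * Hedged sketch (hypothetical helper, not the allocator's real control
 * flow): callers typically serialize on cluster->refill_lock, try the
 * cluster first and only fall back to the regular per-block-group search
 * when the cluster cannot satisfy the request.
 */
static u64 example_alloc_prefer_cluster(struct btrfs_block_group *block_group,
					struct btrfs_free_cluster *cluster,
					u64 num_bytes, u64 *max_extent_size)
{
	u64 offset;

	spin_lock(&cluster->refill_lock);
	offset = btrfs_alloc_from_cluster(block_group, cluster, num_bytes,
					  block_group->start, max_extent_size);
	spin_unlock(&cluster->refill_lock);
	if (offset)
		return offset;

	/* No luck with the cluster, fall back to the free space tree. */
	return btrfs_find_space_for_alloc(block_group, block_group->start,
					  num_bytes, 0, max_extent_size);
}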
3175
3176 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3177                                 struct btrfs_free_space *entry,
3178                                 struct btrfs_free_cluster *cluster,
3179                                 u64 offset, u64 bytes,
3180                                 u64 cont1_bytes, u64 min_bytes)
3181 {
3182         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3183         unsigned long next_zero;
3184         unsigned long i;
3185         unsigned long want_bits;
3186         unsigned long min_bits;
3187         unsigned long found_bits;
3188         unsigned long max_bits = 0;
3189         unsigned long start = 0;
3190         unsigned long total_found = 0;
3191         int ret;
3192
3193         i = offset_to_bit(entry->offset, ctl->unit,
3194                           max_t(u64, offset, entry->offset));
3195         want_bits = bytes_to_bits(bytes, ctl->unit);
3196         min_bits = bytes_to_bits(min_bytes, ctl->unit);
3197
3198         /*
3199          * Don't bother looking for a cluster in this bitmap if it's heavily
3200          * fragmented.
3201          */
3202         if (entry->max_extent_size &&
3203             entry->max_extent_size < cont1_bytes)
3204                 return -ENOSPC;
3205 again:
3206         found_bits = 0;
3207         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3208                 next_zero = find_next_zero_bit(entry->bitmap,
3209                                                BITS_PER_BITMAP, i);
3210                 if (next_zero - i >= min_bits) {
3211                         found_bits = next_zero - i;
3212                         if (found_bits > max_bits)
3213                                 max_bits = found_bits;
3214                         break;
3215                 }
3216                 if (next_zero - i > max_bits)
3217                         max_bits = next_zero - i;
3218                 i = next_zero;
3219         }
3220
3221         if (!found_bits) {
3222                 entry->max_extent_size = (u64)max_bits * ctl->unit;
3223                 return -ENOSPC;
3224         }
3225
3226         if (!total_found) {
3227                 start = i;
3228                 cluster->max_size = 0;
3229         }
3230
3231         total_found += found_bits;
3232
3233         if (cluster->max_size < found_bits * ctl->unit)
3234                 cluster->max_size = found_bits * ctl->unit;
3235
3236         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3237                 i = next_zero + 1;
3238                 goto again;
3239         }
3240
3241         cluster->window_start = start * ctl->unit + entry->offset;
3242         rb_erase(&entry->offset_index, &ctl->free_space_offset);
3243         ret = tree_insert_offset(&cluster->root, entry->offset,
3244                                  &entry->offset_index, 1);
3245         ASSERT(!ret); /* -EEXIST; Logic error */
3246
3247         trace_btrfs_setup_cluster(block_group, cluster,
3248                                   total_found * ctl->unit, 1);
3249         return 0;
3250 }
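
/*
 * Worked example for the bit math above, assuming the common case of a 4KiB
 * sectorsize with 4KiB pages (so ctl->unit == 4096 and BITS_PER_BITMAP ==
 * 32768): a single bitmap entry covers
 *
 *	BITS_PER_BITMAP * ctl->unit == 32768 * 4096 == 128MiB
 *
 * of the block group, and a 1MiB cluster request needs
 * want_bits == bytes_to_bits(SZ_1M, 4096) == 256 set bits in total, with the
 * contiguity requirement coming from cont1_bytes.
 */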
3251
3252 /*
3253  * This searches the block group for just extents to fill the cluster with.
3254  * Try to find a cluster with at least bytes total bytes, at least one
3255  * extent of cont1_bytes, and other clusters of at least min_bytes.
3256  */
3257 static noinline int
3258 setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3259                         struct btrfs_free_cluster *cluster,
3260                         struct list_head *bitmaps, u64 offset, u64 bytes,
3261                         u64 cont1_bytes, u64 min_bytes)
3262 {
3263         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3264         struct btrfs_free_space *first = NULL;
3265         struct btrfs_free_space *entry = NULL;
3266         struct btrfs_free_space *last;
3267         struct rb_node *node;
3268         u64 window_free;
3269         u64 max_extent;
3270         u64 total_size = 0;
3271
3272         entry = tree_search_offset(ctl, offset, 0, 1);
3273         if (!entry)
3274                 return -ENOSPC;
3275
3276         /*
3277          * We don't want bitmaps, so just move along until we find a normal
3278          * extent entry.
3279          */
3280         while (entry->bitmap || entry->bytes < min_bytes) {
3281                 if (entry->bitmap && list_empty(&entry->list))
3282                         list_add_tail(&entry->list, bitmaps);
3283                 node = rb_next(&entry->offset_index);
3284                 if (!node)
3285                         return -ENOSPC;
3286                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3287         }
3288
3289         window_free = entry->bytes;
3290         max_extent = entry->bytes;
3291         first = entry;
3292         last = entry;
3293
3294         for (node = rb_next(&entry->offset_index); node;
3295              node = rb_next(&entry->offset_index)) {
3296                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3297
3298                 if (entry->bitmap) {
3299                         if (list_empty(&entry->list))
3300                                 list_add_tail(&entry->list, bitmaps);
3301                         continue;
3302                 }
3303
3304                 if (entry->bytes < min_bytes)
3305                         continue;
3306
3307                 last = entry;
3308                 window_free += entry->bytes;
3309                 if (entry->bytes > max_extent)
3310                         max_extent = entry->bytes;
3311         }
3312
3313         if (window_free < bytes || max_extent < cont1_bytes)
3314                 return -ENOSPC;
3315
3316         cluster->window_start = first->offset;
3317
3318         node = &first->offset_index;
3319
3320         /*
3321          * now we've found our entries, pull them out of the free space
3322          * cache and put them into the cluster rbtree
3323          */
3324         do {
3325                 int ret;
3326
3327                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3328                 node = rb_next(&entry->offset_index);
3329                 if (entry->bitmap || entry->bytes < min_bytes)
3330                         continue;
3331
3332                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3333                 ret = tree_insert_offset(&cluster->root, entry->offset,
3334                                          &entry->offset_index, 0);
3335                 total_size += entry->bytes;
3336                 ASSERT(!ret); /* -EEXIST; Logic error */
3337         } while (node && entry != last);
3338
3339         cluster->max_size = max_extent;
3340         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3341         return 0;
3342 }
3343
3344 /*
3345  * This specifically looks for bitmaps that may work in the cluster, we assume
3346  * that we have already failed to find extents that will work.
3347  */
3348 static noinline int
3349 setup_cluster_bitmap(struct btrfs_block_group *block_group,
3350                      struct btrfs_free_cluster *cluster,
3351                      struct list_head *bitmaps, u64 offset, u64 bytes,
3352                      u64 cont1_bytes, u64 min_bytes)
3353 {
3354         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3355         struct btrfs_free_space *entry = NULL;
3356         int ret = -ENOSPC;
3357         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3358
3359         if (ctl->total_bitmaps == 0)
3360                 return -ENOSPC;
3361
3362         /*
3363          * The bitmap that covers offset won't be in the list unless offset
3364          * is just its start offset.
3365          */
3366         if (!list_empty(bitmaps))
3367                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3368
3369         if (!entry || entry->offset != bitmap_offset) {
3370                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3371                 if (entry && list_empty(&entry->list))
3372                         list_add(&entry->list, bitmaps);
3373         }
3374
3375         list_for_each_entry(entry, bitmaps, list) {
3376                 if (entry->bytes < bytes)
3377                         continue;
3378                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3379                                            bytes, cont1_bytes, min_bytes);
3380                 if (!ret)
3381                         return 0;
3382         }
3383
3384         /*
3385          * The bitmaps list has all the bitmaps that record free space
3386          * starting after offset, so no more search is required.
3387          */
3388         return -ENOSPC;
3389 }
3390
3391 /*
3392  * Here we try to find a cluster of blocks in a block group.  The goal
3393  * is to find at least bytes + empty_size.
3394  * We might not find them all in one contiguous area.
3395  *
3396  * Returns zero and sets up the cluster if things worked out, otherwise
3397  * it returns -ENOSPC.
3398  */
3399 int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3400                              struct btrfs_free_cluster *cluster,
3401                              u64 offset, u64 bytes, u64 empty_size)
3402 {
3403         struct btrfs_fs_info *fs_info = block_group->fs_info;
3404         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3405         struct btrfs_free_space *entry, *tmp;
3406         LIST_HEAD(bitmaps);
3407         u64 min_bytes;
3408         u64 cont1_bytes;
3409         int ret;
3410
3411         /*
3412          * Choose the minimum extent size we'll require for this
3413          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3414          * For metadata, allow allocations with smaller extents.  For
3415          * data, keep it dense.
3416          */
3417         if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3418                 cont1_bytes = min_bytes = bytes + empty_size;
3419         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3420                 cont1_bytes = bytes;
3421                 min_bytes = fs_info->sectorsize;
3422         } else {
3423                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3424                 min_bytes = fs_info->sectorsize;
3425         }
3426
3427         spin_lock(&ctl->tree_lock);
3428
3429         /*
3430          * If we know we don't have enough space to make a cluster, don't even
3431          * bother doing all the work to try and find one.
3432          */
3433         if (ctl->free_space < bytes) {
3434                 spin_unlock(&ctl->tree_lock);
3435                 return -ENOSPC;
3436         }
3437
3438         spin_lock(&cluster->lock);
3439
3440         /* someone already found a cluster, hooray */
3441         if (cluster->block_group) {
3442                 ret = 0;
3443                 goto out;
3444         }
3445
3446         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3447                                  min_bytes);
3448
3449         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3450                                       bytes + empty_size,
3451                                       cont1_bytes, min_bytes);
3452         if (ret)
3453                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3454                                            offset, bytes + empty_size,
3455                                            cont1_bytes, min_bytes);
3456
3457         /* Clear our temporary list */
3458         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3459                 list_del_init(&entry->list);
3460
3461         if (!ret) {
3462                 btrfs_get_block_group(block_group);
3463                 list_add_tail(&cluster->block_group_list,
3464                               &block_group->cluster_list);
3465                 cluster->block_group = block_group;
3466         } else {
3467                 trace_btrfs_failed_cluster_setup(block_group);
3468         }
3469 out:
3470         spin_unlock(&cluster->lock);
3471         spin_unlock(&ctl->tree_lock);
3472
3473         return ret;
3474 }
3475
3476 /*
3477  * simple code to zero out a cluster
3478  */
3479 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3480 {
3481         spin_lock_init(&cluster->lock);
3482         spin_lock_init(&cluster->refill_lock);
3483         cluster->root = RB_ROOT;
3484         cluster->max_size = 0;
3485         cluster->fragmented = false;
3486         INIT_LIST_HEAD(&cluster->block_group_list);
3487         cluster->block_group = NULL;
3488 }
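
/*
 * Hedged lifecycle sketch (hypothetical helper; real callers use the
 * per-fs_info clusters and hold cluster->refill_lock around refills): set up
 * a cluster worth at least @bytes, carve one allocation out of it and hand
 * whatever is left back to the free space cache.
 */
static u64 example_cluster_round_trip(struct btrfs_block_group *block_group,
				      u64 bytes)
{
	struct btrfs_free_cluster cluster;
	u64 max_extent_size = 0;
	u64 offset = 0;

	btrfs_init_free_cluster(&cluster);

	if (btrfs_find_space_cluster(block_group, &cluster,
				     block_group->start, bytes, 0) == 0) {
		offset = btrfs_alloc_from_cluster(block_group, &cluster, bytes,
						  block_group->start,
						  &max_extent_size);
		/* Return the unused part of the window to the block group. */
		btrfs_return_cluster_to_free_space(block_group, &cluster);
	}

	return offset;
}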
3489
3490 static int do_trimming(struct btrfs_block_group *block_group,
3491                        u64 *total_trimmed, u64 start, u64 bytes,
3492                        u64 reserved_start, u64 reserved_bytes,
3493                        enum btrfs_trim_state reserved_trim_state,
3494                        struct btrfs_trim_range *trim_entry)
3495 {
3496         struct btrfs_space_info *space_info = block_group->space_info;
3497         struct btrfs_fs_info *fs_info = block_group->fs_info;
3498         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3499         int ret;
3500         int update = 0;
3501         const u64 end = start + bytes;
3502         const u64 reserved_end = reserved_start + reserved_bytes;
3503         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3504         u64 trimmed = 0;
3505
3506         spin_lock(&space_info->lock);
3507         spin_lock(&block_group->lock);
3508         if (!block_group->ro) {
3509                 block_group->reserved += reserved_bytes;
3510                 space_info->bytes_reserved += reserved_bytes;
3511                 update = 1;
3512         }
3513         spin_unlock(&block_group->lock);
3514         spin_unlock(&space_info->lock);
3515
3516         ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3517         if (!ret) {
3518                 *total_trimmed += trimmed;
3519                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3520         }
3521
3522         mutex_lock(&ctl->cache_writeout_mutex);
3523         if (reserved_start < start)
3524                 __btrfs_add_free_space(fs_info, ctl, reserved_start,
3525                                        start - reserved_start,
3526                                        reserved_trim_state);
3527         if (end < reserved_end)
3528                 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3529                                        reserved_trim_state);
3530         __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3531         list_del(&trim_entry->list);
3532         mutex_unlock(&ctl->cache_writeout_mutex);
3533
3534         if (update) {
3535                 spin_lock(&space_info->lock);
3536                 spin_lock(&block_group->lock);
3537                 if (block_group->ro)
3538                         space_info->bytes_readonly += reserved_bytes;
3539                 block_group->reserved -= reserved_bytes;
3540                 space_info->bytes_reserved -= reserved_bytes;
3541                 spin_unlock(&block_group->lock);
3542                 spin_unlock(&space_info->lock);
3543         }
3544
3545         return ret;
3546 }
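
/*
 * Worked example for the re-add logic above (numbers are hypothetical): with
 * a reserved region of 1MiB at offset 0 and a trimmed sub-range of
 * [256KiB, 768KiB), the code re-adds [0, 256KiB) and [768KiB, 1MiB) with the
 * reservation's original trim state, and re-adds [256KiB, 768KiB) as trimmed
 * (or untrimmed if btrfs_discard_extent() failed).
 */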
3547
3548 /*
3549  * If @async is set, then we will trim 1 region and return.
3550  */
3551 static int trim_no_bitmap(struct btrfs_block_group *block_group,
3552                           u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3553                           bool async)
3554 {
3555         struct btrfs_discard_ctl *discard_ctl =
3556                                         &block_group->fs_info->discard_ctl;
3557         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3558         struct btrfs_free_space *entry;
3559         struct rb_node *node;
3560         int ret = 0;
3561         u64 extent_start;
3562         u64 extent_bytes;
3563         enum btrfs_trim_state extent_trim_state;
3564         u64 bytes;
3565         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3566
3567         while (start < end) {
3568                 struct btrfs_trim_range trim_entry;
3569
3570                 mutex_lock(&ctl->cache_writeout_mutex);
3571                 spin_lock(&ctl->tree_lock);
3572
3573                 if (ctl->free_space < minlen)
3574                         goto out_unlock;
3575
3576                 entry = tree_search_offset(ctl, start, 0, 1);
3577                 if (!entry)
3578                         goto out_unlock;
3579
3580                 /* Skip bitmaps and, if async, already-trimmed entries. */
3581                 while (entry->bitmap ||
3582                        (async && btrfs_free_space_trimmed(entry))) {
3583                         node = rb_next(&entry->offset_index);
3584                         if (!node)
3585                                 goto out_unlock;
3586                         entry = rb_entry(node, struct btrfs_free_space,
3587                                          offset_index);
3588                 }
3589
3590                 if (entry->offset >= end)
3591                         goto out_unlock;
3592
3593                 extent_start = entry->offset;
3594                 extent_bytes = entry->bytes;
3595                 extent_trim_state = entry->trim_state;
3596                 if (async) {
3597                         start = entry->offset;
3598                         bytes = entry->bytes;
3599                         if (bytes < minlen) {
3600                                 spin_unlock(&ctl->tree_lock);
3601                                 mutex_unlock(&ctl->cache_writeout_mutex);
3602                                 goto next;
3603                         }
3604                         unlink_free_space(ctl, entry);
3605                         /*
3606                          * Let bytes = max_discard_size + X.
3607                          * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3608                          * X when we come back around.  So trim it now.
3609                          */
3610                         if (max_discard_size &&
3611                             bytes >= (max_discard_size +
3612                                       BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3613                                 bytes = max_discard_size;
3614                                 extent_bytes = max_discard_size;
3615                                 entry->offset += max_discard_size;
3616                                 entry->bytes -= max_discard_size;
3617                                 link_free_space(ctl, entry);
3618                         } else {
3619                                 kmem_cache_free(btrfs_free_space_cachep, entry);
3620                         }
3621                 } else {
3622                         start = max(start, extent_start);
3623                         bytes = min(extent_start + extent_bytes, end) - start;
3624                         if (bytes < minlen) {
3625                                 spin_unlock(&ctl->tree_lock);
3626                                 mutex_unlock(&ctl->cache_writeout_mutex);
3627                                 goto next;
3628                         }
3629
3630                         unlink_free_space(ctl, entry);
3631                         kmem_cache_free(btrfs_free_space_cachep, entry);
3632                 }
3633
3634                 spin_unlock(&ctl->tree_lock);
3635                 trim_entry.start = extent_start;
3636                 trim_entry.bytes = extent_bytes;
3637                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3638                 mutex_unlock(&ctl->cache_writeout_mutex);
3639
3640                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3641                                   extent_start, extent_bytes, extent_trim_state,
3642                                   &trim_entry);
3643                 if (ret) {
3644                         block_group->discard_cursor = start + bytes;
3645                         break;
3646                 }
3647 next:
3648                 start += bytes;
3649                 block_group->discard_cursor = start;
3650                 if (async && *total_trimmed)
3651                         break;
3652
3653                 if (fatal_signal_pending(current)) {
3654                         ret = -ERESTARTSYS;
3655                         break;
3656                 }
3657
3658                 cond_resched();
3659         }
3660
3661         return ret;
3662
3663 out_unlock:
3664         block_group->discard_cursor = btrfs_block_group_end(block_group);
3665         spin_unlock(&ctl->tree_lock);
3666         mutex_unlock(&ctl->cache_writeout_mutex);
3667
3668         return ret;
3669 }
3670
3671 /*
3672  * If we break out of trimming a bitmap prematurely, we should reset the
3673  * trimming bit.  In a rather contrived case, it's possible to race here so
3674  * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3675  *
3676  * start = start of bitmap
3677  * end = near end of bitmap
3678  *
3679  * Thread 1:                    Thread 2:
3680  * trim_bitmaps(start)
3681  *                              trim_bitmaps(end)
3682  *                              end_trimming_bitmap()
3683  * reset_trimming_bitmap()
3684  */
3685 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3686 {
3687         struct btrfs_free_space *entry;
3688
3689         spin_lock(&ctl->tree_lock);
3690         entry = tree_search_offset(ctl, offset, 1, 0);
3691         if (entry) {
3692                 if (btrfs_free_space_trimmed(entry)) {
3693                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
3694                                 entry->bitmap_extents;
3695                         ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3696                 }
3697                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3698         }
3699
3700         spin_unlock(&ctl->tree_lock);
3701 }
3702
3703 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3704                                 struct btrfs_free_space *entry)
3705 {
3706         if (btrfs_free_space_trimming_bitmap(entry)) {
3707                 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3708                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
3709                         entry->bitmap_extents;
3710                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3711         }
3712 }
3713
3714 /*
3715  * If @async is set, then we will trim 1 region and return.
3716  */
3717 static int trim_bitmaps(struct btrfs_block_group *block_group,
3718                         u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3719                         u64 maxlen, bool async)
3720 {
3721         struct btrfs_discard_ctl *discard_ctl =
3722                                         &block_group->fs_info->discard_ctl;
3723         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3724         struct btrfs_free_space *entry;
3725         int ret = 0;
3726         int ret2;
3727         u64 bytes;
3728         u64 offset = offset_to_bitmap(ctl, start);
3729         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3730
3731         while (offset < end) {
3732                 bool next_bitmap = false;
3733                 struct btrfs_trim_range trim_entry;
3734
3735                 mutex_lock(&ctl->cache_writeout_mutex);
3736                 spin_lock(&ctl->tree_lock);
3737
3738                 if (ctl->free_space < minlen) {
3739                         block_group->discard_cursor =
3740                                 btrfs_block_group_end(block_group);
3741                         spin_unlock(&ctl->tree_lock);
3742                         mutex_unlock(&ctl->cache_writeout_mutex);
3743                         break;
3744                 }
3745
3746                 entry = tree_search_offset(ctl, offset, 1, 0);
3747                 /*
3748                  * Bitmaps are marked trimmed lossily now to prevent constant
3749                  * discarding of the same bitmap (the reason why we are bound
3750                  * by the filters).  So, retrim the block group bitmaps when we
3751                  * are preparing to punt to the unused_bgs list.  This uses
3752                  * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3753                  * which is the only discard index which sets minlen to 0.
3754                  */
3755                 if (!entry || (async && minlen && start == offset &&
3756                                btrfs_free_space_trimmed(entry))) {
3757                         spin_unlock(&ctl->tree_lock);
3758                         mutex_unlock(&ctl->cache_writeout_mutex);
3759                         next_bitmap = true;
3760                         goto next;
3761                 }
3762
3763                 /*
3764                  * Async discard bitmap trimming begins by setting the start
3765                  * to be key.objectid and the offset_to_bitmap() aligns to the
3766                  * start of the bitmap.  This lets us know we are fully
3767                  * scanning the bitmap rather than only some portion of it.
3768                  */
3769                 if (start == offset)
3770                         entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3771
3772                 bytes = minlen;
3773                 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3774                 if (ret2 || start >= end) {
3775                         /*
3776                          * We lossily consider a bitmap trimmed if we only skip
3777                          * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3778                          */
3779                         if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3780                                 end_trimming_bitmap(ctl, entry);
3781                         else
3782                                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3783                         spin_unlock(&ctl->tree_lock);
3784                         mutex_unlock(&ctl->cache_writeout_mutex);
3785                         next_bitmap = true;
3786                         goto next;
3787                 }
3788
3789                 /*
3790                  * We already trimmed a region, but are using the locking above
3791                  * to reset the trim_state.
3792                  */
3793                 if (async && *total_trimmed) {
3794                         spin_unlock(&ctl->tree_lock);
3795                         mutex_unlock(&ctl->cache_writeout_mutex);
3796                         goto out;
3797                 }
3798
3799                 bytes = min(bytes, end - start);
3800                 if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3801                         spin_unlock(&ctl->tree_lock);
3802                         mutex_unlock(&ctl->cache_writeout_mutex);
3803                         goto next;
3804                 }
3805
3806                 /*
3807                  * Let bytes = max_discard_size + X.
3808                  * If X < @minlen, we won't trim X when we come back around.
3809                  * So trim it now.  We differ here from trimming extents as we
3810                  * don't keep individual state per bit.
3811                  */
3812                 if (async &&
3813                     max_discard_size &&
3814                     bytes > (max_discard_size + minlen))
3815                         bytes = max_discard_size;
3816
3817                 bitmap_clear_bits(ctl, entry, start, bytes);
3818                 if (entry->bytes == 0)
3819                         free_bitmap(ctl, entry);
3820
3821                 spin_unlock(&ctl->tree_lock);
3822                 trim_entry.start = start;
3823                 trim_entry.bytes = bytes;
3824                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3825                 mutex_unlock(&ctl->cache_writeout_mutex);
3826
3827                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3828                                   start, bytes, BTRFS_TRIM_STATE_UNTRIMMED, &trim_entry);
3829                 if (ret) {
3830                         reset_trimming_bitmap(ctl, offset);
3831                         block_group->discard_cursor =
3832                                 btrfs_block_group_end(block_group);
3833                         break;
3834                 }
3835 next:
3836                 if (next_bitmap) {
3837                         offset += BITS_PER_BITMAP * ctl->unit;
3838                         start = offset;
3839                 } else {
3840                         start += bytes;
3841                 }
3842                 block_group->discard_cursor = start;
3843
3844                 if (fatal_signal_pending(current)) {
3845                         if (start != offset)
3846                                 reset_trimming_bitmap(ctl, offset);
3847                         ret = -ERESTARTSYS;
3848                         break;
3849                 }
3850
3851                 cond_resched();
3852         }
3853
3854         if (offset >= end)
3855                 block_group->discard_cursor = end;
3856
3857 out:
3858         return ret;
3859 }
3860
3861 int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3862                            u64 *trimmed, u64 start, u64 end, u64 minlen)
3863 {
3864         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3865         int ret;
3866         u64 rem = 0;
3867
3868         ASSERT(!btrfs_is_zoned(block_group->fs_info));
3869
3870         *trimmed = 0;
3871
3872         spin_lock(&block_group->lock);
3873         if (block_group->removed) {
3874                 spin_unlock(&block_group->lock);
3875                 return 0;
3876         }
3877         btrfs_freeze_block_group(block_group);
3878         spin_unlock(&block_group->lock);
3879
3880         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
3881         if (ret)
3882                 goto out;
3883
3884         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
3885         div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3886         /* If we ended in the middle of a bitmap, reset the trimming flag */
3887         if (rem)
3888                 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
3889 out:
3890         btrfs_unfreeze_block_group(block_group);
3891         return ret;
3892 }
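
/*
 * Minimal usage sketch in the spirit of the fstrim path (the helper is
 * hypothetical; the real FITRIM handler lives elsewhere and also trims
 * device ranges): trim everything a block group tracks, honouring @minlen.
 */
static int example_trim_whole_group(struct btrfs_block_group *block_group,
				    u64 minlen, u64 *trimmed)
{
	return btrfs_trim_block_group(block_group, trimmed, block_group->start,
				      btrfs_block_group_end(block_group),
				      minlen);
}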
3893
3894 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
3895                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
3896                                    bool async)
3897 {
3898         int ret;
3899
3900         *trimmed = 0;
3901
3902         spin_lock(&block_group->lock);
3903         if (block_group->removed) {
3904                 spin_unlock(&block_group->lock);
3905                 return 0;
3906         }
3907         btrfs_freeze_block_group(block_group);
3908         spin_unlock(&block_group->lock);
3909
3910         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
3911         btrfs_unfreeze_block_group(block_group);
3912
3913         return ret;
3914 }
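
/*
 * Hedged sketch of one async step (hypothetical helper, loosely modeled on
 * the async discard worker): with @async set the call above trims at most
 * one untrimmed extent and advances block_group->discard_cursor, so a worker
 * can make incremental progress between requeues.
 */
static int example_async_trim_one_extent(struct btrfs_block_group *block_group,
					 u64 minlen, u64 *trimmed)
{
	u64 start = block_group->discard_cursor;
	u64 end = btrfs_block_group_end(block_group);

	if (start >= end)
		return 0;

	return btrfs_trim_block_group_extents(block_group, trimmed, start, end,
					      minlen, true);
}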
3915
3916 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
3917                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
3918                                    u64 maxlen, bool async)
3919 {
3920         int ret;
3921
3922         *trimmed = 0;
3923
3924         spin_lock(&block_group->lock);
3925         if (block_group->removed) {
3926                 spin_unlock(&block_group->lock);
3927                 return 0;
3928         }
3929         btrfs_freeze_block_group(block_group);
3930         spin_unlock(&block_group->lock);
3931
3932         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
3933                            async);
3934
3935         btrfs_unfreeze_block_group(block_group);
3936
3937         return ret;
3938 }
3939
3940 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
3941 {
3942         return btrfs_super_cache_generation(fs_info->super_copy);
3943 }
3944
3945 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
3946                                        struct btrfs_trans_handle *trans)
3947 {
3948         struct btrfs_block_group *block_group;
3949         struct rb_node *node;
3950         int ret = 0;
3951
3952         btrfs_info(fs_info, "cleaning free space cache v1");
3953
3954         node = rb_first(&fs_info->block_group_cache_tree);
3955         while (node) {
3956                 block_group = rb_entry(node, struct btrfs_block_group, cache_node);
3957                 ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
3958                 if (ret)
3959                         goto out;
3960                 node = rb_next(node);
3961         }
3962 out:
3963         return ret;
3964 }
3965
3966 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
3967 {
3968         struct btrfs_trans_handle *trans;
3969         int ret;
3970
3971         /*
3972          * update_super_roots will appropriately set or unset
3973          * super_copy->cache_generation based on SPACE_CACHE and
3974          * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
3975          * transaction commit whether we are enabling space cache v1 and don't
3976          * have any other work to do, or are disabling it and removing free
3977          * space inodes.
3978          */
3979         trans = btrfs_start_transaction(fs_info->tree_root, 0);
3980         if (IS_ERR(trans))
3981                 return PTR_ERR(trans);
3982
3983         if (!active) {
3984                 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
3985                 ret = cleanup_free_space_cache_v1(fs_info, trans);
3986                 if (ret) {
3987                         btrfs_abort_transaction(trans, ret);
3988                         btrfs_end_transaction(trans);
3989                         goto out;
3990                 }
3991         }
3992
3993         ret = btrfs_commit_transaction(trans);
3994 out:
3995         clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
3996
3997         return ret;
3998 }
3999
4000 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4001 /*
4002  * Use this if you need to make a bitmap or extent entry specifically, it
4003  * doesn't do any of the merging that add_free_space does, this acts a lot like
4004  * how the free space cache loading stuff works, so you can get really weird
4005  * configurations.
4006  */
4007 int test_add_free_space_entry(struct btrfs_block_group *cache,
4008                               u64 offset, u64 bytes, bool bitmap)
4009 {
4010         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4011         struct btrfs_free_space *info = NULL, *bitmap_info;
4012         void *map = NULL;
4013         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4014         u64 bytes_added;
4015         int ret;
4016
4017 again:
4018         if (!info) {
4019                 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4020                 if (!info)
4021                         return -ENOMEM;
4022         }
4023
4024         if (!bitmap) {
4025                 spin_lock(&ctl->tree_lock);
4026                 info->offset = offset;
4027                 info->bytes = bytes;
4028                 info->max_extent_size = 0;
4029                 ret = link_free_space(ctl, info);
4030                 spin_unlock(&ctl->tree_lock);
4031                 if (ret)
4032                         kmem_cache_free(btrfs_free_space_cachep, info);
4033                 return ret;
4034         }
4035
4036         if (!map) {
4037                 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4038                 if (!map) {
4039                         kmem_cache_free(btrfs_free_space_cachep, info);
4040                         return -ENOMEM;
4041                 }
4042         }
4043
4044         spin_lock(&ctl->tree_lock);
4045         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4046                                          1, 0);
4047         if (!bitmap_info) {
4048                 info->bitmap = map;
4049                 map = NULL;
4050                 add_new_bitmap(ctl, info, offset);
4051                 bitmap_info = info;
4052                 info = NULL;
4053         }
4054
4055         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4056                                           trim_state);
4057
4058         bytes -= bytes_added;
4059         offset += bytes_added;
4060         spin_unlock(&ctl->tree_lock);
4061
4062         if (bytes)
4063                 goto again;
4064
4065         if (info)
4066                 kmem_cache_free(btrfs_free_space_cachep, info);
4067         if (map)
4068                 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4069         return 0;
4070 }
4071
4072 /*
4073  * Checks to see if the given range is in the free space cache.  This is really
4074  * just used to check the absence of space, so if there is free space in the
4075  * range at all we will return 1.
4076  */
4077 int test_check_exists(struct btrfs_block_group *cache,
4078                       u64 offset, u64 bytes)
4079 {
4080         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4081         struct btrfs_free_space *info;
4082         int ret = 0;
4083
4084         spin_lock(&ctl->tree_lock);
4085         info = tree_search_offset(ctl, offset, 0, 0);
4086         if (!info) {
4087                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4088                                           1, 0);
4089                 if (!info)
4090                         goto out;
4091         }
4092
4093 have_info:
4094         if (info->bitmap) {
4095                 u64 bit_off, bit_bytes;
4096                 struct rb_node *n;
4097                 struct btrfs_free_space *tmp;
4098
4099                 bit_off = offset;
4100                 bit_bytes = ctl->unit;
4101                 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4102                 if (!ret) {
4103                         if (bit_off == offset) {
4104                                 ret = 1;
4105                                 goto out;
4106                         } else if (bit_off > offset &&
4107                                    offset + bytes > bit_off) {
4108                                 ret = 1;
4109                                 goto out;
4110                         }
4111                 }
4112
4113                 n = rb_prev(&info->offset_index);
4114                 while (n) {
4115                         tmp = rb_entry(n, struct btrfs_free_space,
4116                                        offset_index);
4117                         if (tmp->offset + tmp->bytes < offset)
4118                                 break;
4119                         if (offset + bytes < tmp->offset) {
4120                                 n = rb_prev(&tmp->offset_index);
4121                                 continue;
4122                         }
4123                         info = tmp;
4124                         goto have_info;
4125                 }
4126
4127                 n = rb_next(&info->offset_index);
4128                 while (n) {
4129                         tmp = rb_entry(n, struct btrfs_free_space,
4130                                        offset_index);
4131                         if (offset + bytes < tmp->offset)
4132                                 break;
4133                         if (tmp->offset + tmp->bytes < offset) {
4134                                 n = rb_next(&tmp->offset_index);
4135                                 continue;
4136                         }
4137                         info = tmp;
4138                         goto have_info;
4139                 }
4140
4141                 ret = 0;
4142                 goto out;
4143         }
4144
4145         if (info->offset == offset) {
4146                 ret = 1;
4147                 goto out;
4148         }
4149
4150         if (offset > info->offset && offset < info->offset + info->bytes)
4151                 ret = 1;
4152 out:
4153         spin_unlock(&ctl->tree_lock);
4154         return ret;
4155 }
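
/*
 * Hedged sketch in the style of the free space self tests (hypothetical
 * helper; assumes @cache is a dummy block group starting at offset 0, as the
 * test harness builds): seed a 1MiB extent entry and verify the range is
 * visible through test_check_exists(), which returns 1 when any free space
 * overlaps the queried range.
 */
static int example_seed_and_check(struct btrfs_block_group *cache)
{
	int ret;

	/* Add a plain extent entry, no merging with neighbours. */
	ret = test_add_free_space_entry(cache, 0, SZ_1M, false);
	if (ret)
		return ret;

	if (!test_check_exists(cache, 0, SZ_1M))
		return -EINVAL;

	return 0;
}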
4156 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */