1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2008 Red Hat.  All rights reserved.
4  */
5
6 #include <linux/pagemap.h>
7 #include <linux/sched.h>
8 #include <linux/sched/signal.h>
9 #include <linux/slab.h>
10 #include <linux/math64.h>
11 #include <linux/ratelimit.h>
12 #include <linux/error-injection.h>
13 #include <linux/sched/mm.h>
14 #include "ctree.h"
15 #include "fs.h"
16 #include "messages.h"
17 #include "misc.h"
18 #include "free-space-cache.h"
19 #include "transaction.h"
20 #include "disk-io.h"
21 #include "extent_io.h"
22 #include "volumes.h"
23 #include "space-info.h"
24 #include "delalloc-space.h"
25 #include "block-group.h"
26 #include "discard.h"
27 #include "subpage.h"
28 #include "inode-item.h"
29 #include "accessors.h"
30 #include "file-item.h"
31 #include "file.h"
32 #include "super.h"
33
34 #define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
35 #define MAX_CACHE_BYTES_PER_GIG SZ_64K
36 #define FORCE_EXTENT_THRESHOLD  SZ_1M
37
38 static struct kmem_cache *btrfs_free_space_cachep;
39 static struct kmem_cache *btrfs_free_space_bitmap_cachep;
40
41 struct btrfs_trim_range {
42         u64 start;
43         u64 bytes;
44         struct list_head list;
45 };
46
47 static int link_free_space(struct btrfs_free_space_ctl *ctl,
48                            struct btrfs_free_space *info);
49 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
50                               struct btrfs_free_space *info, bool update_stat);
51 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
52                          struct btrfs_free_space *bitmap_info, u64 *offset,
53                          u64 *bytes, bool for_alloc);
54 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
55                         struct btrfs_free_space *bitmap_info);
56 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
57                               struct btrfs_free_space *info, u64 offset,
58                               u64 bytes, bool update_stats);
59
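/*
 * Empty the in-memory free space cache: walk the rbtree from the last entry,
 * freeing extent entries directly and bitmap entries via free_bitmap().
 * The caller must hold ctl->tree_lock; cond_resched_lock() may drop and
 * re-take it between entries.
 */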
60 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
61 {
62         struct btrfs_free_space *info;
63         struct rb_node *node;
64
65         while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
66                 info = rb_entry(node, struct btrfs_free_space, offset_index);
67                 if (!info->bitmap) {
68                         unlink_free_space(ctl, info, true);
69                         kmem_cache_free(btrfs_free_space_cachep, info);
70                 } else {
71                         free_bitmap(ctl, info);
72                 }
73
74                 cond_resched_lock(&ctl->tree_lock);
75         }
76 }
77
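/*
 * Look up the free space cache inode for the block group at @offset: find the
 * free space header item in @root, read the inode key stored in it, and iget
 * that inode with NOFS forced so we cannot recurse into the filesystem while
 * under a transaction handle.
 */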
78 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
79                                                struct btrfs_path *path,
80                                                u64 offset)
81 {
82         struct btrfs_fs_info *fs_info = root->fs_info;
83         struct btrfs_key key;
84         struct btrfs_key location;
85         struct btrfs_disk_key disk_key;
86         struct btrfs_free_space_header *header;
87         struct extent_buffer *leaf;
88         struct inode *inode = NULL;
89         unsigned nofs_flag;
90         int ret;
91
92         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
93         key.offset = offset;
94         key.type = 0;
95
96         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
97         if (ret < 0)
98                 return ERR_PTR(ret);
99         if (ret > 0) {
100                 btrfs_release_path(path);
101                 return ERR_PTR(-ENOENT);
102         }
103
104         leaf = path->nodes[0];
105         header = btrfs_item_ptr(leaf, path->slots[0],
106                                 struct btrfs_free_space_header);
107         btrfs_free_space_key(leaf, header, &disk_key);
108         btrfs_disk_key_to_cpu(&location, &disk_key);
109         btrfs_release_path(path);
110
111         /*
112          * We are often under a trans handle at this point, so we need to make
113          * sure NOFS is set to keep us from deadlocking.
114          */
115         nofs_flag = memalloc_nofs_save();
116         inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
117         btrfs_release_path(path);
118         memalloc_nofs_restore(nofs_flag);
119         if (IS_ERR(inode))
120                 return inode;
121
122         mapping_set_gfp_mask(inode->i_mapping,
123                         mapping_gfp_constraint(inode->i_mapping,
124                         ~(__GFP_FS | __GFP_HIGHMEM)));
125
126         return inode;
127 }
128
129 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
130                 struct btrfs_path *path)
131 {
132         struct btrfs_fs_info *fs_info = block_group->fs_info;
133         struct inode *inode = NULL;
134         u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
135
136         spin_lock(&block_group->lock);
137         if (block_group->inode)
138                 inode = igrab(block_group->inode);
139         spin_unlock(&block_group->lock);
140         if (inode)
141                 return inode;
142
143         inode = __lookup_free_space_inode(fs_info->tree_root, path,
144                                           block_group->start);
145         if (IS_ERR(inode))
146                 return inode;
147
148         spin_lock(&block_group->lock);
149         if (!((BTRFS_I(inode)->flags & flags) == flags)) {
150                 btrfs_info(fs_info, "Old style space inode found, converting.");
151                 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
152                         BTRFS_INODE_NODATACOW;
153                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
154         }
155
156         if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
157                 block_group->inode = igrab(inode);
158         spin_unlock(&block_group->lock);
159
160         return inode;
161 }
162
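/*
 * Create the on-disk backing of a free space cache: an inode item for @ino
 * (regular file, mode 0600, NODATASUM/NODATACOW/NOCOMPRESS/PREALLOC) plus a
 * free space header item keyed by the block group start @offset that records
 * the inode item's key.
 */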
163 static int __create_free_space_inode(struct btrfs_root *root,
164                                      struct btrfs_trans_handle *trans,
165                                      struct btrfs_path *path,
166                                      u64 ino, u64 offset)
167 {
168         struct btrfs_key key;
169         struct btrfs_disk_key disk_key;
170         struct btrfs_free_space_header *header;
171         struct btrfs_inode_item *inode_item;
172         struct extent_buffer *leaf;
173         /* We inline CRCs for the free disk space cache */
174         const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
175                           BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
176         int ret;
177
178         ret = btrfs_insert_empty_inode(trans, root, path, ino);
179         if (ret)
180                 return ret;
181
182         leaf = path->nodes[0];
183         inode_item = btrfs_item_ptr(leaf, path->slots[0],
184                                     struct btrfs_inode_item);
185         btrfs_item_key(leaf, &disk_key, path->slots[0]);
186         memzero_extent_buffer(leaf, (unsigned long)inode_item,
187                              sizeof(*inode_item));
188         btrfs_set_inode_generation(leaf, inode_item, trans->transid);
189         btrfs_set_inode_size(leaf, inode_item, 0);
190         btrfs_set_inode_nbytes(leaf, inode_item, 0);
191         btrfs_set_inode_uid(leaf, inode_item, 0);
192         btrfs_set_inode_gid(leaf, inode_item, 0);
193         btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
194         btrfs_set_inode_flags(leaf, inode_item, flags);
195         btrfs_set_inode_nlink(leaf, inode_item, 1);
196         btrfs_set_inode_transid(leaf, inode_item, trans->transid);
197         btrfs_set_inode_block_group(leaf, inode_item, offset);
198         btrfs_mark_buffer_dirty(leaf);
199         btrfs_release_path(path);
200
201         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
202         key.offset = offset;
203         key.type = 0;
204         ret = btrfs_insert_empty_item(trans, root, path, &key,
205                                       sizeof(struct btrfs_free_space_header));
206         if (ret < 0) {
207                 btrfs_release_path(path);
208                 return ret;
209         }
210
211         leaf = path->nodes[0];
212         header = btrfs_item_ptr(leaf, path->slots[0],
213                                 struct btrfs_free_space_header);
214         memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
215         btrfs_set_free_space_key(leaf, header, &disk_key);
216         btrfs_mark_buffer_dirty(leaf);
217         btrfs_release_path(path);
218
219         return 0;
220 }
221
222 int create_free_space_inode(struct btrfs_trans_handle *trans,
223                             struct btrfs_block_group *block_group,
224                             struct btrfs_path *path)
225 {
226         int ret;
227         u64 ino;
228
229         ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
230         if (ret < 0)
231                 return ret;
232
233         return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
234                                          ino, block_group->start);
235 }
236
237 /*
238  * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
239  * handles lookup, otherwise it takes ownership and iputs the inode.
240  * Don't reuse an inode pointer after passing it into this function.
241  */
242 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
243                                   struct inode *inode,
244                                   struct btrfs_block_group *block_group)
245 {
246         struct btrfs_path *path;
247         struct btrfs_key key;
248         int ret = 0;
249
250         path = btrfs_alloc_path();
251         if (!path)
252                 return -ENOMEM;
253
254         if (!inode)
255                 inode = lookup_free_space_inode(block_group, path);
256         if (IS_ERR(inode)) {
257                 if (PTR_ERR(inode) != -ENOENT)
258                         ret = PTR_ERR(inode);
259                 goto out;
260         }
261         ret = btrfs_orphan_add(trans, BTRFS_I(inode));
262         if (ret) {
263                 btrfs_add_delayed_iput(BTRFS_I(inode));
264                 goto out;
265         }
266         clear_nlink(inode);
267         /* One for the block group's ref */
268         spin_lock(&block_group->lock);
269         if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
270                 block_group->inode = NULL;
271                 spin_unlock(&block_group->lock);
272                 iput(inode);
273         } else {
274                 spin_unlock(&block_group->lock);
275         }
276         /* One for the lookup ref */
277         btrfs_add_delayed_iput(BTRFS_I(inode));
278
279         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
280         key.type = 0;
281         key.offset = block_group->start;
282         ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
283                                 -1, 1);
284         if (ret) {
285                 if (ret > 0)
286                         ret = 0;
287                 goto out;
288         }
289         ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
290 out:
291         btrfs_free_path(path);
292         return ret;
293 }
294
295 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
296                                     struct btrfs_block_group *block_group,
297                                     struct inode *vfs_inode)
298 {
299         struct btrfs_truncate_control control = {
300                 .inode = BTRFS_I(vfs_inode),
301                 .new_size = 0,
302                 .ino = btrfs_ino(BTRFS_I(vfs_inode)),
303                 .min_type = BTRFS_EXTENT_DATA_KEY,
304                 .clear_extent_range = true,
305         };
306         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
307         struct btrfs_root *root = inode->root;
308         struct extent_state *cached_state = NULL;
309         int ret = 0;
310         bool locked = false;
311
312         if (block_group) {
313                 struct btrfs_path *path = btrfs_alloc_path();
314
315                 if (!path) {
316                         ret = -ENOMEM;
317                         goto fail;
318                 }
319                 locked = true;
320                 mutex_lock(&trans->transaction->cache_write_mutex);
321                 if (!list_empty(&block_group->io_list)) {
322                         list_del_init(&block_group->io_list);
323
324                         btrfs_wait_cache_io(trans, block_group, path);
325                         btrfs_put_block_group(block_group);
326                 }
327
328                 /*
329                  * Now that we've truncated the cache away, it's no longer
330                  * set up or written.
331                  */
332                 spin_lock(&block_group->lock);
333                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
334                 spin_unlock(&block_group->lock);
335                 btrfs_free_path(path);
336         }
337
338         btrfs_i_size_write(inode, 0);
339         truncate_pagecache(vfs_inode, 0);
340
341         lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
342         btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
343
344         /*
345          * We skip the throttling logic for free space cache inodes, so we don't
346          * need to check for -EAGAIN.
347          */
348         ret = btrfs_truncate_inode_items(trans, root, &control);
349
350         inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
351         btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
352
353         unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
354         if (ret)
355                 goto fail;
356
357         ret = btrfs_update_inode(trans, root, inode);
358
359 fail:
360         if (locked)
361                 mutex_unlock(&trans->transaction->cache_write_mutex);
362         if (ret)
363                 btrfs_abort_transaction(trans, ret);
364
365         return ret;
366 }
367
368 static void readahead_cache(struct inode *inode)
369 {
370         struct file_ra_state ra;
371         unsigned long last_index;
372
373         file_ra_state_init(&ra, inode->i_mapping);
374         last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
375
376         page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
377 }
378
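/*
 * Set up @io_ctl for reading or writing the cache file backed by @inode: one
 * page pointer per PAGE_SIZE of the file.  For writes, the front of the first
 * page must have room for a crc32c per page plus the 64-bit generation.
 */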
379 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
380                        int write)
381 {
382         int num_pages;
383
384         num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
385
386         /* Make sure we can fit our crcs and generation into the first page */
387         if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
388                 return -ENOSPC;
389
390         memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
391
392         io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
393         if (!io_ctl->pages)
394                 return -ENOMEM;
395
396         io_ctl->num_pages = num_pages;
397         io_ctl->fs_info = btrfs_sb(inode->i_sb);
398         io_ctl->inode = inode;
399
400         return 0;
401 }
402 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
403
404 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
405 {
406         kfree(io_ctl->pages);
407         io_ctl->pages = NULL;
408 }
409
410 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
411 {
412         if (io_ctl->cur) {
413                 io_ctl->cur = NULL;
414                 io_ctl->orig = NULL;
415         }
416 }
417
418 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
419 {
420         ASSERT(io_ctl->index < io_ctl->num_pages);
421         io_ctl->page = io_ctl->pages[io_ctl->index++];
422         io_ctl->cur = page_address(io_ctl->page);
423         io_ctl->orig = io_ctl->cur;
424         io_ctl->size = PAGE_SIZE;
425         if (clear)
426                 clear_page(io_ctl->cur);
427 }
428
429 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
430 {
431         int i;
432
433         io_ctl_unmap_page(io_ctl);
434
435         for (i = 0; i < io_ctl->num_pages; i++) {
436                 if (io_ctl->pages[i]) {
437                         btrfs_page_clear_checked(io_ctl->fs_info,
438                                         io_ctl->pages[i],
439                                         page_offset(io_ctl->pages[i]),
440                                         PAGE_SIZE);
441                         unlock_page(io_ctl->pages[i]);
442                         put_page(io_ctl->pages[i]);
443                 }
444         }
445 }
446
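/*
 * Find or create and lock every page of the cache file.  If @uptodate is set
 * (the read path), also read each page in and verify it is still part of the
 * cache inode's mapping and became uptodate; any failure drops all pages.
 */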
447 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
448 {
449         struct page *page;
450         struct inode *inode = io_ctl->inode;
451         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
452         int i;
453
454         for (i = 0; i < io_ctl->num_pages; i++) {
455                 int ret;
456
457                 page = find_or_create_page(inode->i_mapping, i, mask);
458                 if (!page) {
459                         io_ctl_drop_pages(io_ctl);
460                         return -ENOMEM;
461                 }
462
463                 ret = set_page_extent_mapped(page);
464                 if (ret < 0) {
465                         unlock_page(page);
466                         put_page(page);
467                         io_ctl_drop_pages(io_ctl);
468                         return ret;
469                 }
470
471                 io_ctl->pages[i] = page;
472                 if (uptodate && !PageUptodate(page)) {
473                         btrfs_read_folio(NULL, page_folio(page));
474                         lock_page(page);
475                         if (page->mapping != inode->i_mapping) {
476                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
477                                           "free space cache page truncated");
478                                 io_ctl_drop_pages(io_ctl);
479                                 return -EIO;
480                         }
481                         if (!PageUptodate(page)) {
482                                 btrfs_err(BTRFS_I(inode)->root->fs_info,
483                                            "error reading free space cache");
484                                 io_ctl_drop_pages(io_ctl);
485                                 return -EIO;
486                         }
487                 }
488         }
489
490         for (i = 0; i < io_ctl->num_pages; i++)
491                 clear_page_dirty_for_io(io_ctl->pages[i]);
492
493         return 0;
494 }
495
496 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
497 {
498         io_ctl_map_page(io_ctl, 1);
499
500         /*
501          * Skip the csum areas.  If we don't check crcs then we just have a
502          * 64bit chunk at the front of the first page.
503          */
504         io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
505         io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
506
507         put_unaligned_le64(generation, io_ctl->cur);
508         io_ctl->cur += sizeof(u64);
509 }
510
511 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
512 {
513         u64 cache_gen;
514
515         /*
516          * Skip the crc area.  If we don't check crcs then we just have a 64bit
517          * chunk at the front of the first page.
518          */
519         io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
520         io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
521
522         cache_gen = get_unaligned_le64(io_ctl->cur);
523         if (cache_gen != generation) {
524                 btrfs_err_rl(io_ctl->fs_info,
525                         "space cache generation (%llu) does not match inode (%llu)",
526                                 cache_gen, generation);
527                 io_ctl_unmap_page(io_ctl);
528                 return -EIO;
529         }
530         io_ctl->cur += sizeof(u64);
531         return 0;
532 }
533
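/*
 * Checksum the page at @index with crc32c and store the result in slot @index
 * of the crc array at the front of page 0.  When checksumming page 0 itself,
 * the crc array is skipped so the stored crcs don't feed back into the sum.
 */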
534 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
535 {
536         u32 *tmp;
537         u32 crc = ~(u32)0;
538         unsigned offset = 0;
539
540         if (index == 0)
541                 offset = sizeof(u32) * io_ctl->num_pages;
542
543         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
544         btrfs_crc32c_final(crc, (u8 *)&crc);
545         io_ctl_unmap_page(io_ctl);
546         tmp = page_address(io_ctl->pages[0]);
547         tmp += index;
548         *tmp = crc;
549 }
550
551 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
552 {
553         u32 *tmp, val;
554         u32 crc = ~(u32)0;
555         unsigned offset = 0;
556
557         if (index == 0)
558                 offset = sizeof(u32) * io_ctl->num_pages;
559
560         tmp = page_address(io_ctl->pages[0]);
561         tmp += index;
562         val = *tmp;
563
564         io_ctl_map_page(io_ctl, 0);
565         crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
566         btrfs_crc32c_final(crc, (u8 *)&crc);
567         if (val != crc) {
568                 btrfs_err_rl(io_ctl->fs_info,
569                         "csum mismatch on free space cache");
570                 io_ctl_unmap_page(io_ctl);
571                 return -EIO;
572         }
573
574         return 0;
575 }
576
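/*
 * Append one on-disk free space entry (offset, length, extent or bitmap type)
 * at the current write position.  Once the page can't fit another entry, crc
 * the finished page and map the next one if any pages are left.
 */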
577 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
578                             void *bitmap)
579 {
580         struct btrfs_free_space_entry *entry;
581
582         if (!io_ctl->cur)
583                 return -ENOSPC;
584
585         entry = io_ctl->cur;
586         put_unaligned_le64(offset, &entry->offset);
587         put_unaligned_le64(bytes, &entry->bytes);
588         entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
589                 BTRFS_FREE_SPACE_EXTENT;
590         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
591         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
592
593         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
594                 return 0;
595
596         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
597
598         /* No more pages to map */
599         if (io_ctl->index >= io_ctl->num_pages)
600                 return 0;
601
602         /* map the next page */
603         io_ctl_map_page(io_ctl, 1);
604         return 0;
605 }
606
607 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
608 {
609         if (!io_ctl->cur)
610                 return -ENOSPC;
611
612         /*
613          * If we aren't at the start of the current page, unmap this one and
614          * map the next one if there is any left.
615          */
616         if (io_ctl->cur != io_ctl->orig) {
617                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
618                 if (io_ctl->index >= io_ctl->num_pages)
619                         return -ENOSPC;
620                 io_ctl_map_page(io_ctl, 0);
621         }
622
623         copy_page(io_ctl->cur, bitmap);
624         io_ctl_set_crc(io_ctl, io_ctl->index - 1);
625         if (io_ctl->index < io_ctl->num_pages)
626                 io_ctl_map_page(io_ctl, 0);
627         return 0;
628 }
629
630 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
631 {
632         /*
633          * If we're not on the boundary we know we've modified the page and we
634          * need to crc the page.
635          */
636         if (io_ctl->cur != io_ctl->orig)
637                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
638         else
639                 io_ctl_unmap_page(io_ctl);
640
641         while (io_ctl->index < io_ctl->num_pages) {
642                 io_ctl_map_page(io_ctl, 1);
643                 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
644         }
645 }
646
647 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
648                             struct btrfs_free_space *entry, u8 *type)
649 {
650         struct btrfs_free_space_entry *e;
651         int ret;
652
653         if (!io_ctl->cur) {
654                 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
655                 if (ret)
656                         return ret;
657         }
658
659         e = io_ctl->cur;
660         entry->offset = get_unaligned_le64(&e->offset);
661         entry->bytes = get_unaligned_le64(&e->bytes);
662         *type = e->type;
663         io_ctl->cur += sizeof(struct btrfs_free_space_entry);
664         io_ctl->size -= sizeof(struct btrfs_free_space_entry);
665
666         if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
667                 return 0;
668
669         io_ctl_unmap_page(io_ctl);
670
671         return 0;
672 }
673
674 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
675                               struct btrfs_free_space *entry)
676 {
677         int ret;
678
679         ret = io_ctl_check_crc(io_ctl, io_ctl->index);
680         if (ret)
681                 return ret;
682
683         copy_page(entry->bitmap, io_ctl->cur);
684         io_ctl_unmap_page(io_ctl);
685
686         return 0;
687 }
688
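/*
 * Recompute ctl->extents_thresh, the number of extent entries allowed before
 * free space is tracked in bitmaps instead.  The budget is roughly
 * MAX_CACHE_BYTES_PER_GIG of memory per 1GiB of block group, with at most
 * half of it going to extent entries after the bitmaps are accounted for.
 */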
689 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
690 {
691         struct btrfs_block_group *block_group = ctl->block_group;
692         u64 max_bytes;
693         u64 bitmap_bytes;
694         u64 extent_bytes;
695         u64 size = block_group->length;
696         u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
697         u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
698
699         max_bitmaps = max_t(u64, max_bitmaps, 1);
700
701         if (ctl->total_bitmaps > max_bitmaps)
702                 btrfs_err(block_group->fs_info,
703 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
704                           block_group->start, block_group->length,
705                           ctl->total_bitmaps, ctl->unit, max_bitmaps,
706                           bytes_per_bg);
707         ASSERT(ctl->total_bitmaps <= max_bitmaps);
708
709         /*
710          * We are trying to keep the total amount of memory used per 1GiB of
711          * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
712          * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
713          * bitmaps, we may end up using more memory than this.
714          */
715         if (size < SZ_1G)
716                 max_bytes = MAX_CACHE_BYTES_PER_GIG;
717         else
718                 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
719
720         bitmap_bytes = ctl->total_bitmaps * ctl->unit;
721
722         /*
723          * We want the extent entry threshold to be at most half of max_bytes,
724          * or whatever is left over after the bitmaps, whichever is smaller.
725          */
726         extent_bytes = max_bytes - bitmap_bytes;
727         extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
728
729         ctl->extents_thresh =
730                 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
731 }
732
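/*
 * Read the cache file for the block group at @offset into @ctl: validate the
 * stored generation against the inode, then recreate every extent entry and
 * bitmap it describes.  Returns 1 if the cache was loaded, 0 if it should be
 * ignored and rebuilt, and a negative value on corruption, allocation or I/O
 * errors.
 */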
733 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
734                                    struct btrfs_free_space_ctl *ctl,
735                                    struct btrfs_path *path, u64 offset)
736 {
737         struct btrfs_fs_info *fs_info = root->fs_info;
738         struct btrfs_free_space_header *header;
739         struct extent_buffer *leaf;
740         struct btrfs_io_ctl io_ctl;
741         struct btrfs_key key;
742         struct btrfs_free_space *e, *n;
743         LIST_HEAD(bitmaps);
744         u64 num_entries;
745         u64 num_bitmaps;
746         u64 generation;
747         u8 type;
748         int ret = 0;
749
750         /* Nothing in the space cache, goodbye */
751         if (!i_size_read(inode))
752                 return 0;
753
754         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
755         key.offset = offset;
756         key.type = 0;
757
758         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
759         if (ret < 0)
760                 return 0;
761         else if (ret > 0) {
762                 btrfs_release_path(path);
763                 return 0;
764         }
765
766         ret = -1;
767
768         leaf = path->nodes[0];
769         header = btrfs_item_ptr(leaf, path->slots[0],
770                                 struct btrfs_free_space_header);
771         num_entries = btrfs_free_space_entries(leaf, header);
772         num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
773         generation = btrfs_free_space_generation(leaf, header);
774         btrfs_release_path(path);
775
776         if (!BTRFS_I(inode)->generation) {
777                 btrfs_info(fs_info,
778                            "the free space cache file (%llu) is invalid, skip it",
779                            offset);
780                 return 0;
781         }
782
783         if (BTRFS_I(inode)->generation != generation) {
784                 btrfs_err(fs_info,
785                           "free space inode generation (%llu) did not match free space cache generation (%llu)",
786                           BTRFS_I(inode)->generation, generation);
787                 return 0;
788         }
789
790         if (!num_entries)
791                 return 0;
792
793         ret = io_ctl_init(&io_ctl, inode, 0);
794         if (ret)
795                 return ret;
796
797         readahead_cache(inode);
798
799         ret = io_ctl_prepare_pages(&io_ctl, true);
800         if (ret)
801                 goto out;
802
803         ret = io_ctl_check_crc(&io_ctl, 0);
804         if (ret)
805                 goto free_cache;
806
807         ret = io_ctl_check_generation(&io_ctl, generation);
808         if (ret)
809                 goto free_cache;
810
811         while (num_entries) {
812                 e = kmem_cache_zalloc(btrfs_free_space_cachep,
813                                       GFP_NOFS);
814                 if (!e) {
815                         ret = -ENOMEM;
816                         goto free_cache;
817                 }
818
819                 ret = io_ctl_read_entry(&io_ctl, e, &type);
820                 if (ret) {
821                         kmem_cache_free(btrfs_free_space_cachep, e);
822                         goto free_cache;
823                 }
824
825                 if (!e->bytes) {
826                         ret = -1;
827                         kmem_cache_free(btrfs_free_space_cachep, e);
828                         goto free_cache;
829                 }
830
831                 if (type == BTRFS_FREE_SPACE_EXTENT) {
832                         spin_lock(&ctl->tree_lock);
833                         ret = link_free_space(ctl, e);
834                         spin_unlock(&ctl->tree_lock);
835                         if (ret) {
836                                 btrfs_err(fs_info,
837                                         "Duplicate entries in free space cache, dumping");
838                                 kmem_cache_free(btrfs_free_space_cachep, e);
839                                 goto free_cache;
840                         }
841                 } else {
842                         ASSERT(num_bitmaps);
843                         num_bitmaps--;
844                         e->bitmap = kmem_cache_zalloc(
845                                         btrfs_free_space_bitmap_cachep, GFP_NOFS);
846                         if (!e->bitmap) {
847                                 ret = -ENOMEM;
848                                 kmem_cache_free(
849                                         btrfs_free_space_cachep, e);
850                                 goto free_cache;
851                         }
852                         spin_lock(&ctl->tree_lock);
853                         ret = link_free_space(ctl, e);
854                         if (ret) {
855                                 spin_unlock(&ctl->tree_lock);
856                                 btrfs_err(fs_info,
857                                         "Duplicate entries in free space cache, dumping");
858                                 kmem_cache_free(btrfs_free_space_cachep, e);
859                                 goto free_cache;
860                         }
861                         ctl->total_bitmaps++;
862                         recalculate_thresholds(ctl);
863                         spin_unlock(&ctl->tree_lock);
864                         list_add_tail(&e->list, &bitmaps);
865                 }
866
867                 num_entries--;
868         }
869
870         io_ctl_unmap_page(&io_ctl);
871
872         /*
873          * The bitmap data is stored after all the entries in the cache file,
874          * so read it back now and attach it to the bitmap entries queued above.
875          */
876         list_for_each_entry_safe(e, n, &bitmaps, list) {
877                 list_del_init(&e->list);
878                 ret = io_ctl_read_bitmap(&io_ctl, e);
879                 if (ret)
880                         goto free_cache;
881         }
882
883         io_ctl_drop_pages(&io_ctl);
884         ret = 1;
885 out:
886         io_ctl_free(&io_ctl);
887         return ret;
888 free_cache:
889         io_ctl_drop_pages(&io_ctl);
890
891         spin_lock(&ctl->tree_lock);
892         __btrfs_remove_free_space_cache(ctl);
893         spin_unlock(&ctl->tree_lock);
894         goto out;
895 }
896
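/*
 * Move every entry from the temporary ctl that was used while loading into the
 * block group's real free space ctl via btrfs_add_free_space(), dropping
 * ctl->tree_lock around each add and re-taking it afterwards.
 */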
897 static int copy_free_space_cache(struct btrfs_block_group *block_group,
898                                  struct btrfs_free_space_ctl *ctl)
899 {
900         struct btrfs_free_space *info;
901         struct rb_node *n;
902         int ret = 0;
903
904         while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
905                 info = rb_entry(n, struct btrfs_free_space, offset_index);
906                 if (!info->bitmap) {
907                         const u64 offset = info->offset;
908                         const u64 bytes = info->bytes;
909
910                         unlink_free_space(ctl, info, true);
911                         spin_unlock(&ctl->tree_lock);
912                         kmem_cache_free(btrfs_free_space_cachep, info);
913                         ret = btrfs_add_free_space(block_group, offset, bytes);
914                         spin_lock(&ctl->tree_lock);
915                 } else {
916                         u64 offset = info->offset;
917                         u64 bytes = ctl->unit;
918
919                         ret = search_bitmap(ctl, info, &offset, &bytes, false);
920                         if (ret == 0) {
921                                 bitmap_clear_bits(ctl, info, offset, bytes, true);
922                                 spin_unlock(&ctl->tree_lock);
923                                 ret = btrfs_add_free_space(block_group, offset,
924                                                            bytes);
925                                 spin_lock(&ctl->tree_lock);
926                         } else {
927                                 free_bitmap(ctl, info);
928                                 ret = 0;
929                         }
930                 }
931                 cond_resched_lock(&ctl->tree_lock);
932         }
933         return ret;
934 }
935
936 static struct lock_class_key btrfs_free_space_inode_key;
937
938 int load_free_space_cache(struct btrfs_block_group *block_group)
939 {
940         struct btrfs_fs_info *fs_info = block_group->fs_info;
941         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
942         struct btrfs_free_space_ctl tmp_ctl = {};
943         struct inode *inode;
944         struct btrfs_path *path;
945         int ret = 0;
946         bool matched;
947         u64 used = block_group->used;
948
949         /*
950          * Because we could potentially discard our loaded free space, we want
951          * to load everything into a temporary structure first, and then if it's
952          * valid copy it all into the actual free space ctl.
953          */
954         btrfs_init_free_space_ctl(block_group, &tmp_ctl);
955
956         /*
957          * If this block group has been marked to be cleared for one reason or
958          * another then we can't trust the on disk cache, so just return.
959          */
960         spin_lock(&block_group->lock);
961         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
962                 spin_unlock(&block_group->lock);
963                 return 0;
964         }
965         spin_unlock(&block_group->lock);
966
967         path = btrfs_alloc_path();
968         if (!path)
969                 return 0;
970         path->search_commit_root = 1;
971         path->skip_locking = 1;
972
973         /*
974          * We must pass a path with search_commit_root set to btrfs_iget in
975          * order to avoid a deadlock when allocating extents for the tree root.
976          *
977          * When we are COWing an extent buffer from the tree root, when looking
978          * for a free extent, at extent-tree.c:find_free_extent(), we can find
979          * block group without its free space cache loaded. When we find one
980          * we must load its space cache which requires reading its free space
981          * cache's inode item from the root tree. If this inode item is located
982          * in the same leaf that we started COWing before, then we end up in
983          * deadlock on the extent buffer (trying to read lock it when we
984          * previously write locked it).
985          *
986          * It's safe to read the inode item using the commit root because
987          * block groups, once loaded, stay in memory forever (until they are
988          * removed) as well as their space caches once loaded. New block groups
989          * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
990          * we will never try to read their inode item while the fs is mounted.
991          */
992         inode = lookup_free_space_inode(block_group, path);
993         if (IS_ERR(inode)) {
994                 btrfs_free_path(path);
995                 return 0;
996         }
997
998         /* We may have converted the inode and made the cache invalid. */
999         spin_lock(&block_group->lock);
1000         if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
1001                 spin_unlock(&block_group->lock);
1002                 btrfs_free_path(path);
1003                 goto out;
1004         }
1005         spin_unlock(&block_group->lock);
1006
1007         /*
1008          * Reinitialize the class of struct inode's mapping->invalidate_lock for
1009          * free space inodes to prevent false positives related to locks for normal
1010          * inodes.
1011          */
1012         lockdep_set_class(&(&inode->i_data)->invalidate_lock,
1013                           &btrfs_free_space_inode_key);
1014
1015         ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
1016                                       path, block_group->start);
1017         btrfs_free_path(path);
1018         if (ret <= 0)
1019                 goto out;
1020
1021         matched = (tmp_ctl.free_space == (block_group->length - used -
1022                                           block_group->bytes_super));
1023
1024         if (matched) {
1025                 spin_lock(&tmp_ctl.tree_lock);
1026                 ret = copy_free_space_cache(block_group, &tmp_ctl);
1027                 spin_unlock(&tmp_ctl.tree_lock);
1028                 /*
1029                  * A return value of 1 means the cache was successfully loaded, but
1030                  * copy_free_space_cache() returns 0 on success, so reset ret here.
1031                  */
1032                 if (ret == 0)
1033                         ret = 1;
1034         } else {
1035                 /*
1036                  * We need to call the _locked variant so we don't try to update
1037                  * the discard counters.
1038                  */
1039                 spin_lock(&tmp_ctl.tree_lock);
1040                 __btrfs_remove_free_space_cache(&tmp_ctl);
1041                 spin_unlock(&tmp_ctl.tree_lock);
1042                 btrfs_warn(fs_info,
1043                            "block group %llu has wrong amount of free space",
1044                            block_group->start);
1045                 ret = -1;
1046         }
1047 out:
1048         if (ret < 0) {
1049                 /* This cache is bogus, make sure it gets cleared */
1050                 spin_lock(&block_group->lock);
1051                 block_group->disk_cache_state = BTRFS_DC_CLEAR;
1052                 spin_unlock(&block_group->lock);
1053                 ret = 0;
1054
1055                 btrfs_warn(fs_info,
1056                            "failed to load free space cache for block group %llu, rebuilding it now",
1057                            block_group->start);
1058         }
1059
1060         spin_lock(&ctl->tree_lock);
1061         btrfs_discard_update_discardable(block_group);
1062         spin_unlock(&ctl->tree_lock);
1063         iput(inode);
1064         return ret;
1065 }
1066
1067 static noinline_for_stack
1068 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
1069                               struct btrfs_free_space_ctl *ctl,
1070                               struct btrfs_block_group *block_group,
1071                               int *entries, int *bitmaps,
1072                               struct list_head *bitmap_list)
1073 {
1074         int ret;
1075         struct btrfs_free_cluster *cluster = NULL;
1076         struct btrfs_free_cluster *cluster_locked = NULL;
1077         struct rb_node *node = rb_first(&ctl->free_space_offset);
1078         struct btrfs_trim_range *trim_entry;
1079
1080         /* Get the cluster for this block_group if it exists */
1081         if (block_group && !list_empty(&block_group->cluster_list)) {
1082                 cluster = list_entry(block_group->cluster_list.next,
1083                                      struct btrfs_free_cluster,
1084                                      block_group_list);
1085         }
1086
1087         if (!node && cluster) {
1088                 cluster_locked = cluster;
1089                 spin_lock(&cluster_locked->lock);
1090                 node = rb_first(&cluster->root);
1091                 cluster = NULL;
1092         }
1093
1094         /* Write out the extent entries */
1095         while (node) {
1096                 struct btrfs_free_space *e;
1097
1098                 e = rb_entry(node, struct btrfs_free_space, offset_index);
1099                 *entries += 1;
1100
1101                 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
1102                                        e->bitmap);
1103                 if (ret)
1104                         goto fail;
1105
1106                 if (e->bitmap) {
1107                         list_add_tail(&e->list, bitmap_list);
1108                         *bitmaps += 1;
1109                 }
1110                 node = rb_next(node);
1111                 if (!node && cluster) {
1112                         node = rb_first(&cluster->root);
1113                         cluster_locked = cluster;
1114                         spin_lock(&cluster_locked->lock);
1115                         cluster = NULL;
1116                 }
1117         }
1118         if (cluster_locked) {
1119                 spin_unlock(&cluster_locked->lock);
1120                 cluster_locked = NULL;
1121         }
1122
1123         /*
1124          * Make sure we don't miss any range that was removed from our rbtree
1125          * because trimming is running. Otherwise after a umount+mount (or crash
1126          * after committing the transaction) we would leak free space and get
1127          * an inconsistent free space cache report from fsck.
1128          */
1129         list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1130                 ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1131                                        trim_entry->bytes, NULL);
1132                 if (ret)
1133                         goto fail;
1134                 *entries += 1;
1135         }
1136
1137         return 0;
1138 fail:
1139         if (cluster_locked)
1140                 spin_unlock(&cluster_locked->lock);
1141         return -ENOSPC;
1142 }
1143
1144 static noinline_for_stack int
1145 update_cache_item(struct btrfs_trans_handle *trans,
1146                   struct btrfs_root *root,
1147                   struct inode *inode,
1148                   struct btrfs_path *path, u64 offset,
1149                   int entries, int bitmaps)
1150 {
1151         struct btrfs_key key;
1152         struct btrfs_free_space_header *header;
1153         struct extent_buffer *leaf;
1154         int ret;
1155
1156         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1157         key.offset = offset;
1158         key.type = 0;
1159
1160         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1161         if (ret < 0) {
1162                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1163                                  EXTENT_DELALLOC, NULL);
1164                 goto fail;
1165         }
1166         leaf = path->nodes[0];
1167         if (ret > 0) {
1168                 struct btrfs_key found_key;
1169                 ASSERT(path->slots[0]);
1170                 path->slots[0]--;
1171                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1172                 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1173                     found_key.offset != offset) {
1174                         clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1175                                          inode->i_size - 1, EXTENT_DELALLOC,
1176                                          NULL);
1177                         btrfs_release_path(path);
1178                         goto fail;
1179                 }
1180         }
1181
1182         BTRFS_I(inode)->generation = trans->transid;
1183         header = btrfs_item_ptr(leaf, path->slots[0],
1184                                 struct btrfs_free_space_header);
1185         btrfs_set_free_space_entries(leaf, header, entries);
1186         btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1187         btrfs_set_free_space_generation(leaf, header, trans->transid);
1188         btrfs_mark_buffer_dirty(leaf);
1189         btrfs_release_path(path);
1190
1191         return 0;
1192
1193 fail:
1194         return -1;
1195 }
1196
1197 static noinline_for_stack int write_pinned_extent_entries(
1198                             struct btrfs_trans_handle *trans,
1199                             struct btrfs_block_group *block_group,
1200                             struct btrfs_io_ctl *io_ctl,
1201                             int *entries)
1202 {
1203         u64 start, extent_start, extent_end, len;
1204         struct extent_io_tree *unpin = NULL;
1205         int ret;
1206
1207         if (!block_group)
1208                 return 0;
1209
1210         /*
1211          * We want to add any pinned extents to our free space cache
1212          * so we don't leak the space.
1213          *
1214          * We shouldn't have switched the pinned extents yet, so this is the
1215          * right one.
1216          */
1217         unpin = &trans->transaction->pinned_extents;
1218
1219         start = block_group->start;
1220
1221         while (start < block_group->start + block_group->length) {
1222                 ret = find_first_extent_bit(unpin, start,
1223                                             &extent_start, &extent_end,
1224                                             EXTENT_DIRTY, NULL);
1225                 if (ret)
1226                         return 0;
1227
1228                 /* This pinned extent is out of our range */
1229                 if (extent_start >= block_group->start + block_group->length)
1230                         return 0;
1231
1232                 extent_start = max(extent_start, start);
1233                 extent_end = min(block_group->start + block_group->length,
1234                                  extent_end + 1);
1235                 len = extent_end - extent_start;
1236
1237                 *entries += 1;
1238                 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1239                 if (ret)
1240                         return -ENOSPC;
1241
1242                 start = extent_end;
1243         }
1244
1245         return 0;
1246 }
1247
1248 static noinline_for_stack int
1249 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1250 {
1251         struct btrfs_free_space *entry, *next;
1252         int ret;
1253
1254         /* Write out the bitmaps */
1255         list_for_each_entry_safe(entry, next, bitmap_list, list) {
1256                 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1257                 if (ret)
1258                         return -ENOSPC;
1259                 list_del_init(&entry->list);
1260         }
1261
1262         return 0;
1263 }
1264
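/*
 * Wait for the ordered extents of the cache inode to finish so the pages we
 * dirtied are fully on disk.  On error, the EXTENT_DELALLOC bits over the
 * whole file range are cleared.
 */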
1265 static int flush_dirty_cache(struct inode *inode)
1266 {
1267         int ret;
1268
1269         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1270         if (ret)
1271                 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1272                                  EXTENT_DELALLOC, NULL);
1273
1274         return ret;
1275 }
1276
1277 static void noinline_for_stack
1278 cleanup_bitmap_list(struct list_head *bitmap_list)
1279 {
1280         struct btrfs_free_space *entry, *next;
1281
1282         list_for_each_entry_safe(entry, next, bitmap_list, list)
1283                 list_del_init(&entry->list);
1284 }
1285
1286 static void noinline_for_stack
1287 cleanup_write_cache_enospc(struct inode *inode,
1288                            struct btrfs_io_ctl *io_ctl,
1289                            struct extent_state **cached_state)
1290 {
1291         io_ctl_drop_pages(io_ctl);
1292         unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1293                       cached_state);
1294 }
1295
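/*
 * Second half of the cache writeout started by __btrfs_write_out_cache(): wait
 * for the page IO to finish, update the free space header item so the cache is
 * considered valid, and record the result in the block group's
 * disk_cache_state under dirty_bgs_lock and the block group lock.
 */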
1296 static int __btrfs_wait_cache_io(struct btrfs_root *root,
1297                                  struct btrfs_trans_handle *trans,
1298                                  struct btrfs_block_group *block_group,
1299                                  struct btrfs_io_ctl *io_ctl,
1300                                  struct btrfs_path *path, u64 offset)
1301 {
1302         int ret;
1303         struct inode *inode = io_ctl->inode;
1304
1305         if (!inode)
1306                 return 0;
1307
1308         /* Flush the dirty pages in the cache file. */
1309         ret = flush_dirty_cache(inode);
1310         if (ret)
1311                 goto out;
1312
1313         /* Update the cache item to tell everyone this cache file is valid. */
1314         ret = update_cache_item(trans, root, inode, path, offset,
1315                                 io_ctl->entries, io_ctl->bitmaps);
1316 out:
1317         if (ret) {
1318                 invalidate_inode_pages2(inode->i_mapping);
1319                 BTRFS_I(inode)->generation = 0;
1320                 if (block_group)
1321                         btrfs_debug(root->fs_info,
1322           "failed to write free space cache for block group %llu error %d",
1323                                   block_group->start, ret);
1324         }
1325         btrfs_update_inode(trans, root, BTRFS_I(inode));
1326
1327         if (block_group) {
1328                 /* the dirty list is protected by the dirty_bgs_lock */
1329                 spin_lock(&trans->transaction->dirty_bgs_lock);
1330
1331                 /* the disk_cache_state is protected by the block group lock */
1332                 spin_lock(&block_group->lock);
1333
1334                 /*
1335                  * Only mark this as written if we didn't get put back on
1336                  * the dirty list while waiting for IO.  Otherwise our
1337                  * cache state won't be right, and we won't get written again.
1338                  */
1339                 if (!ret && list_empty(&block_group->dirty_list))
1340                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1341                 else if (ret)
1342                         block_group->disk_cache_state = BTRFS_DC_ERROR;
1343
1344                 spin_unlock(&block_group->lock);
1345                 spin_unlock(&trans->transaction->dirty_bgs_lock);
1346                 io_ctl->inode = NULL;
1347                 iput(inode);
1348         }
1349
1350         return ret;
1351
1352 }
1353
1354 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1355                         struct btrfs_block_group *block_group,
1356                         struct btrfs_path *path)
1357 {
1358         return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1359                                      block_group, &block_group->io_ctl,
1360                                      path, block_group->start);
1361 }
1362
1363 /*
1364  * Write out cached info to an inode.
1365  *
1366  * @root:        root the inode belongs to
1367  * @inode:       freespace inode we are writing out
1368  * @ctl:         free space cache we are going to write out
1369  * @block_group: block_group for this cache if it belongs to a block_group
1370  * @io_ctl:      holds context for the io
1371  * @trans:       the trans handle
1372  *
1373  * This function writes out a free space cache struct to disk for quick recovery
1374  * on mount.  This will return 0 if it was successful in writing the cache out,
1375  * or an errno if it was not.
1376  */
1377 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1378                                    struct btrfs_free_space_ctl *ctl,
1379                                    struct btrfs_block_group *block_group,
1380                                    struct btrfs_io_ctl *io_ctl,
1381                                    struct btrfs_trans_handle *trans)
1382 {
1383         struct extent_state *cached_state = NULL;
1384         LIST_HEAD(bitmap_list);
1385         int entries = 0;
1386         int bitmaps = 0;
1387         int ret;
1388         int must_iput = 0;
1389
1390         if (!i_size_read(inode))
1391                 return -EIO;
1392
1393         WARN_ON(io_ctl->pages);
1394         ret = io_ctl_init(io_ctl, inode, 1);
1395         if (ret)
1396                 return ret;
1397
1398         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1399                 down_write(&block_group->data_rwsem);
1400                 spin_lock(&block_group->lock);
1401                 if (block_group->delalloc_bytes) {
1402                         block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1403                         spin_unlock(&block_group->lock);
1404                         up_write(&block_group->data_rwsem);
1405                         BTRFS_I(inode)->generation = 0;
1406                         ret = 0;
1407                         must_iput = 1;
1408                         goto out;
1409                 }
1410                 spin_unlock(&block_group->lock);
1411         }
1412
1413         /* Lock all pages first so we can lock the extent safely. */
1414         ret = io_ctl_prepare_pages(io_ctl, false);
1415         if (ret)
1416                 goto out_unlock;
1417
1418         lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1419                     &cached_state);
1420
1421         io_ctl_set_generation(io_ctl, trans->transid);
1422
1423         mutex_lock(&ctl->cache_writeout_mutex);
1424         /* Write out the extent entries in the free space cache */
1425         spin_lock(&ctl->tree_lock);
1426         ret = write_cache_extent_entries(io_ctl, ctl,
1427                                          block_group, &entries, &bitmaps,
1428                                          &bitmap_list);
1429         if (ret)
1430                 goto out_nospc_locked;
1431
1432         /*
1433          * Some space that is freed in the current transaction is pinned; it
1434          * will be added to the free space cache after the transaction is
1435          * committed, so we must not lose it here.
1436          *
1437          * If this changes while we are working, we'll get added back to the
1438          * dirty list and redo it.  No locking is needed.
1439          */
1440         ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1441         if (ret)
1442                 goto out_nospc_locked;
1443
1444         /*
1445          * At last, we write out all the bitmaps and keep cache_writeout_mutex
1446          * locked while doing it because a concurrent trim can be manipulating
1447          * or freeing the bitmap.
1448          */
1449         ret = write_bitmap_entries(io_ctl, &bitmap_list);
1450         spin_unlock(&ctl->tree_lock);
1451         mutex_unlock(&ctl->cache_writeout_mutex);
1452         if (ret)
1453                 goto out_nospc;
1454
1455         /* Zero out the rest of the pages just to make sure */
1456         io_ctl_zero_remaining_pages(io_ctl);
1457
1458         /* Everything is written out, now we dirty the pages in the file. */
1459         ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1460                                 io_ctl->num_pages, 0, i_size_read(inode),
1461                                 &cached_state, false);
1462         if (ret)
1463                 goto out_nospc;
1464
1465         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1466                 up_write(&block_group->data_rwsem);
1467         /*
1468          * Release the pages and unlock the extent; we will flush
1469          * them out later.
1470          */
1471         io_ctl_drop_pages(io_ctl);
1472         io_ctl_free(io_ctl);
1473
1474         unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1475                       &cached_state);
1476
1477         /*
1478          * At this point the pages are under IO and we're happy.  The
1479          * caller is responsible for waiting on them and updating the
1480          * cache and the inode.
1481          */
1482         io_ctl->entries = entries;
1483         io_ctl->bitmaps = bitmaps;
1484
1485         ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1486         if (ret)
1487                 goto out;
1488
1489         return 0;
1490
1491 out_nospc_locked:
1492         cleanup_bitmap_list(&bitmap_list);
1493         spin_unlock(&ctl->tree_lock);
1494         mutex_unlock(&ctl->cache_writeout_mutex);
1495
1496 out_nospc:
1497         cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1498
1499 out_unlock:
1500         if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1501                 up_write(&block_group->data_rwsem);
1502
1503 out:
1504         io_ctl->inode = NULL;
1505         io_ctl_free(io_ctl);
1506         if (ret) {
1507                 invalidate_inode_pages2(inode->i_mapping);
1508                 BTRFS_I(inode)->generation = 0;
1509         }
1510         btrfs_update_inode(trans, root, BTRFS_I(inode));
1511         if (must_iput)
1512                 iput(inode);
1513         return ret;
1514 }
1515
1516 int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1517                           struct btrfs_block_group *block_group,
1518                           struct btrfs_path *path)
1519 {
1520         struct btrfs_fs_info *fs_info = trans->fs_info;
1521         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1522         struct inode *inode;
1523         int ret = 0;
1524
1525         spin_lock(&block_group->lock);
1526         if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1527                 spin_unlock(&block_group->lock);
1528                 return 0;
1529         }
1530         spin_unlock(&block_group->lock);
1531
1532         inode = lookup_free_space_inode(block_group, path);
1533         if (IS_ERR(inode))
1534                 return 0;
1535
1536         ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1537                                 block_group, &block_group->io_ctl, trans);
1538         if (ret) {
1539                 btrfs_debug(fs_info,
1540           "failed to write free space cache for block group %llu error %d",
1541                           block_group->start, ret);
1542                 spin_lock(&block_group->lock);
1543                 block_group->disk_cache_state = BTRFS_DC_ERROR;
1544                 spin_unlock(&block_group->lock);
1545
1546                 block_group->io_ctl.inode = NULL;
1547                 iput(inode);
1548         }
1549
1550         /*
1551          * If ret == 0 the caller is expected to call btrfs_wait_cache_io()
1552          * to wait for IO and put the inode.
1553          */
1554
1555         return ret;
1556 }
1557
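     /*
      * Helpers for converting between byte offsets/lengths and bit positions
      * within a bitmap entry.  ctl->unit is the number of bytes each bit
      * represents (the filesystem sectorsize for block group free space ctls).
      */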
1558 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1559                                           u64 offset)
1560 {
1561         ASSERT(offset >= bitmap_start);
1562         offset -= bitmap_start;
1563         return (unsigned long)(div_u64(offset, unit));
1564 }
1565
1566 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1567 {
1568         return (unsigned long)(div_u64(bytes, unit));
1569 }
1570
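     /*
      * Round an offset down to the start of the bitmap window that covers it,
      * relative to ctl->start.  Each bitmap covers BITS_PER_BITMAP * ctl->unit
      * bytes, e.g. 32768 * 4K = 128M with 4K pages and a 4K sectorsize.
      */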
1571 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1572                                    u64 offset)
1573 {
1574         u64 bitmap_start;
1575         u64 bytes_per_bitmap;
1576
1577         bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1578         bitmap_start = offset - ctl->start;
1579         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1580         bitmap_start *= bytes_per_bitmap;
1581         bitmap_start += ctl->start;
1582
1583         return bitmap_start;
1584 }
1585
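     /*
      * Insert @new_entry into the offset-indexed rb-tree: the cluster's
      * private tree when @cluster is given, otherwise ctl->free_space_offset.
      * At most one extent entry and one bitmap entry may share an offset; the
      * extent entry is kept to the left, and a true duplicate is -EEXIST.
      */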
1586 static int tree_insert_offset(struct btrfs_free_space_ctl *ctl,
1587                               struct btrfs_free_cluster *cluster,
1588                               struct btrfs_free_space *new_entry)
1589 {
1590         struct rb_root *root;
1591         struct rb_node **p;
1592         struct rb_node *parent = NULL;
1593
1594         lockdep_assert_held(&ctl->tree_lock);
1595
1596         if (cluster) {
1597                 lockdep_assert_held(&cluster->lock);
1598                 root = &cluster->root;
1599         } else {
1600                 root = &ctl->free_space_offset;
1601         }
1602
1603         p = &root->rb_node;
1604
1605         while (*p) {
1606                 struct btrfs_free_space *info;
1607
1608                 parent = *p;
1609                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
1610
1611                 if (new_entry->offset < info->offset) {
1612                         p = &(*p)->rb_left;
1613                 } else if (new_entry->offset > info->offset) {
1614                         p = &(*p)->rb_right;
1615                 } else {
1616                         /*
1617                          * we could have a bitmap entry and an extent entry
1618                          * share the same offset.  If this is the case, we want
1619                          * the extent entry to always be found first if we do a
1620                          * linear search through the tree, since we want to have
1621                          * the quickest allocation time, and allocating from an
1622                          * extent is faster than allocating from a bitmap.  So
1623                          * if we're inserting a bitmap and we find an entry at
1624                          * this offset, we want to go right, or after this entry
1625                          * logically.  If we are inserting an extent and we've
1626                          * found a bitmap, we want to go left, or before
1627                          * logically.
1628                          */
1629                         if (new_entry->bitmap) {
1630                                 if (info->bitmap) {
1631                                         WARN_ON_ONCE(1);
1632                                         return -EEXIST;
1633                                 }
1634                                 p = &(*p)->rb_right;
1635                         } else {
1636                                 if (!info->bitmap) {
1637                                         WARN_ON_ONCE(1);
1638                                         return -EEXIST;
1639                                 }
1640                                 p = &(*p)->rb_left;
1641                         }
1642                 }
1643         }
1644
1645         rb_link_node(&new_entry->offset_index, parent, p);
1646         rb_insert_color(&new_entry->offset_index, root);
1647
1648         return 0;
1649 }
1650
1651 /*
1652  * This is a little subtle.  We *only* have ->max_extent_size set if we actually
1653  * searched through the bitmap and figured out the largest ->max_extent_size,
1654  * otherwise it's 0.  In the case that it's 0 we don't want to tell the
1655  * allocator the wrong thing, we want to use the actual real max_extent_size
1656  * we've found already if it's larger, or we want to use ->bytes.
1657  *
1658  * This matters because find_free_space() will skip entries whose ->bytes is
1659  * less than the required bytes.  So if we didn't search down this bitmap, we
1660  * may pick some previous entry that has a smaller ->max_extent_size than we
1661  * have.  For example, assume we have two entries, one that has
1662  * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
1663  * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
1664  *  call into find_free_space(), and return with max_extent_size == 4K, because
1665  *  that first bitmap entry had ->max_extent_size set, but the second one did
1666  *  not.  If instead we returned 8K we'd come in searching for 8K, and find the
1667  *  8K contiguous range.
1668  *
1669  *  Consider the other case, we have 2 8K chunks in that second entry and still
1670  *  don't have ->max_extent_size set.  We'll return 16K, and the next time the
1671  *  allocator comes in it'll fully search our second bitmap, and this time it'll
1672  *  get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
1673  *  right allocation the next loop through.
1674  */
1675 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
1676 {
1677         if (entry->bitmap && entry->max_extent_size)
1678                 return entry->max_extent_size;
1679         return entry->bytes;
1680 }
1681
1682 /*
1683  * We want the largest entry to be leftmost, so this is inverted from what you'd
1684  * normally expect.
1685  */
1686 static bool entry_less(struct rb_node *node, const struct rb_node *parent)
1687 {
1688         const struct btrfs_free_space *entry, *exist;
1689
1690         entry = rb_entry(node, struct btrfs_free_space, bytes_index);
1691         exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
1692         return get_max_extent_size(exist) < get_max_extent_size(entry);
1693 }
1694
1695 /*
1696  * Search the tree for the given offset.
1697  *
1698  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1699  * want a section that is at least bytes in size and comes at or after the given
1700  * offset.
1701  */
1702 static struct btrfs_free_space *
1703 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1704                    u64 offset, int bitmap_only, int fuzzy)
1705 {
1706         struct rb_node *n = ctl->free_space_offset.rb_node;
1707         struct btrfs_free_space *entry = NULL, *prev = NULL;
1708
1709         lockdep_assert_held(&ctl->tree_lock);
1710
1711         /* find entry that is closest to the 'offset' */
1712         while (n) {
1713                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1714                 prev = entry;
1715
1716                 if (offset < entry->offset)
1717                         n = n->rb_left;
1718                 else if (offset > entry->offset)
1719                         n = n->rb_right;
1720                 else
1721                         break;
1722
1723                 entry = NULL;
1724         }
1725
1726         if (bitmap_only) {
1727                 if (!entry)
1728                         return NULL;
1729                 if (entry->bitmap)
1730                         return entry;
1731
1732                 /*
1733                  * A bitmap entry and an extent entry may share the same offset;
1734                  * in that case, the bitmap entry comes after the extent entry.
1735                  */
1736                 n = rb_next(n);
1737                 if (!n)
1738                         return NULL;
1739                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1740                 if (entry->offset != offset)
1741                         return NULL;
1742
1743                 WARN_ON(!entry->bitmap);
1744                 return entry;
1745         } else if (entry) {
1746                 if (entry->bitmap) {
1747                         /*
1748                          * if previous extent entry covers the offset,
1749                          * we should return it instead of the bitmap entry
1750                          */
1751                         n = rb_prev(&entry->offset_index);
1752                         if (n) {
1753                                 prev = rb_entry(n, struct btrfs_free_space,
1754                                                 offset_index);
1755                                 if (!prev->bitmap &&
1756                                     prev->offset + prev->bytes > offset)
1757                                         entry = prev;
1758                         }
1759                 }
1760                 return entry;
1761         }
1762
1763         if (!prev)
1764                 return NULL;
1765
1766         /* find last entry before the 'offset' */
1767         entry = prev;
1768         if (entry->offset > offset) {
1769                 n = rb_prev(&entry->offset_index);
1770                 if (n) {
1771                         entry = rb_entry(n, struct btrfs_free_space,
1772                                         offset_index);
1773                         ASSERT(entry->offset <= offset);
1774                 } else {
1775                         if (fuzzy)
1776                                 return entry;
1777                         else
1778                                 return NULL;
1779                 }
1780         }
1781
1782         if (entry->bitmap) {
1783                 n = rb_prev(&entry->offset_index);
1784                 if (n) {
1785                         prev = rb_entry(n, struct btrfs_free_space,
1786                                         offset_index);
1787                         if (!prev->bitmap &&
1788                             prev->offset + prev->bytes > offset)
1789                                 return prev;
1790                 }
1791                 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1792                         return entry;
1793         } else if (entry->offset + entry->bytes > offset)
1794                 return entry;
1795
1796         if (!fuzzy)
1797                 return NULL;
1798
1799         while (1) {
1800                 n = rb_next(&entry->offset_index);
1801                 if (!n)
1802                         return NULL;
1803                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
1804                 if (entry->bitmap) {
1805                         if (entry->offset + BITS_PER_BITMAP *
1806                             ctl->unit > offset)
1807                                 break;
1808                 } else {
1809                         if (entry->offset + entry->bytes > offset)
1810                                 break;
1811                 }
1812         }
1813         return entry;
1814 }
1815
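     /*
      * Remove @info from both the offset-indexed and bytes-indexed trees and
      * adjust the extent counters.  When @update_stat is true the entry's
      * bytes are also subtracted from ctl->free_space.
      */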
1816 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1817                                      struct btrfs_free_space *info,
1818                                      bool update_stat)
1819 {
1820         lockdep_assert_held(&ctl->tree_lock);
1821
1822         rb_erase(&info->offset_index, &ctl->free_space_offset);
1823         rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1824         ctl->free_extents--;
1825
1826         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1827                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
1828                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1829         }
1830
1831         if (update_stat)
1832                 ctl->free_space -= info->bytes;
1833 }
1834
1835 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1836                            struct btrfs_free_space *info)
1837 {
1838         int ret = 0;
1839
1840         lockdep_assert_held(&ctl->tree_lock);
1841
1842         ASSERT(info->bytes || info->bitmap);
1843         ret = tree_insert_offset(ctl, NULL, info);
1844         if (ret)
1845                 return ret;
1846
1847         rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1848
1849         if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1850                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
1851                 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1852         }
1853
1854         ctl->free_space += info->bytes;
1855         ctl->free_extents++;
1856         return ret;
1857 }
1858
1859 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
1860                                 struct btrfs_free_space *info)
1861 {
1862         ASSERT(info->bitmap);
1863
1864         /*
1865          * If our entry is empty it's because we're on a cluster and we don't
1866          * want to re-link it into our ctl bytes index.
1867          */
1868         if (RB_EMPTY_NODE(&info->bytes_index))
1869                 return;
1870
1871         lockdep_assert_held(&ctl->tree_lock);
1872
1873         rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1874         rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1875 }
1876
1877 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1878                                      struct btrfs_free_space *info,
1879                                      u64 offset, u64 bytes, bool update_stat)
1880 {
1881         unsigned long start, count, end;
1882         int extent_delta = -1;
1883
1884         start = offset_to_bit(info->offset, ctl->unit, offset);
1885         count = bytes_to_bits(bytes, ctl->unit);
1886         end = start + count;
1887         ASSERT(end <= BITS_PER_BITMAP);
1888
1889         bitmap_clear(info->bitmap, start, count);
1890
1891         info->bytes -= bytes;
1892         if (info->max_extent_size > ctl->unit)
1893                 info->max_extent_size = 0;
1894
1895         relink_bitmap_entry(ctl, info);
1896
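             /*
              * Track how the number of free extents in this bitmap changes:
              * clearing the run normally removes one extent (extent_delta
              * starts at -1).  Each adjacent bit that is still set means free
              * space survives on that side, bumping the delta; with set bits
              * on both sides the clear splits one extent into two (net +1).
              */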
1897         if (start && test_bit(start - 1, info->bitmap))
1898                 extent_delta++;
1899
1900         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1901                 extent_delta++;
1902
1903         info->bitmap_extents += extent_delta;
1904         if (!btrfs_free_space_trimmed(info)) {
1905                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1906                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1907         }
1908
1909         if (update_stat)
1910                 ctl->free_space -= bytes;
1911 }
1912
1913 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1914                             struct btrfs_free_space *info, u64 offset,
1915                             u64 bytes)
1916 {
1917         unsigned long start, count, end;
1918         int extent_delta = 1;
1919
1920         start = offset_to_bit(info->offset, ctl->unit, offset);
1921         count = bytes_to_bits(bytes, ctl->unit);
1922         end = start + count;
1923         ASSERT(end <= BITS_PER_BITMAP);
1924
1925         bitmap_set(info->bitmap, start, count);
1926
1927         /*
1928          * We set some bytes, so we have no idea what the max extent size is
1929          * anymore.
1930          */
1931         info->max_extent_size = 0;
1932         info->bytes += bytes;
1933         ctl->free_space += bytes;
1934
1935         relink_bitmap_entry(ctl, info);
1936
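             /*
              * Mirror of bitmap_clear_bits(): setting the run normally adds
              * one free extent (extent_delta starts at +1), but each adjacent
              * bit that was already set means we merged with existing free
              * space on that side; merging on both sides joins two extents
              * into one (net -1).
              */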
1937         if (start && test_bit(start - 1, info->bitmap))
1938                 extent_delta--;
1939
1940         if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1941                 extent_delta--;
1942
1943         info->bitmap_extents += extent_delta;
1944         if (!btrfs_free_space_trimmed(info)) {
1945                 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1946                 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1947         }
1948 }
1949
1950 /*
1951  * If we cannot find a suitable extent, we will use bytes to record
1952  * the size of the max extent.
1953  */
1954 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1955                          struct btrfs_free_space *bitmap_info, u64 *offset,
1956                          u64 *bytes, bool for_alloc)
1957 {
1958         unsigned long found_bits = 0;
1959         unsigned long max_bits = 0;
1960         unsigned long bits, i;
1961         unsigned long next_zero;
1962         unsigned long extent_bits;
1963
1964         /*
1965          * Skip searching the bitmap if we don't have a contiguous section that
1966          * is large enough for this allocation.
1967          */
1968         if (for_alloc &&
1969             bitmap_info->max_extent_size &&
1970             bitmap_info->max_extent_size < *bytes) {
1971                 *bytes = bitmap_info->max_extent_size;
1972                 return -1;
1973         }
1974
1975         i = offset_to_bit(bitmap_info->offset, ctl->unit,
1976                           max_t(u64, *offset, bitmap_info->offset));
1977         bits = bytes_to_bits(*bytes, ctl->unit);
1978
1979         for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1980                 if (for_alloc && bits == 1) {
1981                         found_bits = 1;
1982                         break;
1983                 }
1984                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1985                                                BITS_PER_BITMAP, i);
1986                 extent_bits = next_zero - i;
1987                 if (extent_bits >= bits) {
1988                         found_bits = extent_bits;
1989                         break;
1990                 } else if (extent_bits > max_bits) {
1991                         max_bits = extent_bits;
1992                 }
1993                 i = next_zero;
1994         }
1995
1996         if (found_bits) {
1997                 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1998                 *bytes = (u64)(found_bits) * ctl->unit;
1999                 return 0;
2000         }
2001
2002         *bytes = (u64)(max_bits) * ctl->unit;
2003         bitmap_info->max_extent_size = *bytes;
2004         relink_bitmap_entry(ctl, bitmap_info);
2005         return -1;
2006 }
2007
2008 /* On failure, cache the size of the largest free space chunk seen in *max_extent_size. */
2009 static struct btrfs_free_space *
2010 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
2011                 unsigned long align, u64 *max_extent_size, bool use_bytes_index)
2012 {
2013         struct btrfs_free_space *entry;
2014         struct rb_node *node;
2015         u64 tmp;
2016         u64 align_off;
2017         int ret;
2018
2019         if (!ctl->free_space_offset.rb_node)
2020                 goto out;
2021 again:
2022         if (use_bytes_index) {
2023                 node = rb_first_cached(&ctl->free_space_bytes);
2024         } else {
2025                 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
2026                                            0, 1);
2027                 if (!entry)
2028                         goto out;
2029                 node = &entry->offset_index;
2030         }
2031
2032         for (; node; node = rb_next(node)) {
2033                 if (use_bytes_index)
2034                         entry = rb_entry(node, struct btrfs_free_space,
2035                                          bytes_index);
2036                 else
2037                         entry = rb_entry(node, struct btrfs_free_space,
2038                                          offset_index);
2039
2040                 /*
2041                  * If we are using the bytes index then all subsequent entries
2042                  * in this tree are going to be < bytes, so simply set the max
2043                  * extent size and exit the loop.
2044                  *
2045                  * If we're using the offset index then we need to keep going
2046                  * through the rest of the tree.
2047                  */
2048                 if (entry->bytes < *bytes) {
2049                         *max_extent_size = max(get_max_extent_size(entry),
2050                                                *max_extent_size);
2051                         if (use_bytes_index)
2052                                 break;
2053                         continue;
2054                 }
2055
2056                 /* make sure the space returned is big enough
2057                  * to match our requested alignment
2058                  */
2059                 if (*bytes >= align) {
2060                         tmp = entry->offset - ctl->start + align - 1;
2061                         tmp = div64_u64(tmp, align);
2062                         tmp = tmp * align + ctl->start;
2063                         align_off = tmp - entry->offset;
2064                 } else {
2065                         align_off = 0;
2066                         tmp = entry->offset;
2067                 }
2068
2069                 /*
2070                  * We don't break here if we're using the bytes index because we
2071                  * may have another entry that has the correct alignment that is
2072                  * the right size, so we don't want to miss that possibility.
2073                  * At worst this adds another loop through the logic, but if we
2074                  * broke here we could prematurely ENOSPC.
2075                  */
2076                 if (entry->bytes < *bytes + align_off) {
2077                         *max_extent_size = max(get_max_extent_size(entry),
2078                                                *max_extent_size);
2079                         continue;
2080                 }
2081
2082                 if (entry->bitmap) {
2083                         struct rb_node *old_next = rb_next(node);
2084                         u64 size = *bytes;
2085
2086                         ret = search_bitmap(ctl, entry, &tmp, &size, true);
2087                         if (!ret) {
2088                                 *offset = tmp;
2089                                 *bytes = size;
2090                                 return entry;
2091                         } else {
2092                                 *max_extent_size =
2093                                         max(get_max_extent_size(entry),
2094                                             *max_extent_size);
2095                         }
2096
2097                         /*
2098                          * The bitmap may have gotten re-arranged in the space
2099                          * index here because the max_extent_size may have been
2100                          * updated.  Start from the beginning again if this
2101                          * happened.
2102                          */
2103                         if (use_bytes_index && old_next != rb_next(node))
2104                                 goto again;
2105                         continue;
2106                 }
2107
2108                 *offset = tmp;
2109                 *bytes = entry->bytes - align_off;
2110                 return entry;
2111         }
2112 out:
2113         return NULL;
2114 }
2115
2116 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
2117                            struct btrfs_free_space *info, u64 offset)
2118 {
2119         info->offset = offset_to_bitmap(ctl, offset);
2120         info->bytes = 0;
2121         info->bitmap_extents = 0;
2122         INIT_LIST_HEAD(&info->list);
2123         link_free_space(ctl, info);
2124         ctl->total_bitmaps++;
2125         recalculate_thresholds(ctl);
2126 }
2127
2128 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
2129                         struct btrfs_free_space *bitmap_info)
2130 {
2131         /*
2132          * Normally when this is called, the bitmap is completely empty. However,
2133          * if we are blowing up the free space cache for one reason or another
2134          * via __btrfs_remove_free_space_cache(), then it may not be empty and
2135          * we may leave stats on the table.
2136          */
2137         if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
2138                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
2139                         bitmap_info->bitmap_extents;
2140                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
2141
2142         }
2143         unlink_free_space(ctl, bitmap_info, true);
2144         kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
2145         kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
2146         ctl->total_bitmaps--;
2147         recalculate_thresholds(ctl);
2148 }
2149
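     /*
      * Clear *bytes of space starting at *offset from @bitmap_info, walking
      * into following bitmap entries if the range spans more than one bitmap.
      * Returns 0 when everything was removed, -EAGAIN when the caller should
      * retry via the extent path, and -EINVAL if the requested space is not
      * actually marked free.
      */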
2150 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
2151                               struct btrfs_free_space *bitmap_info,
2152                               u64 *offset, u64 *bytes)
2153 {
2154         u64 end;
2155         u64 search_start, search_bytes;
2156         int ret;
2157
2158 again:
2159         end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
2160
2161         /*
2162          * We need to search for bits in this bitmap.  We could only cover some
2163          * of the extent in this bitmap thanks to how we add space, so we need
2164          * to search for as much as it as we can and clear that amount, and then
2165          * go searching for the next bit.
2166          */
2167         search_start = *offset;
2168         search_bytes = ctl->unit;
2169         search_bytes = min(search_bytes, end - search_start + 1);
2170         ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2171                             false);
2172         if (ret < 0 || search_start != *offset)
2173                 return -EINVAL;
2174
2175         /* We may have found more bits than what we need */
2176         search_bytes = min(search_bytes, *bytes);
2177
2178         /* Cannot clear past the end of the bitmap */
2179         search_bytes = min(search_bytes, end - search_start + 1);
2180
2181         bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
2182         *offset += search_bytes;
2183         *bytes -= search_bytes;
2184
2185         if (*bytes) {
2186                 struct rb_node *next = rb_next(&bitmap_info->offset_index);
2187                 if (!bitmap_info->bytes)
2188                         free_bitmap(ctl, bitmap_info);
2189
2190                 /*
2191                  * no entry after this bitmap, but we still have bytes to
2192                  * remove, so something has gone wrong.
2193                  */
2194                 if (!next)
2195                         return -EINVAL;
2196
2197                 bitmap_info = rb_entry(next, struct btrfs_free_space,
2198                                        offset_index);
2199
2200                 /*
2201                  * if the next entry isn't a bitmap we need to return to let the
2202                  * extent stuff do its work.
2203                  */
2204                 if (!bitmap_info->bitmap)
2205                         return -EAGAIN;
2206
2207                 /*
2208                  * Ok the next item is a bitmap, but it may not actually hold
2209                  * the information for the rest of this free space stuff, so
2210                  * look for it, and if we don't find it return so we can try
2211                  * everything over again.
2212                  */
2213                 search_start = *offset;
2214                 search_bytes = ctl->unit;
2215                 ret = search_bitmap(ctl, bitmap_info, &search_start,
2216                                     &search_bytes, false);
2217                 if (ret < 0 || search_start != *offset)
2218                         return -EAGAIN;
2219
2220                 goto again;
2221         } else if (!bitmap_info->bytes)
2222                 free_bitmap(ctl, bitmap_info);
2223
2224         return 0;
2225 }
2226
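     /*
      * Set as much of [offset, offset + bytes) as fits inside @info's bitmap
      * window and return the number of bytes actually set; the caller loops
      * over further bitmaps for any remainder.
      */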
2227 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2228                                struct btrfs_free_space *info, u64 offset,
2229                                u64 bytes, enum btrfs_trim_state trim_state)
2230 {
2231         u64 bytes_to_set = 0;
2232         u64 end;
2233
2234         /*
2235          * This is a tradeoff to make bitmap trim state minimal.  We mark the
2236          * whole bitmap untrimmed if at any point we add untrimmed regions.
2237          */
2238         if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2239                 if (btrfs_free_space_trimmed(info)) {
2240                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
2241                                 info->bitmap_extents;
2242                         ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2243                 }
2244                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2245         }
2246
2247         end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2248
2249         bytes_to_set = min(end - offset, bytes);
2250
2251         bitmap_set_bits(ctl, info, offset, bytes_to_set);
2252
2253         return bytes_to_set;
2254
2255 }
2256
2257 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2258                       struct btrfs_free_space *info)
2259 {
2260         struct btrfs_block_group *block_group = ctl->block_group;
2261         struct btrfs_fs_info *fs_info = block_group->fs_info;
2262         bool forced = false;
2263
2264 #ifdef CONFIG_BTRFS_DEBUG
2265         if (btrfs_should_fragment_free_space(block_group))
2266                 forced = true;
2267 #endif
2268
2269         /* This is a way to reclaim large regions from the bitmaps. */
2270         if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2271                 return false;
2272
2273         /*
2274          * If we are below the extents threshold then we can add this as an
2275          * extent, and don't have to deal with the bitmap
2276          */
2277         if (!forced && ctl->free_extents < ctl->extents_thresh) {
2278                 /*
2279                  * If this block group has some small extents we don't want to
2280                  * use up all of our free slots in the cache with them; we want
2281                  * to reserve them for larger extents.  However, if we have plenty
2282                  * of cache left then go ahead and add them, no sense in adding
2283                  * the overhead of a bitmap if we don't have to.
2284                  */
2285                 if (info->bytes <= fs_info->sectorsize * 8) {
2286                         if (ctl->free_extents * 3 <= ctl->extents_thresh)
2287                                 return false;
2288                 } else {
2289                         return false;
2290                 }
2291         }
2292
2293         /*
2294          * The original block groups from mkfs can be really small, like 8
2295          * megabytes, so don't bother with a bitmap for those entries.  However
2296          * some block groups can be smaller than what a bitmap would cover but
2297          * are still large enough that they could overflow the 32k memory limit,
2298          * so allow those block groups to still be allowed to have a bitmap
2299          * entry.
2300          */
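             /*
              * E.g. with 4K pages and a 4K sectorsize a bitmap spans 128M, so
              * block groups smaller than 64M never get bitmap entries here.
              */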
2301         if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2302                 return false;
2303
2304         return true;
2305 }
2306
2307 static const struct btrfs_free_space_op free_space_op = {
2308         .use_bitmap             = use_bitmap,
2309 };
2310
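     /*
      * Try to add the space described by @info to bitmap entries.  Returns 0
      * if bitmaps should not be used for this range (the caller then links
      * @info as an extent entry), 1 if all of the space went into bitmaps
      * (@info is consumed or freed here), or a negative errno on failure.
      */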
2311 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2312                               struct btrfs_free_space *info)
2313 {
2314         struct btrfs_free_space *bitmap_info;
2315         struct btrfs_block_group *block_group = NULL;
2316         int added = 0;
2317         u64 bytes, offset, bytes_added;
2318         enum btrfs_trim_state trim_state;
2319         int ret;
2320
2321         bytes = info->bytes;
2322         offset = info->offset;
2323         trim_state = info->trim_state;
2324
2325         if (!ctl->op->use_bitmap(ctl, info))
2326                 return 0;
2327
2328         if (ctl->op == &free_space_op)
2329                 block_group = ctl->block_group;
2330 again:
2331         /*
2332          * Since we link bitmaps right into the cluster we need to see if we
2333          * have a cluster here, and if so and it has our bitmap we need to add
2334          * the free space to that bitmap.
2335          */
2336         if (block_group && !list_empty(&block_group->cluster_list)) {
2337                 struct btrfs_free_cluster *cluster;
2338                 struct rb_node *node;
2339                 struct btrfs_free_space *entry;
2340
2341                 cluster = list_entry(block_group->cluster_list.next,
2342                                      struct btrfs_free_cluster,
2343                                      block_group_list);
2344                 spin_lock(&cluster->lock);
2345                 node = rb_first(&cluster->root);
2346                 if (!node) {
2347                         spin_unlock(&cluster->lock);
2348                         goto no_cluster_bitmap;
2349                 }
2350
2351                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2352                 if (!entry->bitmap) {
2353                         spin_unlock(&cluster->lock);
2354                         goto no_cluster_bitmap;
2355                 }
2356
2357                 if (entry->offset == offset_to_bitmap(ctl, offset)) {
2358                         bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2359                                                           bytes, trim_state);
2360                         bytes -= bytes_added;
2361                         offset += bytes_added;
2362                 }
2363                 spin_unlock(&cluster->lock);
2364                 if (!bytes) {
2365                         ret = 1;
2366                         goto out;
2367                 }
2368         }
2369
2370 no_cluster_bitmap:
2371         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2372                                          1, 0);
2373         if (!bitmap_info) {
2374                 ASSERT(added == 0);
2375                 goto new_bitmap;
2376         }
2377
2378         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2379                                           trim_state);
2380         bytes -= bytes_added;
2381         offset += bytes_added;
2382         added = 0;
2383
2384         if (!bytes) {
2385                 ret = 1;
2386                 goto out;
2387         } else
2388                 goto again;
2389
2390 new_bitmap:
2391         if (info && info->bitmap) {
2392                 add_new_bitmap(ctl, info, offset);
2393                 added = 1;
2394                 info = NULL;
2395                 goto again;
2396         } else {
2397                 spin_unlock(&ctl->tree_lock);
2398
2399                 /* no pre-allocated info, allocate a new one */
2400                 if (!info) {
2401                         info = kmem_cache_zalloc(btrfs_free_space_cachep,
2402                                                  GFP_NOFS);
2403                         if (!info) {
2404                                 spin_lock(&ctl->tree_lock);
2405                                 ret = -ENOMEM;
2406                                 goto out;
2407                         }
2408                 }
2409
2410                 /* allocate the bitmap */
2411                 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2412                                                  GFP_NOFS);
2413                 info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2414                 spin_lock(&ctl->tree_lock);
2415                 if (!info->bitmap) {
2416                         ret = -ENOMEM;
2417                         goto out;
2418                 }
2419                 goto again;
2420         }
2421
2422 out:
2423         if (info) {
2424                 if (info->bitmap)
2425                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
2426                                         info->bitmap);
2427                 kmem_cache_free(btrfs_free_space_cachep, info);
2428         }
2429
2430         return ret;
2431 }
2432
2433 /*
2434  * Free space merging rules:
2435  *  1) Merge trimmed areas together
2436  *  2) Let untrimmed areas coalesce with trimmed areas
2437  *  3) Always pull neighboring regions from bitmaps
2438  *
2439  * The above rules are for when we merge free space based on btrfs_trim_state.
2440  * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2441  * same reason: to promote larger extent regions which makes life easier for
2442  * find_free_extent().  Rule 2 enables coalescing based on the common path
2443  * being returning free space from btrfs_finish_extent_commit().  So when free
2444  * space is trimmed, it will prevent aggregating trimmed new region and
2445  * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
2446  * and provide find_free_extent() with the largest extents possible hoping for
2447  * the reuse path.
2448  */
2449 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2450                           struct btrfs_free_space *info, bool update_stat)
2451 {
2452         struct btrfs_free_space *left_info = NULL;
2453         struct btrfs_free_space *right_info;
2454         bool merged = false;
2455         u64 offset = info->offset;
2456         u64 bytes = info->bytes;
2457         const bool is_trimmed = btrfs_free_space_trimmed(info);
2458         struct rb_node *right_prev = NULL;
2459
2460         /*
2461          * First we want to see if there is free space adjacent to the range we
2462          * are adding; if there is, remove that struct and add a new one to
2463          * cover the entire range.
2464          */
2465         right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2466         if (right_info)
2467                 right_prev = rb_prev(&right_info->offset_index);
2468
2469         if (right_prev)
2470                 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index);
2471         else if (!right_info)
2472                 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2473
2474         /* See try_merge_free_space() comment. */
2475         if (right_info && !right_info->bitmap &&
2476             (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2477                 unlink_free_space(ctl, right_info, update_stat);
2478                 info->bytes += right_info->bytes;
2479                 kmem_cache_free(btrfs_free_space_cachep, right_info);
2480                 merged = true;
2481         }
2482
2483         /* See try_merge_free_space() comment. */
2484         if (left_info && !left_info->bitmap &&
2485             left_info->offset + left_info->bytes == offset &&
2486             (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2487                 unlink_free_space(ctl, left_info, update_stat);
2488                 info->offset = left_info->offset;
2489                 info->bytes += left_info->bytes;
2490                 kmem_cache_free(btrfs_free_space_cachep, left_info);
2491                 merged = true;
2492         }
2493
2494         return merged;
2495 }
2496
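     /*
      * If the bitmap covering the space right after @info has free bits
      * starting at @info's end, pull that contiguous run out of the bitmap and
      * append it to @info, so the space is represented by a single extent
      * entry.  Returns true if any bytes were stolen.
      */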
2497 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2498                                      struct btrfs_free_space *info,
2499                                      bool update_stat)
2500 {
2501         struct btrfs_free_space *bitmap;
2502         unsigned long i;
2503         unsigned long j;
2504         const u64 end = info->offset + info->bytes;
2505         const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2506         u64 bytes;
2507
2508         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2509         if (!bitmap)
2510                 return false;
2511
2512         i = offset_to_bit(bitmap->offset, ctl->unit, end);
2513         j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2514         if (j == i)
2515                 return false;
2516         bytes = (j - i) * ctl->unit;
2517         info->bytes += bytes;
2518
2519         /* See try_merge_free_space() comment. */
2520         if (!btrfs_free_space_trimmed(bitmap))
2521                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2522
2523         bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
2524
2525         if (!bitmap->bytes)
2526                 free_bitmap(ctl, bitmap);
2527
2528         return true;
2529 }
2530
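     /*
      * Counterpart of steal_from_bitmap_to_end(): pull the run of free bits
      * that ends immediately before @info->offset out of the covering bitmap
      * and prepend it to @info.  Returns true if any bytes were stolen.
      */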
2531 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2532                                        struct btrfs_free_space *info,
2533                                        bool update_stat)
2534 {
2535         struct btrfs_free_space *bitmap;
2536         u64 bitmap_offset;
2537         unsigned long i;
2538         unsigned long j;
2539         unsigned long prev_j;
2540         u64 bytes;
2541
2542         bitmap_offset = offset_to_bitmap(ctl, info->offset);
2543         /* If we're on a boundary, try the previous logical bitmap. */
2544         if (bitmap_offset == info->offset) {
2545                 if (info->offset == 0)
2546                         return false;
2547                 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2548         }
2549
2550         bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2551         if (!bitmap)
2552                 return false;
2553
2554         i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2555         j = 0;
2556         prev_j = (unsigned long)-1;
2557         for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2558                 if (j > i)
2559                         break;
2560                 prev_j = j;
2561         }
2562         if (prev_j == i)
2563                 return false;
2564
2565         if (prev_j == (unsigned long)-1)
2566                 bytes = (i + 1) * ctl->unit;
2567         else
2568                 bytes = (i - prev_j) * ctl->unit;
2569
2570         info->offset -= bytes;
2571         info->bytes += bytes;
2572
2573         /* See try_merge_free_space() comment. */
2574         if (!btrfs_free_space_trimmed(bitmap))
2575                 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2576
2577         bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
2578
2579         if (!bitmap->bytes)
2580                 free_bitmap(ctl, bitmap);
2581
2582         return true;
2583 }
2584
2585 /*
2586  * We prefer always to allocate from extent entries, both for clustered and
2587  * non-clustered allocation requests. So when attempting to add a new extent
2588  * entry, try to see if there's adjacent free space in bitmap entries, and if
2589  * there is, migrate that space from the bitmaps to the extent.
2590  * Like this we get better chances of satisfying space allocation requests
2591  * because we attempt to satisfy them based on a single cache entry, and never
2592  * on 2 or more entries - even if the entries represent a contiguous free space
2593  * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2594  * ends).
2595  */
2596 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2597                               struct btrfs_free_space *info,
2598                               bool update_stat)
2599 {
2600         /*
2601          * Only work with disconnected entries, as we can change their offset,
2602          * and they must be extent entries.
2603          */
2604         ASSERT(!info->bitmap);
2605         ASSERT(RB_EMPTY_NODE(&info->offset_index));
2606
2607         if (ctl->total_bitmaps > 0) {
2608                 bool stole_end;
2609                 bool stole_front = false;
2610
2611                 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2612                 if (ctl->total_bitmaps > 0)
2613                         stole_front = steal_from_bitmap_to_front(ctl, info,
2614                                                                  update_stat);
2615
2616                 if (stole_end || stole_front)
2617                         try_merge_free_space(ctl, info, update_stat);
2618         }
2619 }
2620
2621 int __btrfs_add_free_space(struct btrfs_block_group *block_group,
2622                            u64 offset, u64 bytes,
2623                            enum btrfs_trim_state trim_state)
2624 {
2625         struct btrfs_fs_info *fs_info = block_group->fs_info;
2626         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2627         struct btrfs_free_space *info;
2628         int ret = 0;
2629         u64 filter_bytes = bytes;
2630
2631         ASSERT(!btrfs_is_zoned(fs_info));
2632
2633         info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2634         if (!info)
2635                 return -ENOMEM;
2636
2637         info->offset = offset;
2638         info->bytes = bytes;
2639         info->trim_state = trim_state;
2640         RB_CLEAR_NODE(&info->offset_index);
2641         RB_CLEAR_NODE(&info->bytes_index);
2642
2643         spin_lock(&ctl->tree_lock);
2644
2645         if (try_merge_free_space(ctl, info, true))
2646                 goto link;
2647
2648         /*
2649          * There was no extent directly to the left or right of this new
2650          * extent, so we know we're going to have to allocate a new extent;
2651          * before we do that, see if we need to drop this into a bitmap.
2652          */
2653         ret = insert_into_bitmap(ctl, info);
2654         if (ret < 0) {
2655                 goto out;
2656         } else if (ret) {
2657                 ret = 0;
2658                 goto out;
2659         }
2660 link:
2661         /*
2662          * Only steal free space from adjacent bitmaps if we're sure we're not
2663          * going to add the new free space to existing bitmap entries - because
2664          * that would mean unnecessary work that would be reverted. Therefore
2665          * attempt to steal space from bitmaps if we're adding an extent entry.
2666          */
2667         steal_from_bitmap(ctl, info, true);
2668
2669         filter_bytes = max(filter_bytes, info->bytes);
2670
2671         ret = link_free_space(ctl, info);
2672         if (ret)
2673                 kmem_cache_free(btrfs_free_space_cachep, info);
2674 out:
2675         btrfs_discard_update_discardable(block_group);
2676         spin_unlock(&ctl->tree_lock);
2677
2678         if (ret) {
2679                 btrfs_crit(fs_info, "unable to add free space :%d", ret);
2680                 ASSERT(ret != -EEXIST);
2681         }
2682
2683         if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2684                 btrfs_discard_check_filter(block_group, filter_bytes);
2685                 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2686         }
2687
2688         return ret;
2689 }
2690
2691 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2692                                         u64 bytenr, u64 size, bool used)
2693 {
2694         struct btrfs_space_info *sinfo = block_group->space_info;
2695         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2696         u64 offset = bytenr - block_group->start;
2697         u64 to_free, to_unusable;
2698         int bg_reclaim_threshold = 0;
2699         bool initial = (size == block_group->length);
2700         u64 reclaimable_unusable;
2701
2702         WARN_ON(!initial && offset + size > block_group->zone_capacity);
2703
2704         if (!initial)
2705                 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
2706
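             /*
              * Split @size into a free part and a zone_unusable part.  For a
              * used region, bytes below alloc_offset have already been written
              * and can only be reclaimed by a zone reset, so only the part at
              * or above alloc_offset goes back into ->free_space.
              */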
2707         spin_lock(&ctl->tree_lock);
2708         /* Count initial region as zone_unusable until it gets activated. */
2709         if (!used)
2710                 to_free = size;
2711         else if (initial &&
2712                  test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) &&
2713                  (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
2714                 to_free = 0;
2715         else if (initial)
2716                 to_free = block_group->zone_capacity;
2717         else if (offset >= block_group->alloc_offset)
2718                 to_free = size;
2719         else if (offset + size <= block_group->alloc_offset)
2720                 to_free = 0;
2721         else
2722                 to_free = offset + size - block_group->alloc_offset;
2723         to_unusable = size - to_free;
2724
2725         ctl->free_space += to_free;
2726         /*
2727          * If the block group is read-only, we should account freed space into
2728          * bytes_readonly.
2729          */
2730         if (!block_group->ro)
2731                 block_group->zone_unusable += to_unusable;
2732         spin_unlock(&ctl->tree_lock);
2733         if (!used) {
2734                 spin_lock(&block_group->lock);
2735                 block_group->alloc_offset -= size;
2736                 spin_unlock(&block_group->lock);
2737         }
2738
2739         reclaimable_unusable = block_group->zone_unusable -
2740                                (block_group->length - block_group->zone_capacity);
2741         /* The whole region is now unusable. Mark it as unused and reclaim it. */
2742         if (block_group->zone_unusable == block_group->length &&
2743             block_group->alloc_offset) {
2744                 btrfs_mark_bg_unused(block_group);
2745         } else if (bg_reclaim_threshold &&
2746                    reclaimable_unusable >=
2747                    mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
2748                 btrfs_mark_bg_to_reclaim(block_group);
2749         }
2750
2751         return 0;
2752 }
2753
2754 int btrfs_add_free_space(struct btrfs_block_group *block_group,
2755                          u64 bytenr, u64 size)
2756 {
2757         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2758
2759         if (btrfs_is_zoned(block_group->fs_info))
2760                 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2761                                                     true);
2762
2763         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2764                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2765
2766         return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2767 }
2768
2769 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2770                                 u64 bytenr, u64 size)
2771 {
2772         if (btrfs_is_zoned(block_group->fs_info))
2773                 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2774                                                     false);
2775
2776         return btrfs_add_free_space(block_group, bytenr, size);
2777 }
2778
2779 /*
2780  * This is a subtle distinction because when adding free space back in general,
2781  * we want it to be added as untrimmed for async discard. But in the case where we add
2782  * it on loading of a block group, we want to consider it trimmed.
2783  */
2784 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2785                                        u64 bytenr, u64 size)
2786 {
2787         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2788
2789         if (btrfs_is_zoned(block_group->fs_info))
2790                 return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2791                                                     true);
2792
2793         if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2794             btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2795                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
2796
2797         return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2798 }
2799
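/*
 * Remove the range [offset, offset + bytes) from the in-memory free space
 * cache.  The range may span multiple extent entries and/or bitmaps, so we
 * loop until all of it has been removed.  On zoned filesystems there is no
 * free space cache to update; only advance the allocation offset past the
 * range if needed (see the log replay comment below).
 */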
2800 int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2801                             u64 offset, u64 bytes)
2802 {
2803         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2804         struct btrfs_free_space *info;
2805         int ret;
2806         bool re_search = false;
2807
2808         if (btrfs_is_zoned(block_group->fs_info)) {
2809                 /*
2810                  * This can happen with conventional zones when replaying the
2811                  * log.  Since the allocation info of tree-log nodes is not
2812                  * recorded in the extent tree, calculate_alloc_pointer() fails
2813                  * to advance the allocation pointer past the last allocated
2814                  * tree-log node blocks.
2815                  *
2816                  * This function is called from
2817                  * btrfs_pin_extent_for_log_replay() when replaying the log.
2818                  * Advance the pointer so we do not overwrite the tree-log nodes.
2819                  */
2820                 if (block_group->start + block_group->alloc_offset <
2821                     offset + bytes) {
2822                         block_group->alloc_offset =
2823                                 offset + bytes - block_group->start;
2824                 }
2825                 return 0;
2826         }
2827
2828         spin_lock(&ctl->tree_lock);
2829
2830 again:
2831         ret = 0;
2832         if (!bytes)
2833                 goto out_lock;
2834
2835         info = tree_search_offset(ctl, offset, 0, 0);
2836         if (!info) {
2837                 /*
2838                  * We didn't find an extent that matches the space we want to
2839                  * remove, so look for a bitmap instead.
2840                  */
2841                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2842                                           1, 0);
2843                 if (!info) {
2844                         /*
2845                          * If we found a partial bit of our free space in a
2846                          * bitmap but then couldn't find the other part, this
2847                          * may be a problem, so WARN about it.
2848                          */
2849                         WARN_ON(re_search);
2850                         goto out_lock;
2851                 }
2852         }
2853
2854         re_search = false;
2855         if (!info->bitmap) {
2856                 unlink_free_space(ctl, info, true);
2857                 if (offset == info->offset) {
2858                         u64 to_free = min(bytes, info->bytes);
2859
2860                         info->bytes -= to_free;
2861                         info->offset += to_free;
2862                         if (info->bytes) {
2863                                 ret = link_free_space(ctl, info);
2864                                 WARN_ON(ret);
2865                         } else {
2866                                 kmem_cache_free(btrfs_free_space_cachep, info);
2867                         }
2868
2869                         offset += to_free;
2870                         bytes -= to_free;
2871                         goto again;
2872                 } else {
2873                         u64 old_end = info->bytes + info->offset;
2874
2875                         info->bytes = offset - info->offset;
2876                         ret = link_free_space(ctl, info);
2877                         WARN_ON(ret);
2878                         if (ret)
2879                                 goto out_lock;
2880
2881                         /* Not enough bytes in this entry to satisfy us */
2882                         if (old_end < offset + bytes) {
2883                                 bytes -= old_end - offset;
2884                                 offset = old_end;
2885                                 goto again;
2886                         } else if (old_end == offset + bytes) {
2887                                 /* all done */
2888                                 goto out_lock;
2889                         }
2890                         spin_unlock(&ctl->tree_lock);
2891
2892                         ret = __btrfs_add_free_space(block_group,
2893                                                      offset + bytes,
2894                                                      old_end - (offset + bytes),
2895                                                      info->trim_state);
2896                         WARN_ON(ret);
2897                         goto out;
2898                 }
2899         }
2900
2901         ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2902         if (ret == -EAGAIN) {
2903                 re_search = true;
2904                 goto again;
2905         }
2906 out_lock:
2907         btrfs_discard_update_discardable(block_group);
2908         spin_unlock(&ctl->tree_lock);
2909 out:
2910         return ret;
2911 }
2912
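/*
 * Dump all free space entries of a block group to the system log and count
 * how many of them are at least @bytes in size.  This is a debugging aid,
 * typically used when reporting allocation failures.
 */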
2913 void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2914                            u64 bytes)
2915 {
2916         struct btrfs_fs_info *fs_info = block_group->fs_info;
2917         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2918         struct btrfs_free_space *info;
2919         struct rb_node *n;
2920         int count = 0;
2921
2922         /*
2923          * Zoned btrfs does not use the in-memory free space cache or clusters.
2924          * Just print out the free space after the allocation offset.
2925          */
2926         if (btrfs_is_zoned(fs_info)) {
2927                 btrfs_info(fs_info, "free space %llu active %d",
2928                            block_group->zone_capacity - block_group->alloc_offset,
2929                            test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
2930                                     &block_group->runtime_flags));
2931                 return;
2932         }
2933
2934         spin_lock(&ctl->tree_lock);
2935         for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2936                 info = rb_entry(n, struct btrfs_free_space, offset_index);
2937                 if (info->bytes >= bytes && !block_group->ro)
2938                         count++;
2939                 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2940                            info->offset, info->bytes,
2941                            (info->bitmap) ? "yes" : "no");
2942         }
2943         spin_unlock(&ctl->tree_lock);
2944         btrfs_info(fs_info, "block group has cluster?: %s",
2945                    list_empty(&block_group->cluster_list) ? "no" : "yes");
2946         btrfs_info(fs_info,
2947                    "%d free space entries at or larger than %llu bytes", count, bytes);
2948 }
2949
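/*
 * Initialize the in-memory free space control structure of a block group:
 * locks, rbtree roots, the trimming list and the threshold at which extent
 * entries get converted over to bitmaps.
 */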
2950 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2951                                struct btrfs_free_space_ctl *ctl)
2952 {
2953         struct btrfs_fs_info *fs_info = block_group->fs_info;
2954
2955         spin_lock_init(&ctl->tree_lock);
2956         ctl->unit = fs_info->sectorsize;
2957         ctl->start = block_group->start;
2958         ctl->block_group = block_group;
2959         ctl->op = &free_space_op;
2960         ctl->free_space_bytes = RB_ROOT_CACHED;
2961         INIT_LIST_HEAD(&ctl->trimming_ranges);
2962         mutex_init(&ctl->cache_writeout_mutex);
2963
2964         /*
2965          * We only want to have 32k of RAM per block group for keeping track
2966          * of free space.  If we pass 1/2 of that we want to start converting
2967          * things over to using bitmaps.
2968          */
2969         ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2970 }
2971
2972 /*
2973  * For a given cluster, put all of its extents back into the free
2974  * space cache.  If the block group passed doesn't match the block group
2975  * pointed to by the cluster, someone else raced in and freed the
2976  * cluster already.  In that case, we just return without changing anything.
2977  */
2978 static void __btrfs_return_cluster_to_free_space(
2979                              struct btrfs_block_group *block_group,
2980                              struct btrfs_free_cluster *cluster)
2981 {
2982         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2983         struct rb_node *node;
2984
2985         lockdep_assert_held(&ctl->tree_lock);
2986
2987         spin_lock(&cluster->lock);
2988         if (cluster->block_group != block_group) {
2989                 spin_unlock(&cluster->lock);
2990                 return;
2991         }
2992
2993         cluster->block_group = NULL;
2994         cluster->window_start = 0;
2995         list_del_init(&cluster->block_group_list);
2996
2997         node = rb_first(&cluster->root);
2998         while (node) {
2999                 struct btrfs_free_space *entry;
3000
3001                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3002                 node = rb_next(&entry->offset_index);
3003                 rb_erase(&entry->offset_index, &cluster->root);
3004                 RB_CLEAR_NODE(&entry->offset_index);
3005
3006                 if (!entry->bitmap) {
3007                         /* Merging treats extents as if they were new */
3008                         if (!btrfs_free_space_trimmed(entry)) {
3009                                 ctl->discardable_extents[BTRFS_STAT_CURR]--;
3010                                 ctl->discardable_bytes[BTRFS_STAT_CURR] -=
3011                                         entry->bytes;
3012                         }
3013
3014                         try_merge_free_space(ctl, entry, false);
3015                         steal_from_bitmap(ctl, entry, false);
3016
3017                         /* As we insert directly, update these statistics */
3018                         if (!btrfs_free_space_trimmed(entry)) {
3019                                 ctl->discardable_extents[BTRFS_STAT_CURR]++;
3020                                 ctl->discardable_bytes[BTRFS_STAT_CURR] +=
3021                                         entry->bytes;
3022                         }
3023                 }
3024                 tree_insert_offset(ctl, NULL, entry);
3025                 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
3026                               entry_less);
3027         }
3028         cluster->root = RB_ROOT;
3029         spin_unlock(&cluster->lock);
3030         btrfs_put_block_group(block_group);
3031 }
3032
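/*
 * Drop the entire in-memory free space cache of a block group: return any
 * clusters it still has back to the cache first, then free every extent and
 * bitmap entry.
 */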
3033 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
3034 {
3035         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3036         struct btrfs_free_cluster *cluster;
3037         struct list_head *head;
3038
3039         spin_lock(&ctl->tree_lock);
3040         while ((head = block_group->cluster_list.next) !=
3041                &block_group->cluster_list) {
3042                 cluster = list_entry(head, struct btrfs_free_cluster,
3043                                      block_group_list);
3044
3045                 WARN_ON(cluster->block_group != block_group);
3046                 __btrfs_return_cluster_to_free_space(block_group, cluster);
3047
3048                 cond_resched_lock(&ctl->tree_lock);
3049         }
3050         __btrfs_remove_free_space_cache(ctl);
3051         btrfs_discard_update_discardable(block_group);
3052         spin_unlock(&ctl->tree_lock);
3053 }
3055
3056 /*
3057  * Walk @block_group's free space rb_tree to determine if everything is trimmed.
3058  */
3059 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
3060 {
3061         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3062         struct btrfs_free_space *info;
3063         struct rb_node *node;
3064         bool ret = true;
3065
3066         spin_lock(&ctl->tree_lock);
3067         node = rb_first(&ctl->free_space_offset);
3068
3069         while (node) {
3070                 info = rb_entry(node, struct btrfs_free_space, offset_index);
3071
3072                 if (!btrfs_free_space_trimmed(info)) {
3073                         ret = false;
3074                         break;
3075                 }
3076
3077                 node = rb_next(node);
3078         }
3079
3080         spin_unlock(&ctl->tree_lock);
3081         return ret;
3082 }
3083
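/*
 * Search the free space cache for a range of at least @bytes + @empty_size
 * and carve @bytes out of it, honouring the full stripe alignment.  If the
 * found extent entry starts before the aligned offset, the gap in front of it
 * is given back to the cache at the end.  For example (illustrative numbers
 * only): if an entry starts at 1M, alignment pushes the returned offset to
 * 1M + 512K and we asked for 64K, then the 512K gap at 1M is re-added as
 * free space.
 */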
3084 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
3085                                u64 offset, u64 bytes, u64 empty_size,
3086                                u64 *max_extent_size)
3087 {
3088         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3089         struct btrfs_discard_ctl *discard_ctl =
3090                                         &block_group->fs_info->discard_ctl;
3091         struct btrfs_free_space *entry = NULL;
3092         u64 bytes_search = bytes + empty_size;
3093         u64 ret = 0;
3094         u64 align_gap = 0;
3095         u64 align_gap_len = 0;
3096         enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3097         bool use_bytes_index = (offset == block_group->start);
3098
3099         ASSERT(!btrfs_is_zoned(block_group->fs_info));
3100
3101         spin_lock(&ctl->tree_lock);
3102         entry = find_free_space(ctl, &offset, &bytes_search,
3103                                 block_group->full_stripe_len, max_extent_size,
3104                                 use_bytes_index);
3105         if (!entry)
3106                 goto out;
3107
3108         ret = offset;
3109         if (entry->bitmap) {
3110                 bitmap_clear_bits(ctl, entry, offset, bytes, true);
3111
3112                 if (!btrfs_free_space_trimmed(entry))
3113                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3114
3115                 if (!entry->bytes)
3116                         free_bitmap(ctl, entry);
3117         } else {
3118                 unlink_free_space(ctl, entry, true);
3119                 align_gap_len = offset - entry->offset;
3120                 align_gap = entry->offset;
3121                 align_gap_trim_state = entry->trim_state;
3122
3123                 if (!btrfs_free_space_trimmed(entry))
3124                         atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3125
3126                 entry->offset = offset + bytes;
3127                 WARN_ON(entry->bytes < bytes + align_gap_len);
3128
3129                 entry->bytes -= bytes + align_gap_len;
3130                 if (!entry->bytes)
3131                         kmem_cache_free(btrfs_free_space_cachep, entry);
3132                 else
3133                         link_free_space(ctl, entry);
3134         }
3135 out:
3136         btrfs_discard_update_discardable(block_group);
3137         spin_unlock(&ctl->tree_lock);
3138
3139         if (align_gap_len)
3140                 __btrfs_add_free_space(block_group, align_gap, align_gap_len,
3141                                        align_gap_trim_state);
3142         return ret;
3143 }
3144
3145 /*
3146  * Given a cluster, put all of its extents back into the free space
3147  * cache.  If a block group is passed, this function will only free
3148  * a cluster that belongs to the passed block group.
3149  *
3150  * Otherwise, it'll get a reference on the block group pointed to by the
3151  * cluster and remove the cluster from it.
3152  */
3153 void btrfs_return_cluster_to_free_space(
3154                                struct btrfs_block_group *block_group,
3155                                struct btrfs_free_cluster *cluster)
3156 {
3157         struct btrfs_free_space_ctl *ctl;
3158
3159         /* first, get a safe pointer to the block group */
3160         spin_lock(&cluster->lock);
3161         if (!block_group) {
3162                 block_group = cluster->block_group;
3163                 if (!block_group) {
3164                         spin_unlock(&cluster->lock);
3165                         return;
3166                 }
3167         } else if (cluster->block_group != block_group) {
3168                 /* Someone else has already freed it, don't redo their work. */
3169                 spin_unlock(&cluster->lock);
3170                 return;
3171         }
3172         btrfs_get_block_group(block_group);
3173         spin_unlock(&cluster->lock);
3174
3175         ctl = block_group->free_space_ctl;
3176
3177         /* now return any extents the cluster had on it */
3178         spin_lock(&ctl->tree_lock);
3179         __btrfs_return_cluster_to_free_space(block_group, cluster);
3180         spin_unlock(&ctl->tree_lock);
3181
3182         btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3183
3184         /* finally drop our ref */
3185         btrfs_put_block_group(block_group);
3186 }
3187
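/*
 * Allocate @bytes from a bitmap entry that belongs to a cluster.  On success
 * the matching bits are cleared and the start offset is returned; on failure
 * 0 is returned and *max_extent_size is updated for the caller.
 */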
3188 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
3189                                    struct btrfs_free_cluster *cluster,
3190                                    struct btrfs_free_space *entry,
3191                                    u64 bytes, u64 min_start,
3192                                    u64 *max_extent_size)
3193 {
3194         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3195         int err;
3196         u64 search_start = min_start;
3197         u64 search_bytes = bytes;
3198         u64 ret = 0;
3199
3203         err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3204         if (err) {
3205                 *max_extent_size = max(get_max_extent_size(entry),
3206                                        *max_extent_size);
3207                 return 0;
3208         }
3209
3210         ret = search_start;
3211         bitmap_clear_bits(ctl, entry, ret, bytes, false);
3212
3213         return ret;
3214 }
3215
3216 /*
3217  * Given a cluster, try to allocate 'bytes' from it.  Returns 0
3218  * if it couldn't find anything suitably large, or a logical disk offset
3219  * if things worked out.
3220  */
3221 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3222                              struct btrfs_free_cluster *cluster, u64 bytes,
3223                              u64 min_start, u64 *max_extent_size)
3224 {
3225         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3226         struct btrfs_discard_ctl *discard_ctl =
3227                                         &block_group->fs_info->discard_ctl;
3228         struct btrfs_free_space *entry = NULL;
3229         struct rb_node *node;
3230         u64 ret = 0;
3231
3232         ASSERT(!btrfs_is_zoned(block_group->fs_info));
3233
3234         spin_lock(&cluster->lock);
3235         if (bytes > cluster->max_size)
3236                 goto out;
3237
3238         if (cluster->block_group != block_group)
3239                 goto out;
3240
3241         node = rb_first(&cluster->root);
3242         if (!node)
3243                 goto out;
3244
3245         entry = rb_entry(node, struct btrfs_free_space, offset_index);
3246         while (1) {
3247                 if (entry->bytes < bytes)
3248                         *max_extent_size = max(get_max_extent_size(entry),
3249                                                *max_extent_size);
3250
3251                 if (entry->bytes < bytes ||
3252                     (!entry->bitmap && entry->offset < min_start)) {
3253                         node = rb_next(&entry->offset_index);
3254                         if (!node)
3255                                 break;
3256                         entry = rb_entry(node, struct btrfs_free_space,
3257                                          offset_index);
3258                         continue;
3259                 }
3260
3261                 if (entry->bitmap) {
3262                         ret = btrfs_alloc_from_bitmap(block_group,
3263                                                       cluster, entry, bytes,
3264                                                       cluster->window_start,
3265                                                       max_extent_size);
3266                         if (ret == 0) {
3267                                 node = rb_next(&entry->offset_index);
3268                                 if (!node)
3269                                         break;
3270                                 entry = rb_entry(node, struct btrfs_free_space,
3271                                                  offset_index);
3272                                 continue;
3273                         }
3274                         cluster->window_start += bytes;
3275                 } else {
3276                         ret = entry->offset;
3277
3278                         entry->offset += bytes;
3279                         entry->bytes -= bytes;
3280                 }
3281
3282                 break;
3283         }
3284 out:
3285         spin_unlock(&cluster->lock);
3286
3287         if (!ret)
3288                 return 0;
3289
3290         spin_lock(&ctl->tree_lock);
3291
3292         if (!btrfs_free_space_trimmed(entry))
3293                 atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3294
3295         ctl->free_space -= bytes;
3296         if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3297                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3298
3299         spin_lock(&cluster->lock);
3300         if (entry->bytes == 0) {
3301                 rb_erase(&entry->offset_index, &cluster->root);
3302                 ctl->free_extents--;
3303                 if (entry->bitmap) {
3304                         kmem_cache_free(btrfs_free_space_bitmap_cachep,
3305                                         entry->bitmap);
3306                         ctl->total_bitmaps--;
3307                         recalculate_thresholds(ctl);
3308                 } else if (!btrfs_free_space_trimmed(entry)) {
3309                         ctl->discardable_extents[BTRFS_STAT_CURR]--;
3310                 }
3311                 kmem_cache_free(btrfs_free_space_cachep, entry);
3312         }
3313
3314         spin_unlock(&cluster->lock);
3315         spin_unlock(&ctl->tree_lock);
3316
3317         return ret;
3318 }
3319
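/*
 * Try to build the cluster out of a single bitmap entry: scan the bitmap for
 * runs of at least @min_bytes worth of set bits, accumulate them until at
 * least @bytes have been found and the largest run covers @cont1_bytes, then
 * move the bitmap entry from the free space cache into the cluster's rbtree.
 */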
3320 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3321                                 struct btrfs_free_space *entry,
3322                                 struct btrfs_free_cluster *cluster,
3323                                 u64 offset, u64 bytes,
3324                                 u64 cont1_bytes, u64 min_bytes)
3325 {
3326         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3327         unsigned long next_zero;
3328         unsigned long i;
3329         unsigned long want_bits;
3330         unsigned long min_bits;
3331         unsigned long found_bits;
3332         unsigned long max_bits = 0;
3333         unsigned long start = 0;
3334         unsigned long total_found = 0;
3335         int ret;
3336
3337         lockdep_assert_held(&ctl->tree_lock);
3338
3339         i = offset_to_bit(entry->offset, ctl->unit,
3340                           max_t(u64, offset, entry->offset));
3341         want_bits = bytes_to_bits(bytes, ctl->unit);
3342         min_bits = bytes_to_bits(min_bytes, ctl->unit);
3343
3344         /*
3345          * Don't bother looking for a cluster in this bitmap if it's heavily
3346          * fragmented.
3347          */
3348         if (entry->max_extent_size &&
3349             entry->max_extent_size < cont1_bytes)
3350                 return -ENOSPC;
3351 again:
3352         found_bits = 0;
3353         for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3354                 next_zero = find_next_zero_bit(entry->bitmap,
3355                                                BITS_PER_BITMAP, i);
3356                 if (next_zero - i >= min_bits) {
3357                         found_bits = next_zero - i;
3358                         if (found_bits > max_bits)
3359                                 max_bits = found_bits;
3360                         break;
3361                 }
3362                 if (next_zero - i > max_bits)
3363                         max_bits = next_zero - i;
3364                 i = next_zero;
3365         }
3366
3367         if (!found_bits) {
3368                 entry->max_extent_size = (u64)max_bits * ctl->unit;
3369                 return -ENOSPC;
3370         }
3371
3372         if (!total_found) {
3373                 start = i;
3374                 cluster->max_size = 0;
3375         }
3376
3377         total_found += found_bits;
3378
3379         if (cluster->max_size < found_bits * ctl->unit)
3380                 cluster->max_size = found_bits * ctl->unit;
3381
3382         if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3383                 i = next_zero + 1;
3384                 goto again;
3385         }
3386
3387         cluster->window_start = start * ctl->unit + entry->offset;
3388         rb_erase(&entry->offset_index, &ctl->free_space_offset);
3389         rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3390
3391         /*
3392          * We need to know if we're currently on the normal space index when we
3393          * manipulate the bitmap so that we know we need to remove and re-insert
3394          * it into the space_index tree.  Clear the bytes_index node here so the
3395          * bitmap manipulation helpers know not to mess with the space_index
3396          * until this bitmap entry is added back into the normal cache.
3397          */
3398         RB_CLEAR_NODE(&entry->bytes_index);
3399
3400         ret = tree_insert_offset(ctl, cluster, entry);
3401         ASSERT(!ret); /* -EEXIST; Logic error */
3402
3403         trace_btrfs_setup_cluster(block_group, cluster,
3404                                   total_found * ctl->unit, 1);
3405         return 0;
3406 }
3407
3408 /*
3409  * This searches the block group for just extents to fill the cluster with.
3410  * Try to find a cluster with at least @bytes total bytes, at least one
3411  * extent of @cont1_bytes, and other extents of at least @min_bytes.
3412  */
3413 static noinline int
3414 setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3415                         struct btrfs_free_cluster *cluster,
3416                         struct list_head *bitmaps, u64 offset, u64 bytes,
3417                         u64 cont1_bytes, u64 min_bytes)
3418 {
3419         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3420         struct btrfs_free_space *first = NULL;
3421         struct btrfs_free_space *entry = NULL;
3422         struct btrfs_free_space *last;
3423         struct rb_node *node;
3424         u64 window_free;
3425         u64 max_extent;
3426         u64 total_size = 0;
3427
3428         lockdep_assert_held(&ctl->tree_lock);
3429
3430         entry = tree_search_offset(ctl, offset, 0, 1);
3431         if (!entry)
3432                 return -ENOSPC;
3433
3434         /*
3435          * We don't want bitmaps, so just move along until we find a normal
3436          * extent entry.
3437          */
3438         while (entry->bitmap || entry->bytes < min_bytes) {
3439                 if (entry->bitmap && list_empty(&entry->list))
3440                         list_add_tail(&entry->list, bitmaps);
3441                 node = rb_next(&entry->offset_index);
3442                 if (!node)
3443                         return -ENOSPC;
3444                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3445         }
3446
3447         window_free = entry->bytes;
3448         max_extent = entry->bytes;
3449         first = entry;
3450         last = entry;
3451
3452         for (node = rb_next(&entry->offset_index); node;
3453              node = rb_next(&entry->offset_index)) {
3454                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3455
3456                 if (entry->bitmap) {
3457                         if (list_empty(&entry->list))
3458                                 list_add_tail(&entry->list, bitmaps);
3459                         continue;
3460                 }
3461
3462                 if (entry->bytes < min_bytes)
3463                         continue;
3464
3465                 last = entry;
3466                 window_free += entry->bytes;
3467                 if (entry->bytes > max_extent)
3468                         max_extent = entry->bytes;
3469         }
3470
3471         if (window_free < bytes || max_extent < cont1_bytes)
3472                 return -ENOSPC;
3473
3474         cluster->window_start = first->offset;
3475
3476         node = &first->offset_index;
3477
3478         /*
3479          * Now that we've found our entries, pull them out of the free space
3480          * cache and put them into the cluster rbtree.
3481          */
3482         do {
3483                 int ret;
3484
3485                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
3486                 node = rb_next(&entry->offset_index);
3487                 if (entry->bitmap || entry->bytes < min_bytes)
3488                         continue;
3489
3490                 rb_erase(&entry->offset_index, &ctl->free_space_offset);
3491                 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3492                 ret = tree_insert_offset(ctl, cluster, entry);
3493                 total_size += entry->bytes;
3494                 ASSERT(!ret); /* -EEXIST; Logic error */
3495         } while (node && entry != last);
3496
3497         cluster->max_size = max_extent;
3498         trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3499         return 0;
3500 }
3501
3502 /*
3503  * This specifically looks for bitmaps that may work in the cluster; we assume
3504  * that we have already failed to find extents that will work.
3505  */
3506 static noinline int
3507 setup_cluster_bitmap(struct btrfs_block_group *block_group,
3508                      struct btrfs_free_cluster *cluster,
3509                      struct list_head *bitmaps, u64 offset, u64 bytes,
3510                      u64 cont1_bytes, u64 min_bytes)
3511 {
3512         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3513         struct btrfs_free_space *entry = NULL;
3514         int ret = -ENOSPC;
3515         u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3516
3517         if (ctl->total_bitmaps == 0)
3518                 return -ENOSPC;
3519
3520         /*
3521          * The bitmap that covers offset won't be in the list unless offset
3522          * is just its start offset.
3523          */
3524         if (!list_empty(bitmaps))
3525                 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3526
3527         if (!entry || entry->offset != bitmap_offset) {
3528                 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3529                 if (entry && list_empty(&entry->list))
3530                         list_add(&entry->list, bitmaps);
3531         }
3532
3533         list_for_each_entry(entry, bitmaps, list) {
3534                 if (entry->bytes < bytes)
3535                         continue;
3536                 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3537                                            bytes, cont1_bytes, min_bytes);
3538                 if (!ret)
3539                         return 0;
3540         }
3541
3542         /*
3543          * The bitmaps list has all the bitmaps that record free space
3544          * starting after offset, so no more search is required.
3545          */
3546         return -ENOSPC;
3547 }
3548
3549 /*
3550  * Here we try to find a cluster of blocks in a block group.  The goal
3551  * is to find at least bytes + empty_size bytes.
3552  * We might not find them all in one contiguous area.
3553  *
3554  * Returns zero and sets up the cluster if things worked out, otherwise
3555  * it returns -ENOSPC.
3556  */
3557 int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3558                              struct btrfs_free_cluster *cluster,
3559                              u64 offset, u64 bytes, u64 empty_size)
3560 {
3561         struct btrfs_fs_info *fs_info = block_group->fs_info;
3562         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3563         struct btrfs_free_space *entry, *tmp;
3564         LIST_HEAD(bitmaps);
3565         u64 min_bytes;
3566         u64 cont1_bytes;
3567         int ret;
3568
3569         /*
3570          * Choose the minimum extent size we'll require for this
3571          * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3572          * For metadata, allow allocations with smaller extents.  For
3573          * data, keep it dense.
3574          */
3575         if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3576                 cont1_bytes = bytes + empty_size;
3577                 min_bytes = cont1_bytes;
3578         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3579                 cont1_bytes = bytes;
3580                 min_bytes = fs_info->sectorsize;
3581         } else {
3582                 cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3583                 min_bytes = fs_info->sectorsize;
3584         }
3585
3586         spin_lock(&ctl->tree_lock);
3587
3588         /*
3589          * If we know we don't have enough space to make a cluster, don't
3590          * even bother doing all the work to try and find one.
3591          */
3592         if (ctl->free_space < bytes) {
3593                 spin_unlock(&ctl->tree_lock);
3594                 return -ENOSPC;
3595         }
3596
3597         spin_lock(&cluster->lock);
3598
3599         /* someone already found a cluster, hooray */
3600         if (cluster->block_group) {
3601                 ret = 0;
3602                 goto out;
3603         }
3604
3605         trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3606                                  min_bytes);
3607
3608         ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3609                                       bytes + empty_size,
3610                                       cont1_bytes, min_bytes);
3611         if (ret)
3612                 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3613                                            offset, bytes + empty_size,
3614                                            cont1_bytes, min_bytes);
3615
3616         /* Clear our temporary list */
3617         list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3618                 list_del_init(&entry->list);
3619
3620         if (!ret) {
3621                 btrfs_get_block_group(block_group);
3622                 list_add_tail(&cluster->block_group_list,
3623                               &block_group->cluster_list);
3624                 cluster->block_group = block_group;
3625         } else {
3626                 trace_btrfs_failed_cluster_setup(block_group);
3627         }
3628 out:
3629         spin_unlock(&cluster->lock);
3630         spin_unlock(&ctl->tree_lock);
3631
3632         return ret;
3633 }
3634
3635 /*
3636  * simple code to zero out a cluster
3637  */
3638 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3639 {
3640         spin_lock_init(&cluster->lock);
3641         spin_lock_init(&cluster->refill_lock);
3642         cluster->root = RB_ROOT;
3643         cluster->max_size = 0;
3644         cluster->fragmented = false;
3645         INIT_LIST_HEAD(&cluster->block_group_list);
3646         cluster->block_group = NULL;
3647 }
3648
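/*
 * Discard the range [start, start + bytes) and give the reserved range back
 * to the free space cache with the resulting trim state.  While the discard
 * is in flight the range is accounted as reserved (unless the block group is
 * read-only) and sits on the ctl->trimming_ranges list under the
 * cache_writeout_mutex, so free space cache writeout stays coordinated with
 * ongoing trims.
 */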
3649 static int do_trimming(struct btrfs_block_group *block_group,
3650                        u64 *total_trimmed, u64 start, u64 bytes,
3651                        u64 reserved_start, u64 reserved_bytes,
3652                        enum btrfs_trim_state reserved_trim_state,
3653                        struct btrfs_trim_range *trim_entry)
3654 {
3655         struct btrfs_space_info *space_info = block_group->space_info;
3656         struct btrfs_fs_info *fs_info = block_group->fs_info;
3657         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3658         int ret;
3659         int update = 0;
3660         const u64 end = start + bytes;
3661         const u64 reserved_end = reserved_start + reserved_bytes;
3662         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3663         u64 trimmed = 0;
3664
3665         spin_lock(&space_info->lock);
3666         spin_lock(&block_group->lock);
3667         if (!block_group->ro) {
3668                 block_group->reserved += reserved_bytes;
3669                 space_info->bytes_reserved += reserved_bytes;
3670                 update = 1;
3671         }
3672         spin_unlock(&block_group->lock);
3673         spin_unlock(&space_info->lock);
3674
3675         ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3676         if (!ret) {
3677                 *total_trimmed += trimmed;
3678                 trim_state = BTRFS_TRIM_STATE_TRIMMED;
3679         }
3680
3681         mutex_lock(&ctl->cache_writeout_mutex);
3682         if (reserved_start < start)
3683                 __btrfs_add_free_space(block_group, reserved_start,
3684                                        start - reserved_start,
3685                                        reserved_trim_state);
3686         if (end < reserved_end)
3687                 __btrfs_add_free_space(block_group, end, reserved_end - end,
3688                                        reserved_trim_state);
3689         __btrfs_add_free_space(block_group, start, bytes, trim_state);
3690         list_del(&trim_entry->list);
3691         mutex_unlock(&ctl->cache_writeout_mutex);
3692
3693         if (update) {
3694                 spin_lock(&space_info->lock);
3695                 spin_lock(&block_group->lock);
3696                 if (block_group->ro)
3697                         space_info->bytes_readonly += reserved_bytes;
3698                 block_group->reserved -= reserved_bytes;
3699                 space_info->bytes_reserved -= reserved_bytes;
3700                 spin_unlock(&block_group->lock);
3701                 spin_unlock(&space_info->lock);
3702         }
3703
3704         return ret;
3705 }
3706
3707 /*
3708  * If @async is set, then we will trim 1 region and return.
3709  */
3710 static int trim_no_bitmap(struct btrfs_block_group *block_group,
3711                           u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3712                           bool async)
3713 {
3714         struct btrfs_discard_ctl *discard_ctl =
3715                                         &block_group->fs_info->discard_ctl;
3716         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3717         struct btrfs_free_space *entry;
3718         struct rb_node *node;
3719         int ret = 0;
3720         u64 extent_start;
3721         u64 extent_bytes;
3722         enum btrfs_trim_state extent_trim_state;
3723         u64 bytes;
3724         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3725
3726         while (start < end) {
3727                 struct btrfs_trim_range trim_entry;
3728
3729                 mutex_lock(&ctl->cache_writeout_mutex);
3730                 spin_lock(&ctl->tree_lock);
3731
3732                 if (ctl->free_space < minlen)
3733                         goto out_unlock;
3734
3735                 entry = tree_search_offset(ctl, start, 0, 1);
3736                 if (!entry)
3737                         goto out_unlock;
3738
3739                 /* Skip bitmaps and, if async, already trimmed entries. */
3740                 while (entry->bitmap ||
3741                        (async && btrfs_free_space_trimmed(entry))) {
3742                         node = rb_next(&entry->offset_index);
3743                         if (!node)
3744                                 goto out_unlock;
3745                         entry = rb_entry(node, struct btrfs_free_space,
3746                                          offset_index);
3747                 }
3748
3749                 if (entry->offset >= end)
3750                         goto out_unlock;
3751
3752                 extent_start = entry->offset;
3753                 extent_bytes = entry->bytes;
3754                 extent_trim_state = entry->trim_state;
3755                 if (async) {
3756                         start = entry->offset;
3757                         bytes = entry->bytes;
3758                         if (bytes < minlen) {
3759                                 spin_unlock(&ctl->tree_lock);
3760                                 mutex_unlock(&ctl->cache_writeout_mutex);
3761                                 goto next;
3762                         }
3763                         unlink_free_space(ctl, entry, true);
3764                         /*
3765                          * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3766                          * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3767                          * X when we come back around.  So trim it now.
3768                          */
3769                         if (max_discard_size &&
3770                             bytes >= (max_discard_size +
3771                                       BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3772                                 bytes = max_discard_size;
3773                                 extent_bytes = max_discard_size;
3774                                 entry->offset += max_discard_size;
3775                                 entry->bytes -= max_discard_size;
3776                                 link_free_space(ctl, entry);
3777                         } else {
3778                                 kmem_cache_free(btrfs_free_space_cachep, entry);
3779                         }
3780                 } else {
3781                         start = max(start, extent_start);
3782                         bytes = min(extent_start + extent_bytes, end) - start;
3783                         if (bytes < minlen) {
3784                                 spin_unlock(&ctl->tree_lock);
3785                                 mutex_unlock(&ctl->cache_writeout_mutex);
3786                                 goto next;
3787                         }
3788
3789                         unlink_free_space(ctl, entry, true);
3790                         kmem_cache_free(btrfs_free_space_cachep, entry);
3791                 }
3792
3793                 spin_unlock(&ctl->tree_lock);
3794                 trim_entry.start = extent_start;
3795                 trim_entry.bytes = extent_bytes;
3796                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3797                 mutex_unlock(&ctl->cache_writeout_mutex);
3798
3799                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3800                                   extent_start, extent_bytes, extent_trim_state,
3801                                   &trim_entry);
3802                 if (ret) {
3803                         block_group->discard_cursor = start + bytes;
3804                         break;
3805                 }
3806 next:
3807                 start += bytes;
3808                 block_group->discard_cursor = start;
3809                 if (async && *total_trimmed)
3810                         break;
3811
3812                 if (fatal_signal_pending(current)) {
3813                         ret = -ERESTARTSYS;
3814                         break;
3815                 }
3816
3817                 cond_resched();
3818         }
3819
3820         return ret;
3821
3822 out_unlock:
3823         block_group->discard_cursor = btrfs_block_group_end(block_group);
3824         spin_unlock(&ctl->tree_lock);
3825         mutex_unlock(&ctl->cache_writeout_mutex);
3826
3827         return ret;
3828 }
3829
3830 /*
3831  * If we break out of trimming a bitmap prematurely, we should reset the
3832  * trimming bit.  In a rather contrived case, it's possible to race here, so
3833  * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3834  *
3835  * start = start of bitmap
3836  * end = near end of bitmap
3837  *
3838  * Thread 1:                    Thread 2:
3839  * trim_bitmaps(start)
3840  *                              trim_bitmaps(end)
3841  *                              end_trimming_bitmap()
3842  * reset_trimming_bitmap()
3843  */
3844 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3845 {
3846         struct btrfs_free_space *entry;
3847
3848         spin_lock(&ctl->tree_lock);
3849         entry = tree_search_offset(ctl, offset, 1, 0);
3850         if (entry) {
3851                 if (btrfs_free_space_trimmed(entry)) {
3852                         ctl->discardable_extents[BTRFS_STAT_CURR] +=
3853                                 entry->bitmap_extents;
3854                         ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3855                 }
3856                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3857         }
3858
3859         spin_unlock(&ctl->tree_lock);
3860 }
3861
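/*
 * Mark a bitmap as trimmed once a full scan of it has finished and remove its
 * extents and bytes from the discardable statistics.
 */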
3862 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3863                                 struct btrfs_free_space *entry)
3864 {
3865         if (btrfs_free_space_trimming_bitmap(entry)) {
3866                 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3867                 ctl->discardable_extents[BTRFS_STAT_CURR] -=
3868                         entry->bitmap_extents;
3869                 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3870         }
3871 }
3872
3873 /*
3874  * If @async is set, then we will trim 1 region and return.
3875  */
3876 static int trim_bitmaps(struct btrfs_block_group *block_group,
3877                         u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3878                         u64 maxlen, bool async)
3879 {
3880         struct btrfs_discard_ctl *discard_ctl =
3881                                         &block_group->fs_info->discard_ctl;
3882         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3883         struct btrfs_free_space *entry;
3884         int ret = 0;
3885         int ret2;
3886         u64 bytes;
3887         u64 offset = offset_to_bitmap(ctl, start);
3888         const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3889
3890         while (offset < end) {
3891                 bool next_bitmap = false;
3892                 struct btrfs_trim_range trim_entry;
3893
3894                 mutex_lock(&ctl->cache_writeout_mutex);
3895                 spin_lock(&ctl->tree_lock);
3896
3897                 if (ctl->free_space < minlen) {
3898                         block_group->discard_cursor =
3899                                 btrfs_block_group_end(block_group);
3900                         spin_unlock(&ctl->tree_lock);
3901                         mutex_unlock(&ctl->cache_writeout_mutex);
3902                         break;
3903                 }
3904
3905                 entry = tree_search_offset(ctl, offset, 1, 0);
3906                 /*
3907                  * Bitmaps are marked trimmed lossily now to prevent constant
3908                  * discarding of the same bitmap (the reason why we are bound
3909                  * by the filters).  So, retrim the block group bitmaps when we
3910                  * are preparing to punt to the unused_bgs list.  This uses
3911                  * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3912                  * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED,
3913                  * which is the only discard index that sets minlen to 0.
3914                 if (!entry || (async && minlen && start == offset &&
3915                                btrfs_free_space_trimmed(entry))) {
3916                         spin_unlock(&ctl->tree_lock);
3917                         mutex_unlock(&ctl->cache_writeout_mutex);
3918                         next_bitmap = true;
3919                         goto next;
3920                 }
3921
3922                 /*
3923                  * Async discard bitmap trimming begins by setting the start
3924                  * to key.objectid, and offset_to_bitmap() aligns to the
3925                  * start of the bitmap.  This lets us know we are fully
3926                  * scanning the bitmap rather than only some portion of it.
3927                  */
3928                 if (start == offset)
3929                         entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3930
3931                 bytes = minlen;
3932                 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3933                 if (ret2 || start >= end) {
3934                         /*
3935                          * We lossily consider a bitmap trimmed if we only skip
3936                          * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3937                          */
3938                         if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3939                                 end_trimming_bitmap(ctl, entry);
3940                         else
3941                                 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3942                         spin_unlock(&ctl->tree_lock);
3943                         mutex_unlock(&ctl->cache_writeout_mutex);
3944                         next_bitmap = true;
3945                         goto next;
3946                 }
3947
3948                 /*
3949                  * We already trimmed a region, but are using the locking above
3950                  * to reset the trim_state.
3951                  */
3952                 if (async && *total_trimmed) {
3953                         spin_unlock(&ctl->tree_lock);
3954                         mutex_unlock(&ctl->cache_writeout_mutex);
3955                         goto out;
3956                 }
3957
3958                 bytes = min(bytes, end - start);
3959                 if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3960                         spin_unlock(&ctl->tree_lock);
3961                         mutex_unlock(&ctl->cache_writeout_mutex);
3962                         goto next;
3963                 }
3964
3965                 /*
3966                  * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3967                  * If X < @minlen, we won't trim X when we come back around.
3968                  * So trim it now.  We differ here from trimming extents as we
3969                  * don't keep individual state per bit.
3970                  */
3971                 if (async &&
3972                     max_discard_size &&
3973                     bytes > (max_discard_size + minlen))
3974                         bytes = max_discard_size;
3975
3976                 bitmap_clear_bits(ctl, entry, start, bytes, true);
3977                 if (entry->bytes == 0)
3978                         free_bitmap(ctl, entry);
3979
3980                 spin_unlock(&ctl->tree_lock);
3981                 trim_entry.start = start;
3982                 trim_entry.bytes = bytes;
3983                 list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3984                 mutex_unlock(&ctl->cache_writeout_mutex);
3985
3986                 ret = do_trimming(block_group, total_trimmed, start, bytes,
3987                                   start, bytes, 0, &trim_entry);
3988                 if (ret) {
3989                         reset_trimming_bitmap(ctl, offset);
3990                         block_group->discard_cursor =
3991                                 btrfs_block_group_end(block_group);
3992                         break;
3993                 }
3994 next:
3995                 if (next_bitmap) {
3996                         offset += BITS_PER_BITMAP * ctl->unit;
3997                         start = offset;
3998                 } else {
3999                         start += bytes;
4000                 }
4001                 block_group->discard_cursor = start;
4002
4003                 if (fatal_signal_pending(current)) {
4004                         if (start != offset)
4005                                 reset_trimming_bitmap(ctl, offset);
4006                         ret = -ERESTARTSYS;
4007                         break;
4008                 }
4009
4010                 cond_resched();
4011         }
4012
4013         if (offset >= end)
4014                 block_group->discard_cursor = end;
4015
4016 out:
4017         return ret;
4018 }
4019
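/*
 * Trim the free space of a block group in the range [start, end): first the
 * plain extent entries, then the bitmaps.  The block group is frozen while
 * trimming so it cannot be removed underneath us.
 */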
4020 int btrfs_trim_block_group(struct btrfs_block_group *block_group,
4021                            u64 *trimmed, u64 start, u64 end, u64 minlen)
4022 {
4023         struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
4024         int ret;
4025         u64 rem = 0;
4026
4027         ASSERT(!btrfs_is_zoned(block_group->fs_info));
4028
4029         *trimmed = 0;
4030
4031         spin_lock(&block_group->lock);
4032         if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
4033                 spin_unlock(&block_group->lock);
4034                 return 0;
4035         }
4036         btrfs_freeze_block_group(block_group);
4037         spin_unlock(&block_group->lock);
4038
4039         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
4040         if (ret)
4041                 goto out;
4042
4043         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
4044         div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
4045         /* If we ended in the middle of a bitmap, reset the trimming flag */
4046         if (rem)
4047                 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
4048 out:
4049         btrfs_unfreeze_block_group(block_group);
4050         return ret;
4051 }
4052
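/*
 * Trim only the plain extent entries of a block group, used by the async
 * discard machinery.  With @async set, at most one region is trimmed per call.
 */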
4053 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
4054                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
4055                                    bool async)
4056 {
4057         int ret;
4058
4059         *trimmed = 0;
4060
4061         spin_lock(&block_group->lock);
4062         if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
4063                 spin_unlock(&block_group->lock);
4064                 return 0;
4065         }
4066         btrfs_freeze_block_group(block_group);
4067         spin_unlock(&block_group->lock);
4068
4069         ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
4070         btrfs_unfreeze_block_group(block_group);
4071
4072         return ret;
4073 }
4074
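/*
 * Trim only the bitmap entries of a block group, used by the async discard
 * machinery.  With @async set, regions larger than @maxlen are skipped.
 */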
4075 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
4076                                    u64 *trimmed, u64 start, u64 end, u64 minlen,
4077                                    u64 maxlen, bool async)
4078 {
4079         int ret;
4080
4081         *trimmed = 0;
4082
4083         spin_lock(&block_group->lock);
4084         if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
4085                 spin_unlock(&block_group->lock);
4086                 return 0;
4087         }
4088         btrfs_freeze_block_group(block_group);
4089         spin_unlock(&block_group->lock);
4090
4091         ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
4092                            async);
4093
4094         btrfs_unfreeze_block_group(block_group);
4095
4096         return ret;
4097 }
4098
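/*
 * The v1 space cache is considered active as long as the superblock carries a
 * non-zero cache generation.
 */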
4099 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
4100 {
4101         return btrfs_super_cache_generation(fs_info->super_copy);
4102 }
4103
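/*
 * Remove the v1 free space cache inode of every block group, used when
 * disabling space cache v1.
 */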
4104 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
4105                                        struct btrfs_trans_handle *trans)
4106 {
4107         struct btrfs_block_group *block_group;
4108         struct rb_node *node;
4109         int ret = 0;
4110
4111         btrfs_info(fs_info, "cleaning free space cache v1");
4112
4113         node = rb_first_cached(&fs_info->block_group_cache_tree);
4114         while (node) {
4115                 block_group = rb_entry(node, struct btrfs_block_group, cache_node);
4116                 ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
4117                 if (ret)
4118                         goto out;
4119                 node = rb_next(node);
4120         }
4121 out:
4122         return ret;
4123 }
4124
4125 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
4126 {
4127         struct btrfs_trans_handle *trans;
4128         int ret;
4129
4130         /*
4131          * update_super_roots will appropriately set or unset
4132          * super_copy->cache_generation based on SPACE_CACHE and
4133          * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
4134          * transaction commit whether we are enabling space cache v1 and don't
4135          * have any other work to do, or are disabling it and removing free
4136          * space inodes.
4137          */
4138         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4139         if (IS_ERR(trans))
4140                 return PTR_ERR(trans);
4141
4142         if (!active) {
4143                 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4144                 ret = cleanup_free_space_cache_v1(fs_info, trans);
4145                 if (ret) {
4146                         btrfs_abort_transaction(trans, ret);
4147                         btrfs_end_transaction(trans);
4148                         goto out;
4149                 }
4150         }
4151
4152         ret = btrfs_commit_transaction(trans);
4153 out:
4154         clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4155
4156         return ret;
4157 }
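
/*
 * Illustrative sketch only: a hypothetical caller reacting to a mount option
 * change could use the two helpers above like this.  The function name is
 * made up for the example; the real callers live outside this file.
 */
static int example_sync_space_cache_v1_state(struct btrfs_fs_info *fs_info,
					     bool want_v1)
{
	/* Nothing to do if the current state already matches the request. */
	if (want_v1 == btrfs_free_space_cache_v1_active(fs_info))
		return 0;

	/*
	 * This commits a transaction either way; when disabling, it also
	 * removes every block group's free space inode first.
	 */
	return btrfs_set_free_space_cache_v1_active(fs_info, want_v1);
}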
4158
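/*
 * The bitmap cache hands out page-sized, page-aligned objects, so every free
 * space bitmap occupies exactly one page; extent entries come from the much
 * smaller btrfs_free_space cache.
 */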
4159 int __init btrfs_free_space_init(void)
4160 {
4161         btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
4162                         sizeof(struct btrfs_free_space), 0,
4163                         SLAB_MEM_SPREAD, NULL);
4164         if (!btrfs_free_space_cachep)
4165                 return -ENOMEM;
4166
4167         btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
4168                                                         PAGE_SIZE, PAGE_SIZE,
4169                                                         SLAB_MEM_SPREAD, NULL);
4170         if (!btrfs_free_space_bitmap_cachep) {
4171                 kmem_cache_destroy(btrfs_free_space_cachep);
4172                 return -ENOMEM;
4173         }
4174
4175         return 0;
4176 }
4177
4178 void __cold btrfs_free_space_exit(void)
4179 {
4180         kmem_cache_destroy(btrfs_free_space_cachep);
4181         kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
4182 }
4183
4184 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4185 /*
4186  * Use this if you need to make a bitmap or extent entry specifically.  It
4187  * doesn't do any of the merging that add_free_space() does and behaves much
4188  * like the free space cache loading code, so it can build deliberately odd
4189  * configurations (see the sketch after test_check_exists() below).
4190  */
4191 int test_add_free_space_entry(struct btrfs_block_group *cache,
4192                               u64 offset, u64 bytes, bool bitmap)
4193 {
4194         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4195         struct btrfs_free_space *info = NULL, *bitmap_info;
4196         void *map = NULL;
4197         enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4198         u64 bytes_added;
4199         int ret;
4200
4201 again:
4202         if (!info) {
4203                 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4204                 if (!info)
4205                         return -ENOMEM;
4206         }
4207
4208         if (!bitmap) {
4209                 spin_lock(&ctl->tree_lock);
4210                 info->offset = offset;
4211                 info->bytes = bytes;
4212                 info->max_extent_size = 0;
4213                 ret = link_free_space(ctl, info);
4214                 spin_unlock(&ctl->tree_lock);
4215                 if (ret)
4216                         kmem_cache_free(btrfs_free_space_cachep, info);
4217                 return ret;
4218         }
4219
4220         if (!map) {
4221                 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4222                 if (!map) {
4223                         kmem_cache_free(btrfs_free_space_cachep, info);
4224                         return -ENOMEM;
4225                 }
4226         }
4227
4228         spin_lock(&ctl->tree_lock);
4229         bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4230                                          1, 0);
4231         if (!bitmap_info) {
4232                 info->bitmap = map;
4233                 map = NULL;
4234                 add_new_bitmap(ctl, info, offset);
4235                 bitmap_info = info;
4236                 info = NULL;
4237         }
4238
4239         bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4240                                           trim_state);
4241
4242         bytes -= bytes_added;
4243         offset += bytes_added;
4244         spin_unlock(&ctl->tree_lock);
4245
4246         if (bytes)
4247                 goto again;
4248
4249         if (info)
4250                 kmem_cache_free(btrfs_free_space_cachep, info);
4251         if (map)
4252                 kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4253         return 0;
4254 }
4255
4256 /*
4257  * Check whether the given range overlaps the free space cache.  This is only
4258  * used to verify the absence of space, so if there is any free space in the
4259  * range at all we return 1.
4260  */
4261 int test_check_exists(struct btrfs_block_group *cache,
4262                       u64 offset, u64 bytes)
4263 {
4264         struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4265         struct btrfs_free_space *info;
4266         int ret = 0;
4267
4268         spin_lock(&ctl->tree_lock);
4269         info = tree_search_offset(ctl, offset, 0, 0);
4270         if (!info) {
4271                 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4272                                           1, 0);
4273                 if (!info)
4274                         goto out;
4275         }
4276
4277 have_info:
4278         if (info->bitmap) {
4279                 u64 bit_off, bit_bytes;
4280                 struct rb_node *n;
4281                 struct btrfs_free_space *tmp;
4282
4283                 bit_off = offset;
4284                 bit_bytes = ctl->unit;
4285                 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4286                 if (!ret) {
4287                         if (bit_off == offset) {
4288                                 ret = 1;
4289                                 goto out;
4290                         } else if (bit_off > offset &&
4291                                    offset + bytes > bit_off) {
4292                                 ret = 1;
4293                                 goto out;
4294                         }
4295                 }
4296
4297                 n = rb_prev(&info->offset_index);
4298                 while (n) {
4299                         tmp = rb_entry(n, struct btrfs_free_space,
4300                                        offset_index);
4301                         if (tmp->offset + tmp->bytes < offset)
4302                                 break;
4303                         if (offset + bytes < tmp->offset) {
4304                                 n = rb_prev(&tmp->offset_index);
4305                                 continue;
4306                         }
4307                         info = tmp;
4308                         goto have_info;
4309                 }
4310
4311                 n = rb_next(&info->offset_index);
4312                 while (n) {
4313                         tmp = rb_entry(n, struct btrfs_free_space,
4314                                        offset_index);
4315                         if (offset + bytes < tmp->offset)
4316                                 break;
4317                         if (tmp->offset + tmp->bytes < offset) {
4318                                 n = rb_next(&tmp->offset_index);
4319                                 continue;
4320                         }
4321                         info = tmp;
4322                         goto have_info;
4323                 }
4324
4325                 ret = 0;
4326                 goto out;
4327         }
4328
4329         if (info->offset == offset) {
4330                 ret = 1;
4331                 goto out;
4332         }
4333
4334         if (offset > info->offset && offset < info->offset + info->bytes)
4335                 ret = 1;
4336 out:
4337         spin_unlock(&ctl->tree_lock);
4338         return ret;
4339 }
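
/*
 * Illustrative sketch only: how a sanity test might combine the two helpers
 * above.  The function name and the offsets are made up; the real tests live
 * under fs/btrfs/tests/.
 */
static int example_free_space_sanity_check(struct btrfs_block_group *cache)
{
	int ret;

	/* Insert a 4M extent entry at offset 0, with no merging applied. */
	ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
	if (ret)
		return ret;

	/* Insert 1M at offset 4M, forced into a bitmap entry. */
	ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, true);
	if (ret)
		return ret;

	/* Both ranges must now be visible in the free space cache. */
	if (!test_check_exists(cache, 0, SZ_4M) ||
	    !test_check_exists(cache, SZ_4M, SZ_1M))
		return -EINVAL;

	return 0;
}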
4340 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */