fs/btrfs/ordered-data.c  (platform/kernel/linux-starfive.git, commit 15a410bbbe704b921bd8296c5a453c6ac14fcc0c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/slab.h>
7 #include <linux/blkdev.h>
8 #include <linux/writeback.h>
9 #include <linux/sched/mm.h>
10 #include "messages.h"
11 #include "misc.h"
12 #include "ctree.h"
13 #include "transaction.h"
14 #include "btrfs_inode.h"
15 #include "extent_io.h"
16 #include "disk-io.h"
17 #include "compression.h"
18 #include "delalloc-space.h"
19 #include "qgroup.h"
20 #include "subpage.h"
21 #include "file.h"
22 #include "super.h"
23
24 static struct kmem_cache *btrfs_ordered_extent_cache;
25
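/*
 * Return the exclusive end offset of the ordered extent in the file, clamped
 * to (u64)-1 if the addition would overflow.
 */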
26 static u64 entry_end(struct btrfs_ordered_extent *entry)
27 {
28         if (entry->file_offset + entry->num_bytes < entry->file_offset)
29                 return (u64)-1;
30         return entry->file_offset + entry->num_bytes;
31 }
32
33 /* Returns NULL if the insertion worked, or the existing node that was
34  * found to overlap the given offset in the tree.
35  */
36 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
37                                    struct rb_node *node)
38 {
39         struct rb_node **p = &root->rb_node;
40         struct rb_node *parent = NULL;
41         struct btrfs_ordered_extent *entry;
42
43         while (*p) {
44                 parent = *p;
45                 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
46
47                 if (file_offset < entry->file_offset)
48                         p = &(*p)->rb_left;
49                 else if (file_offset >= entry_end(entry))
50                         p = &(*p)->rb_right;
51                 else
52                         return parent;
53         }
54
55         rb_link_node(node, parent, p);
56         rb_insert_color(node, root);
57         return NULL;
58 }
59
60 /*
61  * look for a given offset in the tree, and if it can't be found return the
62  * first lesser offset
63  */
64 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
65                                      struct rb_node **prev_ret)
66 {
67         struct rb_node *n = root->rb_node;
68         struct rb_node *prev = NULL;
69         struct rb_node *test;
70         struct btrfs_ordered_extent *entry;
71         struct btrfs_ordered_extent *prev_entry = NULL;
72
73         while (n) {
74                 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
75                 prev = n;
76                 prev_entry = entry;
77
78                 if (file_offset < entry->file_offset)
79                         n = n->rb_left;
80                 else if (file_offset >= entry_end(entry))
81                         n = n->rb_right;
82                 else
83                         return n;
84         }
85         if (!prev_ret)
86                 return NULL;
87
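        /* Walk forward while entries still end at or before @file_offset. */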
88         while (prev && file_offset >= entry_end(prev_entry)) {
89                 test = rb_next(prev);
90                 if (!test)
91                         break;
92                 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
93                                       rb_node);
94                 if (file_offset < entry_end(prev_entry))
95                         break;
96
97                 prev = test;
98         }
99         if (prev)
100                 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
101                                       rb_node);
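        /* Walk back while entries still end beyond @file_offset. */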
102         while (prev && file_offset < entry_end(prev_entry)) {
103                 test = rb_prev(prev);
104                 if (!test)
105                         break;
106                 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
107                                       rb_node);
108                 prev = test;
109         }
110         *prev_ret = prev;
111         return NULL;
112 }
113
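/*
 * Return 1 if the range [@file_offset, @file_offset + @len) overlaps the
 * ordered extent @entry, 0 otherwise.
 */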
114 static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
115                           u64 len)
116 {
117         if (file_offset + len <= entry->file_offset ||
118             entry->file_offset + entry->num_bytes <= file_offset)
119                 return 0;
120         return 1;
121 }
122
123 /*
124  * Look up the first ordered struct that has this offset, otherwise
125  * the first one less than this offset.
126  */
127 static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
128                                           u64 file_offset)
129 {
130         struct rb_root *root = &tree->tree;
131         struct rb_node *prev = NULL;
132         struct rb_node *ret;
133         struct btrfs_ordered_extent *entry;
134
135         if (tree->last) {
136                 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
137                                  rb_node);
138                 if (in_range(file_offset, entry->file_offset, entry->num_bytes))
139                         return tree->last;
140         }
141         ret = __tree_search(root, file_offset, &prev);
142         if (!ret)
143                 ret = prev;
144         if (ret)
145                 tree->last = ret;
146         return ret;
147 }
148
149 /*
150  * Add an ordered extent to the per-inode tree.
151  *
152  * @inode:           Inode that this extent is for.
153  * @file_offset:     Logical offset in file where the extent starts.
154  * @num_bytes:       Logical length of extent in file.
155  * @ram_bytes:       Full length of unencoded data.
156  * @disk_bytenr:     Offset of extent on disk.
157  * @disk_num_bytes:  Size of extent on disk.
158  * @offset:          Offset into unencoded data where file data starts.
159  * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
160  * @compress_type:   Compression algorithm used for data.
161  *
162  * Most of these parameters correspond to &struct btrfs_file_extent_item. The
163  * tree is given a single reference on the ordered extent that was inserted, and
164  * the returned pointer is given a second reference.
165  *
166  * Return: the new ordered extent or error pointer.
167  */
168 struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
169                         struct btrfs_inode *inode, u64 file_offset,
170                         u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
171                         u64 disk_num_bytes, u64 offset, unsigned long flags,
172                         int compress_type)
173 {
174         struct btrfs_root *root = inode->root;
175         struct btrfs_fs_info *fs_info = root->fs_info;
176         struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
177         struct rb_node *node;
178         struct btrfs_ordered_extent *entry;
179         int ret;
180
181         if (flags &
182             ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
183                 /* For nocow write, we can release the qgroup rsv right now */
184                 ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
185                 if (ret < 0)
186                         return ERR_PTR(ret);
187                 ret = 0;
188         } else {
189                 /*
190                  * The ordered extent has reserved qgroup space; release it now
191                  * and pass the reserved amount on for the qgroup_record to free.
192                  */
193                 ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
194                 if (ret < 0)
195                         return ERR_PTR(ret);
196         }
197         entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
198         if (!entry)
199                 return ERR_PTR(-ENOMEM);
200
201         entry->file_offset = file_offset;
202         entry->num_bytes = num_bytes;
203         entry->ram_bytes = ram_bytes;
204         entry->disk_bytenr = disk_bytenr;
205         entry->disk_num_bytes = disk_num_bytes;
206         entry->offset = offset;
207         entry->bytes_left = num_bytes;
208         entry->inode = igrab(&inode->vfs_inode);
209         entry->compress_type = compress_type;
210         entry->truncated_len = (u64)-1;
211         entry->qgroup_rsv = ret;
212
213         ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
214         entry->flags = flags;
215
216         percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
217                                  fs_info->delalloc_batch);
218
219         /* one ref for the tree */
220         refcount_set(&entry->refs, 1);
221         init_waitqueue_head(&entry->wait);
222         INIT_LIST_HEAD(&entry->list);
223         INIT_LIST_HEAD(&entry->log_list);
224         INIT_LIST_HEAD(&entry->root_extent_list);
225         INIT_LIST_HEAD(&entry->work_list);
226         init_completion(&entry->completion);
227
228         trace_btrfs_ordered_extent_add(inode, entry);
229
230         spin_lock_irq(&tree->lock);
231         node = tree_insert(&tree->tree, file_offset,
232                            &entry->rb_node);
233         if (node)
234                 btrfs_panic(fs_info, -EEXIST,
235                                 "inconsistency in ordered tree at offset %llu",
236                                 file_offset);
237         spin_unlock_irq(&tree->lock);
238
239         spin_lock(&root->ordered_extent_lock);
240         list_add_tail(&entry->root_extent_list,
241                       &root->ordered_extents);
242         root->nr_ordered_extents++;
243         if (root->nr_ordered_extents == 1) {
244                 spin_lock(&fs_info->ordered_root_lock);
245                 BUG_ON(!list_empty(&root->ordered_root));
246                 list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
247                 spin_unlock(&fs_info->ordered_root_lock);
248         }
249         spin_unlock(&root->ordered_extent_lock);
250
251         /*
252          * We don't need the count_max_extents here, we can assume that all of
253          * that work has been done at higher layers, so this is truly the
254          * smallest the extent is going to get.
255          */
256         spin_lock(&inode->lock);
257         btrfs_mod_outstanding_extents(inode, 1);
258         spin_unlock(&inode->lock);
259
260         /* One ref for the returned entry to match semantics of lookup. */
261         refcount_inc(&entry->refs);
262
263         return entry;
264 }
265
266 /*
267  * Add a new btrfs_ordered_extent for the range, but drop the reference instead
268  * of returning it to the caller.
269  */
270 int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
271                              u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
272                              u64 disk_num_bytes, u64 offset, unsigned long flags,
273                              int compress_type)
274 {
275         struct btrfs_ordered_extent *ordered;
276
277         ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes,
278                                              ram_bytes, disk_bytenr,
279                                              disk_num_bytes, offset, flags,
280                                              compress_type);
281
282         if (IS_ERR(ordered))
283                 return PTR_ERR(ordered);
284         btrfs_put_ordered_extent(ordered);
285
286         return 0;
287 }
288
289 /*
290  * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
291  * when an ordered extent is finished.  If the list covers more than one
292  * ordered extent, it is split across multiples.
293  */
294 void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
295                            struct btrfs_ordered_sum *sum)
296 {
297         struct btrfs_ordered_inode_tree *tree;
298
299         tree = &BTRFS_I(entry->inode)->ordered_tree;
300         spin_lock_irq(&tree->lock);
301         list_add_tail(&sum->list, &entry->list);
302         spin_unlock_irq(&tree->lock);
303 }
304
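/*
 * Work function queued once all the IO of an ordered extent has completed,
 * it simply runs btrfs_finish_ordered_io() on the ordered extent.
 */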
305 static void finish_ordered_fn(struct btrfs_work *work)
306 {
307         struct btrfs_ordered_extent *ordered_extent;
308
309         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
310         btrfs_finish_ordered_io(ordered_extent);
311 }
312
313 /*
314  * Mark the IO of all ordered extents inside the specified range as finished.
315  *
316  * @page:        The involved page for the operation.
317  *               For uncompressed buffered IO, the page status also needs to be
318  *               updated to indicate whether the pending ordered io is finished.
319  *               Can be NULL for direct IO and compressed write.
320  *               For these cases, the callers guarantee they will not trigger
321  *               the endio function twice.
322  *
323  * This function is called for endio, thus the range must have ordered
324  * extent(s) covering it.
325  */
326 void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
327                                     struct page *page, u64 file_offset,
328                                     u64 num_bytes, bool uptodate)
329 {
330         struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
331         struct btrfs_fs_info *fs_info = inode->root->fs_info;
332         struct btrfs_workqueue *wq;
333         struct rb_node *node;
334         struct btrfs_ordered_extent *entry = NULL;
335         unsigned long flags;
336         u64 cur = file_offset;
337
338         if (btrfs_is_free_space_inode(inode))
339                 wq = fs_info->endio_freespace_worker;
340         else
341                 wq = fs_info->endio_write_workers;
342
343         if (page)
344                 ASSERT(page->mapping && page_offset(page) <= file_offset &&
345                        file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
346
347         spin_lock_irqsave(&tree->lock, flags);
348         while (cur < file_offset + num_bytes) {
349                 u64 entry_end;
350                 u64 end;
351                 u32 len;
352
353                 node = tree_search(tree, cur);
354                 /* No ordered extents at all */
355                 if (!node)
356                         break;
357
358                 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
359                 entry_end = entry->file_offset + entry->num_bytes;
360                 /*
361                  * |<-- OE --->|  |
362                  *                cur
363                  * Go to next OE.
364                  */
365                 if (cur >= entry_end) {
366                         node = rb_next(node);
367                         /* No more ordered extents, exit */
368                         if (!node)
369                                 break;
370                         entry = rb_entry(node, struct btrfs_ordered_extent,
371                                          rb_node);
372
373                         /* Go to next ordered extent and continue */
374                         cur = entry->file_offset;
375                         continue;
376                 }
377                 /*
378                  * |    |<--- OE --->|
379                  * cur
380                  * Go to the start of OE.
381                  */
382                 if (cur < entry->file_offset) {
383                         cur = entry->file_offset;
384                         continue;
385                 }
386
387                 /*
388                  * Now we are definitely inside one ordered extent.
389                  *
390                  * |<--- OE --->|
391                  *      |
392                  *      cur
393                  */
394                 end = min(entry->file_offset + entry->num_bytes,
395                           file_offset + num_bytes) - 1;
396                 ASSERT(end + 1 - cur < U32_MAX);
397                 len = end + 1 - cur;
398
399                 if (page) {
400                         /*
401                          * Ordered (Private2) bit indicates whether we still
402                          * have pending io unfinished for the ordered extent.
403                          *
404                          * If there's no such bit, we need to skip to next range.
405                          */
406                         if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
407                                 cur += len;
408                                 continue;
409                         }
410                         btrfs_page_clear_ordered(fs_info, page, cur, len);
411                 }
412
413                 /* Now we're fine to update the accounting */
414                 if (unlikely(len > entry->bytes_left)) {
415                         WARN_ON(1);
416                         btrfs_crit(fs_info,
417 "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
418                                    inode->root->root_key.objectid,
419                                    btrfs_ino(inode),
420                                    entry->file_offset,
421                                    entry->num_bytes,
422                                    len, entry->bytes_left);
423                         entry->bytes_left = 0;
424                 } else {
425                         entry->bytes_left -= len;
426                 }
427
428                 if (!uptodate)
429                         set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
430
431                 /*
432                  * All the IO of the ordered extent is finished, we need to queue
433                  * the finish_func to be executed.
434                  */
435                 if (entry->bytes_left == 0) {
436                         set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
437                         cond_wake_up(&entry->wait);
438                         refcount_inc(&entry->refs);
439                         trace_btrfs_ordered_extent_mark_finished(inode, entry);
440                         spin_unlock_irqrestore(&tree->lock, flags);
441                         btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
442                         btrfs_queue_work(wq, &entry->work);
443                         spin_lock_irqsave(&tree->lock, flags);
444                 }
445                 cur += len;
446         }
447         spin_unlock_irqrestore(&tree->lock, flags);
448 }
449
450 /*
451  * Finish IO for one ordered extent across a given range.  The range can only
452  * contain one ordered extent.
453  *
454  * @cached:      The cached ordered extent. If not NULL, we can skip the tree
455  *               search and use the ordered extent directly.
456  *               Will be also used to store the finished ordered extent.
457  * @file_offset: File offset for the finished IO
458  * @io_size:     Length of the finished IO range
459  *
460  * Return true if the ordered extent is finished in the range, and update
461  * @cached.
462  * Return false otherwise.
463  *
464  * NOTE: The range can NOT cross multiple ordered extents.
465  * Thus caller should ensure the range doesn't cross ordered extents.
466  */
467 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
468                                     struct btrfs_ordered_extent **cached,
469                                     u64 file_offset, u64 io_size)
470 {
471         struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
472         struct rb_node *node;
473         struct btrfs_ordered_extent *entry = NULL;
474         unsigned long flags;
475         bool finished = false;
476
477         spin_lock_irqsave(&tree->lock, flags);
478         if (cached && *cached) {
479                 entry = *cached;
480                 goto have_entry;
481         }
482
483         node = tree_search(tree, file_offset);
484         if (!node)
485                 goto out;
486
487         entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
488 have_entry:
489         if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
490                 goto out;
491
492         if (io_size > entry->bytes_left)
493                 btrfs_crit(inode->root->fs_info,
494                            "bad ordered accounting left %llu size %llu",
495                        entry->bytes_left, io_size);
496
497         entry->bytes_left -= io_size;
498
499         if (entry->bytes_left == 0) {
500                 /*
501                  * Ensure only one caller can set the flag and finished_ret
502                  * accordingly
503                  */
504                 finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
505                 /* test_and_set_bit implies a barrier */
506                 cond_wake_up_nomb(&entry->wait);
507         }
508 out:
509         if (finished && cached && entry) {
510                 *cached = entry;
511                 refcount_inc(&entry->refs);
512                 trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
513         }
514         spin_unlock_irqrestore(&tree->lock, flags);
515         return finished;
516 }
517
518 /*
519  * used to drop a reference on an ordered extent.  This will free
520  * the extent if the last reference is dropped
521  */
522 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
523 {
524         struct list_head *cur;
525         struct btrfs_ordered_sum *sum;
526
527         trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
528
529         if (refcount_dec_and_test(&entry->refs)) {
530                 ASSERT(list_empty(&entry->root_extent_list));
531                 ASSERT(list_empty(&entry->log_list));
532                 ASSERT(RB_EMPTY_NODE(&entry->rb_node));
533                 if (entry->inode)
534                         btrfs_add_delayed_iput(BTRFS_I(entry->inode));
535                 while (!list_empty(&entry->list)) {
536                         cur = entry->list.next;
537                         sum = list_entry(cur, struct btrfs_ordered_sum, list);
538                         list_del(&sum->list);
539                         kvfree(sum);
540                 }
541                 kmem_cache_free(btrfs_ordered_extent_cache, entry);
542         }
543 }
544
545 /*
546  * Remove an ordered extent from the tree. No references are dropped,
547  * but any waiters are woken up.
548  */
549 void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
550                                  struct btrfs_ordered_extent *entry)
551 {
552         struct btrfs_ordered_inode_tree *tree;
553         struct btrfs_root *root = btrfs_inode->root;
554         struct btrfs_fs_info *fs_info = root->fs_info;
555         struct rb_node *node;
556         bool pending;
557         bool freespace_inode;
558
559         /*
560          * If this is a free space inode the thread has not acquired the ordered
561          * extents lockdep map.
562          */
563         freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
564
565         btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
566         /* This is paired with btrfs_add_ordered_extent. */
567         spin_lock(&btrfs_inode->lock);
568         btrfs_mod_outstanding_extents(btrfs_inode, -1);
569         spin_unlock(&btrfs_inode->lock);
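        /*
         * Release the delalloc metadata reservation: for encoded ordered
         * extents the reservation tracks the on-disk size (disk_num_bytes),
         * otherwise the logical size (num_bytes).
         */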
570         if (root != fs_info->tree_root) {
571                 u64 release;
572
573                 if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
574                         release = entry->disk_num_bytes;
575                 else
576                         release = entry->num_bytes;
577                 btrfs_delalloc_release_metadata(btrfs_inode, release, false);
578         }
579
580         percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
581                                  fs_info->delalloc_batch);
582
583         tree = &btrfs_inode->ordered_tree;
584         spin_lock_irq(&tree->lock);
585         node = &entry->rb_node;
586         rb_erase(node, &tree->tree);
587         RB_CLEAR_NODE(node);
588         if (tree->last == node)
589                 tree->last = NULL;
590         set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
591         pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
592         spin_unlock_irq(&tree->lock);
593
594         /*
595          * The current running transaction is waiting on us, we need to let it
596          * know that we're complete and wake it up.
597          */
598         if (pending) {
599                 struct btrfs_transaction *trans;
600
601                 /*
602                  * The checks for trans are just a formality: it should be set,
603                  * but if it isn't we don't want to deref/assert under the spin
604                  * lock. So be nice and check if trans is set, but ASSERT() so
605                  * that a developer will notice if it isn't.
606                  */
607                 spin_lock(&fs_info->trans_lock);
608                 trans = fs_info->running_transaction;
609                 if (trans)
610                         refcount_inc(&trans->use_count);
611                 spin_unlock(&fs_info->trans_lock);
612
613                 ASSERT(trans);
614                 if (trans) {
615                         if (atomic_dec_and_test(&trans->pending_ordered))
616                                 wake_up(&trans->pending_wait);
617                         btrfs_put_transaction(trans);
618                 }
619         }
620
621         btrfs_lockdep_release(fs_info, btrfs_trans_pending_ordered);
622
623         spin_lock(&root->ordered_extent_lock);
624         list_del_init(&entry->root_extent_list);
625         root->nr_ordered_extents--;
626
627         trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
628
629         if (!root->nr_ordered_extents) {
630                 spin_lock(&fs_info->ordered_root_lock);
631                 BUG_ON(list_empty(&root->ordered_root));
632                 list_del_init(&root->ordered_root);
633                 spin_unlock(&fs_info->ordered_root_lock);
634         }
635         spin_unlock(&root->ordered_extent_lock);
636         wake_up(&entry->wait);
637         if (!freespace_inode)
638                 btrfs_lockdep_release(fs_info, btrfs_ordered_extent);
639 }
640
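/*
 * Flush work callback used by btrfs_wait_ordered_extents(): start and wait
 * for one ordered extent, then signal the waiter through ->completion.
 */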
641 static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
642 {
643         struct btrfs_ordered_extent *ordered;
644
645         ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
646         btrfs_start_ordered_extent(ordered);
647         complete(&ordered->completion);
648 }
649
650 /*
651  * wait for all the ordered extents in a root.  This is done when balancing
652  * space between drives.
653  */
654 u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
655                                const u64 range_start, const u64 range_len)
656 {
657         struct btrfs_fs_info *fs_info = root->fs_info;
658         LIST_HEAD(splice);
659         LIST_HEAD(skipped);
660         LIST_HEAD(works);
661         struct btrfs_ordered_extent *ordered, *next;
662         u64 count = 0;
663         const u64 range_end = range_start + range_len;
664
665         mutex_lock(&root->ordered_extent_mutex);
666         spin_lock(&root->ordered_extent_lock);
667         list_splice_init(&root->ordered_extents, &splice);
668         while (!list_empty(&splice) && nr) {
669                 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
670                                            root_extent_list);
671
672                 if (range_end <= ordered->disk_bytenr ||
673                     ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
674                         list_move_tail(&ordered->root_extent_list, &skipped);
675                         cond_resched_lock(&root->ordered_extent_lock);
676                         continue;
677                 }
678
679                 list_move_tail(&ordered->root_extent_list,
680                                &root->ordered_extents);
681                 refcount_inc(&ordered->refs);
682                 spin_unlock(&root->ordered_extent_lock);
683
684                 btrfs_init_work(&ordered->flush_work,
685                                 btrfs_run_ordered_extent_work, NULL, NULL);
686                 list_add_tail(&ordered->work_list, &works);
687                 btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
688
689                 cond_resched();
690                 spin_lock(&root->ordered_extent_lock);
691                 if (nr != U64_MAX)
692                         nr--;
693                 count++;
694         }
695         list_splice_tail(&skipped, &root->ordered_extents);
696         list_splice_tail(&splice, &root->ordered_extents);
697         spin_unlock(&root->ordered_extent_lock);
698
699         list_for_each_entry_safe(ordered, next, &works, work_list) {
700                 list_del_init(&ordered->work_list);
701                 wait_for_completion(&ordered->completion);
702                 btrfs_put_ordered_extent(ordered);
703                 cond_resched();
704         }
705         mutex_unlock(&root->ordered_extent_mutex);
706
707         return count;
708 }
709
710 void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
711                              const u64 range_start, const u64 range_len)
712 {
713         struct btrfs_root *root;
714         struct list_head splice;
715         u64 done;
716
717         INIT_LIST_HEAD(&splice);
718
719         mutex_lock(&fs_info->ordered_operations_mutex);
720         spin_lock(&fs_info->ordered_root_lock);
721         list_splice_init(&fs_info->ordered_roots, &splice);
722         while (!list_empty(&splice) && nr) {
723                 root = list_first_entry(&splice, struct btrfs_root,
724                                         ordered_root);
725                 root = btrfs_grab_root(root);
726                 BUG_ON(!root);
727                 list_move_tail(&root->ordered_root,
728                                &fs_info->ordered_roots);
729                 spin_unlock(&fs_info->ordered_root_lock);
730
731                 done = btrfs_wait_ordered_extents(root, nr,
732                                                   range_start, range_len);
733                 btrfs_put_root(root);
734
735                 spin_lock(&fs_info->ordered_root_lock);
736                 if (nr != U64_MAX) {
737                         nr -= done;
738                 }
739         }
740         list_splice_tail(&splice, &fs_info->ordered_roots);
741         spin_unlock(&fs_info->ordered_root_lock);
742         mutex_unlock(&fs_info->ordered_operations_mutex);
743 }
744
745 /*
746  * Start IO and wait for a given ordered extent to finish.
747  *
748  * Wait on page writeback for all the pages in the extent and the IO completion
749  * code to insert metadata into the btree corresponding to the extent.
750  */
751 void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
752 {
753         u64 start = entry->file_offset;
754         u64 end = start + entry->num_bytes - 1;
755         struct btrfs_inode *inode = BTRFS_I(entry->inode);
756         bool freespace_inode;
757
758         trace_btrfs_ordered_extent_start(inode, entry);
759
760         /*
761          * If this is a free space inode do not take the ordered extents lockdep
762          * map.
763          */
764         freespace_inode = btrfs_is_free_space_inode(inode);
765
766         /*
767          * pages in the range can be dirty, clean or writeback.  We
768          * start IO on any dirty ones so the wait doesn't stall waiting
769          * for the flusher thread to find them
770          */
771         if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
772                 filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
773
774         if (!freespace_inode)
775                 btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
776         wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
777 }
778
779 /*
780  * Used to wait on ordered extents across a large range of bytes.
781  */
782 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
783 {
784         int ret = 0;
785         int ret_wb = 0;
786         u64 end;
787         u64 orig_end;
788         struct btrfs_ordered_extent *ordered;
789
790         if (start + len < start) {
791                 orig_end = OFFSET_MAX;
792         } else {
793                 orig_end = start + len - 1;
794                 if (orig_end > OFFSET_MAX)
795                         orig_end = OFFSET_MAX;
796         }
797
798         /* start IO across the range first to instantiate any delalloc
799          * extents
800          */
801         ret = btrfs_fdatawrite_range(inode, start, orig_end);
802         if (ret)
803                 return ret;
804
805         /*
806          * If we have a writeback error don't return immediately. Wait first
807          * for any ordered extents that haven't completed yet. This is to make
808          * sure no one can dirty the same page ranges and call writepages()
809          * before the ordered extents complete - to avoid failures (-EEXIST)
810          * when adding the new ordered extents to the ordered tree.
811          */
812         ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
813
814         end = orig_end;
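        /*
         * Walk the ordered extents covering the range from the end towards
         * the start, waiting for each one to complete before moving on to
         * the previous one.
         */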
815         while (1) {
816                 ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
817                 if (!ordered)
818                         break;
819                 if (ordered->file_offset > orig_end) {
820                         btrfs_put_ordered_extent(ordered);
821                         break;
822                 }
823                 if (ordered->file_offset + ordered->num_bytes <= start) {
824                         btrfs_put_ordered_extent(ordered);
825                         break;
826                 }
827                 btrfs_start_ordered_extent(ordered);
828                 end = ordered->file_offset;
829                 /*
830                  * If the ordered extent had an error save the error but don't
831                  * exit without waiting first for all other ordered extents in
832                  * the range to complete.
833                  */
834                 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
835                         ret = -EIO;
836                 btrfs_put_ordered_extent(ordered);
837                 if (end == 0 || end == start)
838                         break;
839                 end--;
840         }
841         return ret_wb ? ret_wb : ret;
842 }
843
844 /*
845  * Find an ordered extent corresponding to @file_offset. Return NULL if
846  * nothing is found, otherwise take a reference on the extent and return it.
847  */
848 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
849                                                          u64 file_offset)
850 {
851         struct btrfs_ordered_inode_tree *tree;
852         struct rb_node *node;
853         struct btrfs_ordered_extent *entry = NULL;
854         unsigned long flags;
855
856         tree = &inode->ordered_tree;
857         spin_lock_irqsave(&tree->lock, flags);
858         node = tree_search(tree, file_offset);
859         if (!node)
860                 goto out;
861
862         entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
863         if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
864                 entry = NULL;
865         if (entry) {
866                 refcount_inc(&entry->refs);
867                 trace_btrfs_ordered_extent_lookup(inode, entry);
868         }
869 out:
870         spin_unlock_irqrestore(&tree->lock, flags);
871         return entry;
872 }
873
874 /* Since the DIO code tries to lock a wide area we need to look for any ordered
875  * extents that exist in the range, rather than just the start of the range.
876  */
877 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
878                 struct btrfs_inode *inode, u64 file_offset, u64 len)
879 {
880         struct btrfs_ordered_inode_tree *tree;
881         struct rb_node *node;
882         struct btrfs_ordered_extent *entry = NULL;
883
884         tree = &inode->ordered_tree;
885         spin_lock_irq(&tree->lock);
886         node = tree_search(tree, file_offset);
887         if (!node) {
888                 node = tree_search(tree, file_offset + len);
889                 if (!node)
890                         goto out;
891         }
892
893         while (1) {
894                 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
895                 if (range_overlaps(entry, file_offset, len))
896                         break;
897
898                 if (entry->file_offset >= file_offset + len) {
899                         entry = NULL;
900                         break;
901                 }
902                 entry = NULL;
903                 node = rb_next(node);
904                 if (!node)
905                         break;
906         }
907 out:
908         if (entry) {
909                 refcount_inc(&entry->refs);
910                 trace_btrfs_ordered_extent_lookup_range(inode, entry);
911         }
912         spin_unlock_irq(&tree->lock);
913         return entry;
914 }
915
916 /*
917  * Adds all ordered extents to the given list. The list ends up sorted by the
918  * file_offset of the ordered extents.
919  */
920 void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
921                                            struct list_head *list)
922 {
923         struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
924         struct rb_node *n;
925
926         ASSERT(inode_is_locked(&inode->vfs_inode));
927
928         spin_lock_irq(&tree->lock);
929         for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
930                 struct btrfs_ordered_extent *ordered;
931
932                 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
933
934                 if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
935                         continue;
936
937                 ASSERT(list_empty(&ordered->log_list));
938                 list_add_tail(&ordered->log_list, list);
939                 refcount_inc(&ordered->refs);
940                 trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered);
941         }
942         spin_unlock_irq(&tree->lock);
943 }
944
945 /*
946  * lookup and return any extent before 'file_offset'.  NULL is returned
947  * if none is found
948  */
949 struct btrfs_ordered_extent *
950 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
951 {
952         struct btrfs_ordered_inode_tree *tree;
953         struct rb_node *node;
954         struct btrfs_ordered_extent *entry = NULL;
955
956         tree = &inode->ordered_tree;
957         spin_lock_irq(&tree->lock);
958         node = tree_search(tree, file_offset);
959         if (!node)
960                 goto out;
961
962         entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
963         refcount_inc(&entry->refs);
964         trace_btrfs_ordered_extent_lookup_first(inode, entry);
965 out:
966         spin_unlock_irq(&tree->lock);
967         return entry;
968 }
969
970 /*
971  * Lookup the first ordered extent that overlaps the range
972  * [@file_offset, @file_offset + @len).
973  *
974  * The difference between this and btrfs_lookup_first_ordered_extent() is
975  * that this one won't return any ordered extent that does not overlap the range.
976  * And the difference from btrfs_lookup_ordered_extent() is that this function
977  * ensures the first ordered extent in the range gets returned.
978  */
979 struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
980                         struct btrfs_inode *inode, u64 file_offset, u64 len)
981 {
982         struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
983         struct rb_node *node;
984         struct rb_node *cur;
985         struct rb_node *prev;
986         struct rb_node *next;
987         struct btrfs_ordered_extent *entry = NULL;
988
989         spin_lock_irq(&tree->lock);
990         node = tree->tree.rb_node;
991         /*
992          * Here we don't want to use tree_search() which will use tree->last
993          * and screw up the search order.
994          * And __tree_search() can't return the adjacent ordered extents
995          * either, thus here we do our own search.
996          */
997         while (node) {
998                 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
999
1000                 if (file_offset < entry->file_offset) {
1001                         node = node->rb_left;
1002                 } else if (file_offset >= entry_end(entry)) {
1003                         node = node->rb_right;
1004                 } else {
1005                         /*
1006                          * Direct hit, got an ordered extent that starts at
1007                          * @file_offset
1008                          */
1009                         goto out;
1010                 }
1011         }
1012         if (!entry) {
1013                 /* Empty tree */
1014                 goto out;
1015         }
1016
1017         cur = &entry->rb_node;
1018         /* We got an entry around @file_offset, check adjacent entries */
1019         if (entry->file_offset < file_offset) {
1020                 prev = cur;
1021                 next = rb_next(cur);
1022         } else {
1023                 prev = rb_prev(cur);
1024                 next = cur;
1025         }
1026         if (prev) {
1027                 entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node);
1028                 if (range_overlaps(entry, file_offset, len))
1029                         goto out;
1030         }
1031         if (next) {
1032                 entry = rb_entry(next, struct btrfs_ordered_extent, rb_node);
1033                 if (range_overlaps(entry, file_offset, len))
1034                         goto out;
1035         }
1036         /* No ordered extent in the range */
1037         entry = NULL;
1038 out:
1039         if (entry) {
1040                 refcount_inc(&entry->refs);
1041                 trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
1042         }
1043
1044         spin_unlock_irq(&tree->lock);
1045         return entry;
1046 }
1047
1048 /*
1049  * Lock the passed range and ensure all pending ordered extents in it are run
1050  * to completion.
1051  *
1052  * @inode:        Inode whose ordered tree is to be searched
1053  * @start:        Beginning of range to flush
1054  * @end:          Last byte of range to lock
1055  * @cached_state: If passed, will return the extent state responsible for the
1056  *                locked range. It's the caller's responsibility to free the
1057  *                cached state.
1058  *
1059  * Always returns with the given range locked, ensuring that after it's called
1060  * no ordered extent can be pending.
1061  */
1062 void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
1063                                         u64 end,
1064                                         struct extent_state **cached_state)
1065 {
1066         struct btrfs_ordered_extent *ordered;
1067         struct extent_state *cache = NULL;
1068         struct extent_state **cachedp = &cache;
1069
1070         if (cached_state)
1071                 cachedp = cached_state;
1072
1073         while (1) {
1074                 lock_extent(&inode->io_tree, start, end, cachedp);
1075                 ordered = btrfs_lookup_ordered_range(inode, start,
1076                                                      end - start + 1);
1077                 if (!ordered) {
1078                         /*
1079                          * If no external cached_state has been passed then
1080                          * decrement the extra ref taken for cachedp since we
1081                          * aren't exposing it outside of this function
1082                          */
1083                         if (!cached_state)
1084                                 refcount_dec(&cache->refs);
1085                         break;
1086                 }
1087                 unlock_extent(&inode->io_tree, start, end, cachedp);
1088                 btrfs_start_ordered_extent(ordered);
1089                 btrfs_put_ordered_extent(ordered);
1090         }
1091 }
1092
1093 /*
1094  * Try to lock the passed range in nowait mode, failing if there are any
1095  * pending ordered extents in it.
1096  *
1097  * Return true if the range was locked and no pending ordered extents were
1098  * found in it, otherwise false.
1099  */
1100 bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
1101                                   struct extent_state **cached_state)
1102 {
1103         struct btrfs_ordered_extent *ordered;
1104
1105         if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
1106                 return false;
1107
1108         ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
1109         if (!ordered)
1110                 return true;
1111
1112         btrfs_put_ordered_extent(ordered);
1113         unlock_extent(&inode->io_tree, start, end, cached_state);
1114
1115         return false;
1116 }
1117
1118 /* Split out a new ordered extent for the first @len bytes of @ordered. */
1119 struct btrfs_ordered_extent *btrfs_split_ordered_extent(
1120                         struct btrfs_ordered_extent *ordered, u64 len)
1121 {
1122         struct inode *inode = ordered->inode;
1123         struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
1124         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1125         u64 file_offset = ordered->file_offset;
1126         u64 disk_bytenr = ordered->disk_bytenr;
1127         unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
1128         struct rb_node *node;
1129
1130         trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
1131
1132         ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
1133
1134         /*
1135          * The entire bio must be covered by the ordered extent, but we can't
1136          * reduce the original extent to a zero length either.
1137          */
1138         if (WARN_ON_ONCE(len >= ordered->num_bytes))
1139                 return ERR_PTR(-EINVAL);
1140         /* We cannot split once ordered extent is past end_bio. */
1141         if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
1142                 return ERR_PTR(-EINVAL);
1143         /* We cannot split a compressed ordered extent. */
1144         if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
1145                 return ERR_PTR(-EINVAL);
1146         /* Checksum list should be empty. */
1147         if (WARN_ON_ONCE(!list_empty(&ordered->list)))
1148                 return ERR_PTR(-EINVAL);
1149
1150         spin_lock_irq(&tree->lock);
1151         /* Remove from tree once */
1152         node = &ordered->rb_node;
1153         rb_erase(node, &tree->tree);
1154         RB_CLEAR_NODE(node);
1155         if (tree->last == node)
1156                 tree->last = NULL;
1157
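        /*
         * Trim the front @len bytes off the existing ordered extent; a new
         * ordered extent covering that front range is allocated below.
         */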
1158         ordered->file_offset += len;
1159         ordered->disk_bytenr += len;
1160         ordered->num_bytes -= len;
1161         ordered->disk_num_bytes -= len;
1162         ordered->bytes_left -= len;
1163
1164         /* Re-insert the node */
1165         node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
1166         if (node)
1167                 btrfs_panic(fs_info, -EEXIST,
1168                         "zoned: inconsistency in ordered tree at offset %llu",
1169                             ordered->file_offset);
1170
1171         spin_unlock_irq(&tree->lock);
1172
1173         /*
1174          * The splitting extent is already counted and will be added again in
1175          * btrfs_alloc_ordered_extent(). Subtract len to avoid double counting.
1176          */
1177         percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch);
1178
1179         return btrfs_alloc_ordered_extent(BTRFS_I(inode), file_offset, len, len,
1180                                           disk_bytenr, len, 0, flags,
1181                                           ordered->compress_type);
1182 }
1183
1184 int __init ordered_data_init(void)
1185 {
1186         btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
1187                                      sizeof(struct btrfs_ordered_extent), 0,
1188                                      SLAB_MEM_SPREAD,
1189                                      NULL);
1190         if (!btrfs_ordered_extent_cache)
1191                 return -ENOMEM;
1192
1193         return 0;
1194 }
1195
1196 void __cold ordered_data_exit(void)
1197 {
1198         kmem_cache_destroy(btrfs_ordered_extent_cache);
1199 }