1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2011 Fujitsu.  All rights reserved.
4  * Written by Miao Xie <miaox@cn.fujitsu.com>
5  */
6
7 #include <linux/slab.h>
8 #include <linux/iversion.h>
9 #include "misc.h"
10 #include "delayed-inode.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "ctree.h"
14 #include "qgroup.h"
15 #include "locking.h"
16 #include "inode-item.h"
17
18 #define BTRFS_DELAYED_WRITEBACK         512
19 #define BTRFS_DELAYED_BACKGROUND        128
20 #define BTRFS_DELAYED_BATCH             16
21
22 static struct kmem_cache *delayed_node_cache;
23
24 int __init btrfs_delayed_inode_init(void)
25 {
26         delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
27                                         sizeof(struct btrfs_delayed_node),
28                                         0,
29                                         SLAB_MEM_SPREAD,
30                                         NULL);
31         if (!delayed_node_cache)
32                 return -ENOMEM;
33         return 0;
34 }
35
36 void __cold btrfs_delayed_inode_exit(void)
37 {
38         kmem_cache_destroy(delayed_node_cache);
39 }
40
41 static inline void btrfs_init_delayed_node(
42                                 struct btrfs_delayed_node *delayed_node,
43                                 struct btrfs_root *root, u64 inode_id)
44 {
45         delayed_node->root = root;
46         delayed_node->inode_id = inode_id;
47         refcount_set(&delayed_node->refs, 0);
48         delayed_node->ins_root = RB_ROOT_CACHED;
49         delayed_node->del_root = RB_ROOT_CACHED;
50         mutex_init(&delayed_node->mutex);
51         INIT_LIST_HEAD(&delayed_node->n_list);
52         INIT_LIST_HEAD(&delayed_node->p_list);
53 }
54
55 static struct btrfs_delayed_node *btrfs_get_delayed_node(
56                 struct btrfs_inode *btrfs_inode)
57 {
58         struct btrfs_root *root = btrfs_inode->root;
59         u64 ino = btrfs_ino(btrfs_inode);
60         struct btrfs_delayed_node *node;
61
62         node = READ_ONCE(btrfs_inode->delayed_node);
63         if (node) {
64                 refcount_inc(&node->refs);
65                 return node;
66         }
67
68         spin_lock(&root->inode_lock);
69         node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
70
71         if (node) {
72                 if (btrfs_inode->delayed_node) {
73                         refcount_inc(&node->refs);      /* can be accessed */
74                         BUG_ON(btrfs_inode->delayed_node != node);
75                         spin_unlock(&root->inode_lock);
76                         return node;
77                 }
78
79                 /*
80                  * It's possible that we're racing into the middle of removing
81                  * this node from the radix tree.  In this case, the refcount
82                  * was zero and it should never go back to one.  Just return
83                  * NULL like it was never in the radix at all; our release
84                  * function is in the process of removing it.
85                  *
86                  * Some implementations of refcount_inc refuse to bump the
87                  * refcount once it has hit zero.  If we don't do this dance
88                  * here, refcount_inc() may decide to just WARN_ONCE() instead
89                  * of actually bumping the refcount.
90                  *
91                  * If this node is properly in the radix, we want to bump the
92                  * refcount twice, once for the inode and once for this get
93                  * operation.
94                  */
95                 if (refcount_inc_not_zero(&node->refs)) {
96                         refcount_inc(&node->refs);
97                         btrfs_inode->delayed_node = node;
98                 } else {
99                         node = NULL;
100                 }
101
102                 spin_unlock(&root->inode_lock);
103                 return node;
104         }
105         spin_unlock(&root->inode_lock);
106
107         return NULL;
108 }
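
/*
 * Illustrative sketch, not from the btrfs sources: the lookup above only
 * hands out a node found in the radix tree if its refcount can be raised
 * from a non-zero value; otherwise the node is being torn down and is
 * treated as absent. The names mirror the surrounding code.
 *
 *      spin_lock(&root->inode_lock);
 *      node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
 *      if (node && !refcount_inc_not_zero(&node->refs))
 *              node = NULL;    // release path is freeing it, treat as not found
 *      spin_unlock(&root->inode_lock);
 */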
109
110 /* Will return either the node or ERR_PTR(-ENOMEM). */
111 static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
112                 struct btrfs_inode *btrfs_inode)
113 {
114         struct btrfs_delayed_node *node;
115         struct btrfs_root *root = btrfs_inode->root;
116         u64 ino = btrfs_ino(btrfs_inode);
117         int ret;
118
119 again:
120         node = btrfs_get_delayed_node(btrfs_inode);
121         if (node)
122                 return node;
123
124         node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
125         if (!node)
126                 return ERR_PTR(-ENOMEM);
127         btrfs_init_delayed_node(node, root, ino);
128
129         /* cached in the btrfs inode and can be accessed */
130         refcount_set(&node->refs, 2);
131
132         ret = radix_tree_preload(GFP_NOFS);
133         if (ret) {
134                 kmem_cache_free(delayed_node_cache, node);
135                 return ERR_PTR(ret);
136         }
137
138         spin_lock(&root->inode_lock);
139         ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
140         if (ret == -EEXIST) {
141                 spin_unlock(&root->inode_lock);
142                 kmem_cache_free(delayed_node_cache, node);
143                 radix_tree_preload_end();
144                 goto again;
145         }
146         btrfs_inode->delayed_node = node;
147         spin_unlock(&root->inode_lock);
148         radix_tree_preload_end();
149
150         return node;
151 }
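
/*
 * Illustrative sketch, not from the btrfs sources, of the preload/insert
 * pattern used above: radix_tree_preload() allocates tree nodes up front so
 * the insertion under the spinlock cannot fail with -ENOMEM, and -EEXIST
 * means another task inserted its node first, so ours is freed and the
 * lookup is retried.
 *
 *      ret = radix_tree_preload(GFP_NOFS);
 *      if (ret)
 *              return ERR_PTR(ret);
 *      spin_lock(&root->inode_lock);
 *      ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
 *      spin_unlock(&root->inode_lock);
 *      radix_tree_preload_end();
 *      if (ret == -EEXIST) {
 *              kmem_cache_free(delayed_node_cache, node);
 *              goto again;     // look up the winner's node instead
 *      }
 */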
152
153 /*
154  * Call this while holding delayed_node->mutex.
155  *
156  * If mod = 1, also add this node to the prepare list.
157  */
158 static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
159                                      struct btrfs_delayed_node *node,
160                                      int mod)
161 {
162         spin_lock(&root->lock);
163         if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
164                 if (!list_empty(&node->p_list))
165                         list_move_tail(&node->p_list, &root->prepare_list);
166                 else if (mod)
167                         list_add_tail(&node->p_list, &root->prepare_list);
168         } else {
169                 list_add_tail(&node->n_list, &root->node_list);
170                 list_add_tail(&node->p_list, &root->prepare_list);
171                 refcount_inc(&node->refs);      /* inserted into list */
172                 root->nodes++;
173                 set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
174         }
175         spin_unlock(&root->lock);
176 }
177
178 /* Call this while holding delayed_node->mutex. */
179 static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
180                                        struct btrfs_delayed_node *node)
181 {
182         spin_lock(&root->lock);
183         if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
184                 root->nodes--;
185                 refcount_dec(&node->refs);      /* not in the list */
186                 list_del_init(&node->n_list);
187                 if (!list_empty(&node->p_list))
188                         list_del_init(&node->p_list);
189                 clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
190         }
191         spin_unlock(&root->lock);
192 }
193
194 static struct btrfs_delayed_node *btrfs_first_delayed_node(
195                         struct btrfs_delayed_root *delayed_root)
196 {
197         struct list_head *p;
198         struct btrfs_delayed_node *node = NULL;
199
200         spin_lock(&delayed_root->lock);
201         if (list_empty(&delayed_root->node_list))
202                 goto out;
203
204         p = delayed_root->node_list.next;
205         node = list_entry(p, struct btrfs_delayed_node, n_list);
206         refcount_inc(&node->refs);
207 out:
208         spin_unlock(&delayed_root->lock);
209
210         return node;
211 }
212
213 static struct btrfs_delayed_node *btrfs_next_delayed_node(
214                                                 struct btrfs_delayed_node *node)
215 {
216         struct btrfs_delayed_root *delayed_root;
217         struct list_head *p;
218         struct btrfs_delayed_node *next = NULL;
219
220         delayed_root = node->root->fs_info->delayed_root;
221         spin_lock(&delayed_root->lock);
222         if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
223                 /* not in the list */
224                 if (list_empty(&delayed_root->node_list))
225                         goto out;
226                 p = delayed_root->node_list.next;
227         } else if (list_is_last(&node->n_list, &delayed_root->node_list))
228                 goto out;
229         else
230                 p = node->n_list.next;
231
232         next = list_entry(p, struct btrfs_delayed_node, n_list);
233         refcount_inc(&next->refs);
234 out:
235         spin_unlock(&delayed_root->lock);
236
237         return next;
238 }
239
240 static void __btrfs_release_delayed_node(
241                                 struct btrfs_delayed_node *delayed_node,
242                                 int mod)
243 {
244         struct btrfs_delayed_root *delayed_root;
245
246         if (!delayed_node)
247                 return;
248
249         delayed_root = delayed_node->root->fs_info->delayed_root;
250
251         mutex_lock(&delayed_node->mutex);
252         if (delayed_node->count)
253                 btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
254         else
255                 btrfs_dequeue_delayed_node(delayed_root, delayed_node);
256         mutex_unlock(&delayed_node->mutex);
257
258         if (refcount_dec_and_test(&delayed_node->refs)) {
259                 struct btrfs_root *root = delayed_node->root;
260
261                 spin_lock(&root->inode_lock);
262                 /*
263                  * Once our refcount goes to zero, nobody is allowed to bump it
264                  * back up.  We can delete it now.
265                  */
266                 ASSERT(refcount_read(&delayed_node->refs) == 0);
267                 radix_tree_delete(&root->delayed_nodes_tree,
268                                   delayed_node->inode_id);
269                 spin_unlock(&root->inode_lock);
270                 kmem_cache_free(delayed_node_cache, delayed_node);
271         }
272 }
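
/*
 * Illustrative sketch, not from the btrfs sources, of the teardown pattern
 * used above: only the task that drops the last reference removes the node
 * from the radix tree, and it does so under root->inode_lock so concurrent
 * lookups either fail refcount_inc_not_zero() or do not see the node at all.
 *
 *      if (refcount_dec_and_test(&node->refs)) {
 *              spin_lock(&root->inode_lock);
 *              radix_tree_delete(&root->delayed_nodes_tree, node->inode_id);
 *              spin_unlock(&root->inode_lock);
 *              kmem_cache_free(delayed_node_cache, node);
 *      }
 */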
273
274 static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
275 {
276         __btrfs_release_delayed_node(node, 0);
277 }
278
279 static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
280                                         struct btrfs_delayed_root *delayed_root)
281 {
282         struct list_head *p;
283         struct btrfs_delayed_node *node = NULL;
284
285         spin_lock(&delayed_root->lock);
286         if (list_empty(&delayed_root->prepare_list))
287                 goto out;
288
289         p = delayed_root->prepare_list.next;
290         list_del_init(p);
291         node = list_entry(p, struct btrfs_delayed_node, p_list);
292         refcount_inc(&node->refs);
293 out:
294         spin_unlock(&delayed_root->lock);
295
296         return node;
297 }
298
299 static inline void btrfs_release_prepared_delayed_node(
300                                         struct btrfs_delayed_node *node)
301 {
302         __btrfs_release_delayed_node(node, 1);
303 }
304
305 static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
306                                            struct btrfs_delayed_node *node,
307                                            enum btrfs_delayed_item_type type)
308 {
309         struct btrfs_delayed_item *item;
310
311         item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
312         if (item) {
313                 item->data_len = data_len;
314                 item->type = type;
315                 item->bytes_reserved = 0;
316                 item->delayed_node = node;
317                 RB_CLEAR_NODE(&item->rb_node);
318                 INIT_LIST_HEAD(&item->log_list);
319                 item->logged = false;
320                 refcount_set(&item->refs, 1);
321         }
322         return item;
323 }
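
/*
 * Illustrative note, not from the btrfs sources: the single kmalloc() above
 * sizes the allocation as the item header plus data_len bytes for the data[]
 * flexible array, so the item and its payload (e.g. a struct btrfs_dir_item
 * followed by the file name) share one buffer and are freed together.
 *
 *      item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);   // header + payload
 *      if (item)
 *              memcpy(item->data, payload, data_len);   // "payload" is hypothetical
 *      ...
 *      kfree(item);    // releases both at once
 */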
324
325 /*
326  * __btrfs_lookup_delayed_item - look up a delayed item by its dir index
327  * @root:  the rbtree root of the delayed node (ins_root or del_root)
328  * @index: the dir index value to look up (offset of a dir index key)
329  *
330  * Returns the delayed item with the given index, or NULL if there is no
331  * item with that index in the tree.
332  */
333 static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
334                                 struct rb_root *root,
335                                 u64 index)
336 {
337         struct rb_node *node = root->rb_node;
338         struct btrfs_delayed_item *delayed_item = NULL;
339
340         while (node) {
341                 delayed_item = rb_entry(node, struct btrfs_delayed_item,
342                                         rb_node);
343                 if (delayed_item->index < index)
344                         node = node->rb_right;
345                 else if (delayed_item->index > index)
346                         node = node->rb_left;
347                 else
348                         return delayed_item;
349         }
350
351         return NULL;
352 }
353
354 static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
355                                     struct btrfs_delayed_item *ins)
356 {
357         struct rb_node **p, *node;
358         struct rb_node *parent_node = NULL;
359         struct rb_root_cached *root;
360         struct btrfs_delayed_item *item;
361         bool leftmost = true;
362
363         if (ins->type == BTRFS_DELAYED_INSERTION_ITEM)
364                 root = &delayed_node->ins_root;
365         else
366                 root = &delayed_node->del_root;
367
368         p = &root->rb_root.rb_node;
369         node = &ins->rb_node;
370
371         while (*p) {
372                 parent_node = *p;
373                 item = rb_entry(parent_node, struct btrfs_delayed_item,
374                                  rb_node);
375
376                 if (item->index < ins->index) {
377                         p = &(*p)->rb_right;
378                         leftmost = false;
379                 } else if (item->index > ins->index) {
380                         p = &(*p)->rb_left;
381                 } else {
382                         return -EEXIST;
383                 }
384         }
385
386         rb_link_node(node, parent_node, p);
387         rb_insert_color_cached(node, root, leftmost);
388
389         if (ins->type == BTRFS_DELAYED_INSERTION_ITEM &&
390             ins->index >= delayed_node->index_cnt)
391                 delayed_node->index_cnt = ins->index + 1;
392
393         delayed_node->count++;
394         atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
395         return 0;
396 }
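
/*
 * Illustrative sketch, not from the btrfs sources, of the cached-rbtree
 * insertion used above: the tree is keyed by the dir index and the leftmost
 * hint is cleared as soon as the walk descends to the right, which keeps
 * rb_first_cached() O(1) for the flush code (the real function additionally
 * rejects equal keys with -EEXIST).
 *
 *      struct rb_node **p = &root->rb_root.rb_node, *parent = NULL;
 *      bool leftmost = true;
 *
 *      while (*p) {
 *              parent = *p;
 *              entry = rb_entry(parent, struct btrfs_delayed_item, rb_node);
 *              if (entry->index < ins->index) {
 *                      p = &(*p)->rb_right;
 *                      leftmost = false;       // new node is not the smallest key
 *              } else {
 *                      p = &(*p)->rb_left;
 *              }
 *      }
 *      rb_link_node(&ins->rb_node, parent, p);
 *      rb_insert_color_cached(&ins->rb_node, root, leftmost);
 */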
397
398 static void finish_one_item(struct btrfs_delayed_root *delayed_root)
399 {
400         int seq = atomic_inc_return(&delayed_root->items_seq);
401
402         /* atomic_dec_return implies a barrier */
403         if (atomic_dec_return(&delayed_root->items) <
404             BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0)
405                 cond_wake_up_nomb(&delayed_root->wait);
406 }
407
408 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
409 {
410         struct rb_root_cached *root;
411         struct btrfs_delayed_root *delayed_root;
412
413         /* Not inserted, ignore it. */
414         if (RB_EMPTY_NODE(&delayed_item->rb_node))
415                 return;
416
417         delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
418
419         BUG_ON(!delayed_root);
420
421         if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM)
422                 root = &delayed_item->delayed_node->ins_root;
423         else
424                 root = &delayed_item->delayed_node->del_root;
425
426         rb_erase_cached(&delayed_item->rb_node, root);
427         RB_CLEAR_NODE(&delayed_item->rb_node);
428         delayed_item->delayed_node->count--;
429
430         finish_one_item(delayed_root);
431 }
432
433 static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
434 {
435         if (item) {
436                 __btrfs_remove_delayed_item(item);
437                 if (refcount_dec_and_test(&item->refs))
438                         kfree(item);
439         }
440 }
441
442 static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
443                                         struct btrfs_delayed_node *delayed_node)
444 {
445         struct rb_node *p;
446         struct btrfs_delayed_item *item = NULL;
447
448         p = rb_first_cached(&delayed_node->ins_root);
449         if (p)
450                 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
451
452         return item;
453 }
454
455 static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
456                                         struct btrfs_delayed_node *delayed_node)
457 {
458         struct rb_node *p;
459         struct btrfs_delayed_item *item = NULL;
460
461         p = rb_first_cached(&delayed_node->del_root);
462         if (p)
463                 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
464
465         return item;
466 }
467
468 static struct btrfs_delayed_item *__btrfs_next_delayed_item(
469                                                 struct btrfs_delayed_item *item)
470 {
471         struct rb_node *p;
472         struct btrfs_delayed_item *next = NULL;
473
474         p = rb_next(&item->rb_node);
475         if (p)
476                 next = rb_entry(p, struct btrfs_delayed_item, rb_node);
477
478         return next;
479 }
480
481 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
482                                                struct btrfs_delayed_item *item)
483 {
484         struct btrfs_block_rsv *src_rsv;
485         struct btrfs_block_rsv *dst_rsv;
486         struct btrfs_fs_info *fs_info = trans->fs_info;
487         u64 num_bytes;
488         int ret;
489
490         if (!trans->bytes_reserved)
491                 return 0;
492
493         src_rsv = trans->block_rsv;
494         dst_rsv = &fs_info->delayed_block_rsv;
495
496         num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
497
498         /*
499          * Here we migrate the space rsv from the transaction rsv, since we
500          * have already reserved space when starting the transaction.  So there
501          * is no need to reserve qgroup space here.
502          */
503         ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
504         if (!ret) {
505                 trace_btrfs_space_reservation(fs_info, "delayed_item",
506                                               item->delayed_node->inode_id,
507                                               num_bytes, 1);
508                 /*
509                  * For insertions we track reserved metadata space by accounting
510                  * for the number of leaves that will be used, based on the delayed
511                  * node's curr_index_batch_size and index_item_leaves fields.
512                  */
513                 if (item->type == BTRFS_DELAYED_DELETION_ITEM)
514                         item->bytes_reserved = num_bytes;
515         }
516
517         return ret;
518 }
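
/*
 * Illustrative sketch, not from the btrfs sources: a deletion item does not
 * reserve anything new, it moves one unit of already-reserved metadata space
 * from the transaction's reserve into the dedicated delayed reserve, which
 * is also why no qgroup reservation is needed here.
 *
 *      num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
 *      ret = btrfs_block_rsv_migrate(trans->block_rsv,
 *                                    &fs_info->delayed_block_rsv,
 *                                    num_bytes, true);
 *      if (!ret)
 *              item->bytes_reserved = num_bytes;       // released at flush time
 */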
519
520 static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
521                                                 struct btrfs_delayed_item *item)
522 {
523         struct btrfs_block_rsv *rsv;
524         struct btrfs_fs_info *fs_info = root->fs_info;
525
526         if (!item->bytes_reserved)
527                 return;
528
529         rsv = &fs_info->delayed_block_rsv;
530         /*
531          * Check btrfs_delayed_item_reserve_metadata() to see why we don't need
532          * to release/reserve qgroup space.
533          */
534         trace_btrfs_space_reservation(fs_info, "delayed_item",
535                                       item->delayed_node->inode_id,
536                                       item->bytes_reserved, 0);
537         btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL);
538 }
539
540 static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node,
541                                               unsigned int num_leaves)
542 {
543         struct btrfs_fs_info *fs_info = node->root->fs_info;
544         const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves);
545
546         /* There are no space reservations during log replay, bail out. */
547         if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
548                 return;
549
550         trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id,
551                                       bytes, 0);
552         btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL);
553 }
554
555 static int btrfs_delayed_inode_reserve_metadata(
556                                         struct btrfs_trans_handle *trans,
557                                         struct btrfs_root *root,
558                                         struct btrfs_delayed_node *node)
559 {
560         struct btrfs_fs_info *fs_info = root->fs_info;
561         struct btrfs_block_rsv *src_rsv;
562         struct btrfs_block_rsv *dst_rsv;
563         u64 num_bytes;
564         int ret;
565
566         src_rsv = trans->block_rsv;
567         dst_rsv = &fs_info->delayed_block_rsv;
568
569         num_bytes = btrfs_calc_metadata_size(fs_info, 1);
570
571         /*
572          * btrfs_dirty_inode will update the inode under btrfs_join_transaction
573          * which, for speed, doesn't reserve space.  This is a problem since we
574          * still need to reserve space for this update, so try to reserve the
575          * space.
576          *
577          * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
578          * we always reserve enough to update the inode item.
579          */
580         if (!src_rsv || (!trans->bytes_reserved &&
581                          src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
582                 ret = btrfs_qgroup_reserve_meta(root, num_bytes,
583                                           BTRFS_QGROUP_RSV_META_PREALLOC, true);
584                 if (ret < 0)
585                         return ret;
586                 ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,
587                                           BTRFS_RESERVE_NO_FLUSH);
588                 /* NO_FLUSH could only fail with -ENOSPC */
589                 ASSERT(ret == 0 || ret == -ENOSPC);
590                 if (ret)
591                         btrfs_qgroup_free_meta_prealloc(root, num_bytes);
592         } else {
593                 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
594         }
595
596         if (!ret) {
597                 trace_btrfs_space_reservation(fs_info, "delayed_inode",
598                                               node->inode_id, num_bytes, 1);
599                 node->bytes_reserved = num_bytes;
600         }
601
602         return ret;
603 }
604
605 static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
606                                                 struct btrfs_delayed_node *node,
607                                                 bool qgroup_free)
608 {
609         struct btrfs_block_rsv *rsv;
610
611         if (!node->bytes_reserved)
612                 return;
613
614         rsv = &fs_info->delayed_block_rsv;
615         trace_btrfs_space_reservation(fs_info, "delayed_inode",
616                                       node->inode_id, node->bytes_reserved, 0);
617         btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL);
618         if (qgroup_free)
619                 btrfs_qgroup_free_meta_prealloc(node->root,
620                                 node->bytes_reserved);
621         else
622                 btrfs_qgroup_convert_reserved_meta(node->root,
623                                 node->bytes_reserved);
624         node->bytes_reserved = 0;
625 }
626
627 /*
628  * Insert a single delayed item or a batch of delayed items, as many as possible
629  * that fit in a leaf. The delayed items (dir index keys) are sorted by their key
630  * in the rbtree, and if there's a gap between two consecutive dir index items,
631  * then it means at some point we had delayed dir indexes to add but they got
632  * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them
633  * into the subvolume tree. Dir index keys also have their offsets coming from a
634  * monotonically increasing counter, so we can't get new keys with an offset that
635  * fits within a gap between delayed dir index items.
636  */
637 static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
638                                      struct btrfs_root *root,
639                                      struct btrfs_path *path,
640                                      struct btrfs_delayed_item *first_item)
641 {
642         struct btrfs_fs_info *fs_info = root->fs_info;
643         struct btrfs_delayed_node *node = first_item->delayed_node;
644         LIST_HEAD(item_list);
645         struct btrfs_delayed_item *curr;
646         struct btrfs_delayed_item *next;
647         const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info);
648         struct btrfs_item_batch batch;
649         struct btrfs_key first_key;
650         const u32 first_data_size = first_item->data_len;
651         int total_size;
652         char *ins_data = NULL;
653         int ret;
654         bool continuous_keys_only = false;
655
656         lockdep_assert_held(&node->mutex);
657
658         /*
659          * During normal operation the delayed index offset is continuously
660          * increasing, so we can batch insert all items as there will not be any
661          * overlapping keys in the tree.
662          *
663          * The exception to this is log replay, where we may have interleaved
664          * offsets in the tree, so our batch needs to be continuous keys only in
665          * order to ensure we do not end up with out of order items in our leaf.
666          */
667         if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
668                 continuous_keys_only = true;
669
670         /*
671          * For delayed items to insert, we track reserved metadata bytes based
672          * on the number of leaves that we will use.
673          * See btrfs_insert_delayed_dir_index() and
674          * btrfs_delayed_item_reserve_metadata().
675          */
676         ASSERT(first_item->bytes_reserved == 0);
677
678         list_add_tail(&first_item->tree_list, &item_list);
679         batch.total_data_size = first_data_size;
680         batch.nr = 1;
681         total_size = first_data_size + sizeof(struct btrfs_item);
682         curr = first_item;
683
684         while (true) {
685                 int next_size;
686
687                 next = __btrfs_next_delayed_item(curr);
688                 if (!next)
689                         break;
690
691                 /*
692                  * We cannot allow gaps in the key space if we're doing log
693                  * replay.
694                  */
695                 if (continuous_keys_only && (next->index != curr->index + 1))
696                         break;
697
698                 ASSERT(next->bytes_reserved == 0);
699
700                 next_size = next->data_len + sizeof(struct btrfs_item);
701                 if (total_size + next_size > max_size)
702                         break;
703
704                 list_add_tail(&next->tree_list, &item_list);
705                 batch.nr++;
706                 total_size += next_size;
707                 batch.total_data_size += next->data_len;
708                 curr = next;
709         }
710
711         if (batch.nr == 1) {
712                 first_key.objectid = node->inode_id;
713                 first_key.type = BTRFS_DIR_INDEX_KEY;
714                 first_key.offset = first_item->index;
715                 batch.keys = &first_key;
716                 batch.data_sizes = &first_data_size;
717         } else {
718                 struct btrfs_key *ins_keys;
719                 u32 *ins_sizes;
720                 int i = 0;
721
722                 ins_data = kmalloc(batch.nr * sizeof(u32) +
723                                    batch.nr * sizeof(struct btrfs_key), GFP_NOFS);
724                 if (!ins_data) {
725                         ret = -ENOMEM;
726                         goto out;
727                 }
728                 ins_sizes = (u32 *)ins_data;
729                 ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32));
730                 batch.keys = ins_keys;
731                 batch.data_sizes = ins_sizes;
732                 list_for_each_entry(curr, &item_list, tree_list) {
733                         ins_keys[i].objectid = node->inode_id;
734                         ins_keys[i].type = BTRFS_DIR_INDEX_KEY;
735                         ins_keys[i].offset = curr->index;
736                         ins_sizes[i] = curr->data_len;
737                         i++;
738                 }
739         }
740
741         ret = btrfs_insert_empty_items(trans, root, path, &batch);
742         if (ret)
743                 goto out;
744
745         list_for_each_entry(curr, &item_list, tree_list) {
746                 char *data_ptr;
747
748                 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
749                 write_extent_buffer(path->nodes[0], &curr->data,
750                                     (unsigned long)data_ptr, curr->data_len);
751                 path->slots[0]++;
752         }
753
754         /*
755          * Now release our path before releasing the delayed items and their
756          * metadata reservations, so that we don't block other tasks for more
757          * time than needed.
758          */
759         btrfs_release_path(path);
760
761         ASSERT(node->index_item_leaves > 0);
762
763         /*
764          * For normal operations we will batch an entire leaf's worth of delayed
765          * items, so if there are more items to process we can decrement
766          * index_item_leaves by 1 as we inserted 1 leaf's worth of items.
767          *
768          * However for log replay we may not have inserted an entire leaf's
769          * worth of items and may not have had continuous items, so decrementing
770          * here would mess up the index_item_leaves accounting.  For this case
771          * only clean up the accounting when there are no items left.
772          */
773         if (next && !continuous_keys_only) {
774                 /*
775                  * We inserted one batch of items into a leaf and there are more
776                  * items to flush in a future batch, so now release one unit of
777                  * metadata space from the delayed block reserve, corresponding
778                  * to the leaf we just flushed.
779                  */
780                 btrfs_delayed_item_release_leaves(node, 1);
781                 node->index_item_leaves--;
782         } else if (!next) {
783                 /*
784                  * There are no more items to insert. We can have a number of
785                  * reserved leaves > 1 here - this happens when many dir index
786                  * items are added and then removed before they are flushed (file
787                  * names with a very short life that never span a transaction). So
788                  * release all remaining leaves.
789                  */
790                 btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
791                 node->index_item_leaves = 0;
792         }
793
794         list_for_each_entry_safe(curr, next, &item_list, tree_list) {
795                 list_del(&curr->tree_list);
796                 btrfs_release_delayed_item(curr);
797         }
798 out:
799         kfree(ins_data);
800         return ret;
801 }
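
/*
 * Illustrative sizing sketch, not from the btrfs sources: each batched item
 * costs its payload plus one struct btrfs_item header in the leaf, and the
 * batch above is cut as soon as the running total would exceed the leaf
 * data area.
 *
 *      total_size = first->data_len + sizeof(struct btrfs_item);
 *      while ((next = __btrfs_next_delayed_item(curr))) {
 *              next_size = next->data_len + sizeof(struct btrfs_item);
 *              if (total_size + next_size > BTRFS_LEAF_DATA_SIZE(fs_info))
 *                      break;                  // would overflow the leaf
 *              total_size += next_size;
 *              curr = next;
 *      }
 */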
802
803 static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
804                                       struct btrfs_path *path,
805                                       struct btrfs_root *root,
806                                       struct btrfs_delayed_node *node)
807 {
808         int ret = 0;
809
810         while (ret == 0) {
811                 struct btrfs_delayed_item *curr;
812
813                 mutex_lock(&node->mutex);
814                 curr = __btrfs_first_delayed_insertion_item(node);
815                 if (!curr) {
816                         mutex_unlock(&node->mutex);
817                         break;
818                 }
819                 ret = btrfs_insert_delayed_item(trans, root, path, curr);
820                 mutex_unlock(&node->mutex);
821         }
822
823         return ret;
824 }
825
826 static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
827                                     struct btrfs_root *root,
828                                     struct btrfs_path *path,
829                                     struct btrfs_delayed_item *item)
830 {
831         const u64 ino = item->delayed_node->inode_id;
832         struct btrfs_fs_info *fs_info = root->fs_info;
833         struct btrfs_delayed_item *curr, *next;
834         struct extent_buffer *leaf = path->nodes[0];
835         LIST_HEAD(batch_list);
836         int nitems, slot, last_slot;
837         int ret;
838         u64 total_reserved_size = item->bytes_reserved;
839
840         ASSERT(leaf != NULL);
841
842         slot = path->slots[0];
843         last_slot = btrfs_header_nritems(leaf) - 1;
844         /*
845          * Our caller always gives us a path pointing to an existing item, so
846          * this cannot happen.
847          */
848         ASSERT(slot <= last_slot);
849         if (WARN_ON(slot > last_slot))
850                 return -ENOENT;
851
852         nitems = 1;
853         curr = item;
854         list_add_tail(&curr->tree_list, &batch_list);
855
856         /*
857          * Keep checking if the next delayed item matches the next item in the
858          * leaf - if so, we can add it to the batch of items to delete from the
859          * leaf.
860          */
861         while (slot < last_slot) {
862                 struct btrfs_key key;
863
864                 next = __btrfs_next_delayed_item(curr);
865                 if (!next)
866                         break;
867
868                 slot++;
869                 btrfs_item_key_to_cpu(leaf, &key, slot);
870                 if (key.objectid != ino ||
871                     key.type != BTRFS_DIR_INDEX_KEY ||
872                     key.offset != next->index)
873                         break;
874                 nitems++;
875                 curr = next;
876                 list_add_tail(&curr->tree_list, &batch_list);
877                 total_reserved_size += curr->bytes_reserved;
878         }
879
880         ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
881         if (ret)
882                 return ret;
883
884         /* In the case of BTRFS_FS_LOG_RECOVERING, items won't have reserved space. */
885         if (total_reserved_size > 0) {
886                 /*
887                  * Check btrfs_delayed_item_reserve_metadata() to see why we
888                  * don't need to release/reserve qgroup space.
889                  */
890                 trace_btrfs_space_reservation(fs_info, "delayed_item", ino,
891                                               total_reserved_size, 0);
892                 btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv,
893                                         total_reserved_size, NULL);
894         }
895
896         list_for_each_entry_safe(curr, next, &batch_list, tree_list) {
897                 list_del(&curr->tree_list);
898                 btrfs_release_delayed_item(curr);
899         }
900
901         return 0;
902 }
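
/*
 * Illustrative sketch, not from the btrfs sources, of how the deletion batch
 * above grows: as long as the next slot in the leaf holds exactly the dir
 * index key that the next delayed item wants deleted, both cursors advance
 * together and one btrfs_del_items() call removes the whole run.
 *
 *      while (slot < last_slot && (next = __btrfs_next_delayed_item(curr))) {
 *              btrfs_item_key_to_cpu(leaf, &key, slot + 1);
 *              if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY ||
 *                  key.offset != next->index)
 *                      break;          // leaf and delayed items diverge
 *              slot++;
 *              nitems++;
 *              curr = next;
 *      }
 *      btrfs_del_items(trans, root, path, path->slots[0], nitems);
 */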
903
904 static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
905                                       struct btrfs_path *path,
906                                       struct btrfs_root *root,
907                                       struct btrfs_delayed_node *node)
908 {
909         struct btrfs_key key;
910         int ret = 0;
911
912         key.objectid = node->inode_id;
913         key.type = BTRFS_DIR_INDEX_KEY;
914
915         while (ret == 0) {
916                 struct btrfs_delayed_item *item;
917
918                 mutex_lock(&node->mutex);
919                 item = __btrfs_first_delayed_deletion_item(node);
920                 if (!item) {
921                         mutex_unlock(&node->mutex);
922                         break;
923                 }
924
925                 key.offset = item->index;
926                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
927                 if (ret > 0) {
928                         /*
929                          * There's no matching item in the leaf. This means we
930                          * have already deleted this item in a past run of the
931                          * delayed items. We ignore errors when running delayed
932                          * items from an async context, through a work queue job
933                          * running btrfs_async_run_delayed_root(), and don't
934                          * release delayed items that failed to complete. This
935                          * is because we will retry later, and at transaction
936                          * commit time we always run delayed items and will
937                          * then deal with errors if they fail to run again.
938                          *
939                          * So just release delayed items for which we can't find
940                          * an item in the tree, and move to the next item.
941                          */
942                         btrfs_release_path(path);
943                         btrfs_release_delayed_item(item);
944                         ret = 0;
945                 } else if (ret == 0) {
946                         ret = btrfs_batch_delete_items(trans, root, path, item);
947                         btrfs_release_path(path);
948                 }
949
950                 /*
951                  * We unlock and relock on each iteration, this is to prevent
952                  * blocking other tasks for too long while we are being run from
953                  * the async context (work queue job). Those tasks are typically
954                  * running system calls like creat/mkdir/rename/unlink/etc which
955                  * need to add delayed items to this delayed node.
956                  */
957                 mutex_unlock(&node->mutex);
958         }
959
960         return ret;
961 }
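
/*
 * Illustrative note, not from the btrfs sources, on the btrfs_search_slot()
 * return values the loop above relies on: 0 means the exact key was found
 * and path->slots[0] points at it, > 0 means the key is absent (the path
 * points at where it would go), and < 0 is a hard error.
 *
 *      ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 *      if (ret < 0)
 *              return ret;                     // I/O or other hard error
 *      else if (ret > 0)
 *              drop_delayed_deletion();        // hypothetical: already gone
 *      else
 *              batch_delete_from_leaf();       // hypothetical: found, delete run
 */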
962
963 static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
964 {
965         struct btrfs_delayed_root *delayed_root;
966
967         if (delayed_node &&
968             test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
969                 BUG_ON(!delayed_node->root);
970                 clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
971                 delayed_node->count--;
972
973                 delayed_root = delayed_node->root->fs_info->delayed_root;
974                 finish_one_item(delayed_root);
975         }
976 }
977
978 static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
979 {
980
981         if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
982                 struct btrfs_delayed_root *delayed_root;
983
984                 ASSERT(delayed_node->root);
985                 delayed_node->count--;
986
987                 delayed_root = delayed_node->root->fs_info->delayed_root;
988                 finish_one_item(delayed_root);
989         }
990 }
991
992 static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
993                                         struct btrfs_root *root,
994                                         struct btrfs_path *path,
995                                         struct btrfs_delayed_node *node)
996 {
997         struct btrfs_fs_info *fs_info = root->fs_info;
998         struct btrfs_key key;
999         struct btrfs_inode_item *inode_item;
1000         struct extent_buffer *leaf;
1001         int mod;
1002         int ret;
1003
1004         key.objectid = node->inode_id;
1005         key.type = BTRFS_INODE_ITEM_KEY;
1006         key.offset = 0;
1007
1008         if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1009                 mod = -1;
1010         else
1011                 mod = 1;
1012
1013         ret = btrfs_lookup_inode(trans, root, path, &key, mod);
1014         if (ret > 0)
1015                 ret = -ENOENT;
1016         if (ret < 0)
1017                 goto out;
1018
1019         leaf = path->nodes[0];
1020         inode_item = btrfs_item_ptr(leaf, path->slots[0],
1021                                     struct btrfs_inode_item);
1022         write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1023                             sizeof(struct btrfs_inode_item));
1024         btrfs_mark_buffer_dirty(leaf);
1025
1026         if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1027                 goto out;
1028
1029         path->slots[0]++;
1030         if (path->slots[0] >= btrfs_header_nritems(leaf))
1031                 goto search;
1032 again:
1033         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1034         if (key.objectid != node->inode_id)
1035                 goto out;
1036
1037         if (key.type != BTRFS_INODE_REF_KEY &&
1038             key.type != BTRFS_INODE_EXTREF_KEY)
1039                 goto out;
1040
1041         /*
1042          * Delayed iref deletion is only used for an inode that has a single
1043          * link, so there is only one iref. The case of several irefs stored
1044          * in the same item does not occur here.
1045          */
1046         btrfs_del_item(trans, root, path);
1047 out:
1048         btrfs_release_delayed_iref(node);
1049         btrfs_release_path(path);
1050 err_out:
1051         btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
1052         btrfs_release_delayed_inode(node);
1053
1054         /*
1055          * If we fail to update the delayed inode we need to abort the
1056          * transaction, because we could otherwise leave the inode behind with
1057          * improper counts.
1058          */
1059         if (ret && ret != -ENOENT)
1060                 btrfs_abort_transaction(trans, ret);
1061
1062         return ret;
1063
1064 search:
1065         btrfs_release_path(path);
1066
1067         key.type = BTRFS_INODE_EXTREF_KEY;
1068         key.offset = -1;
1069
1070         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1071         if (ret < 0)
1072                 goto err_out;
1073         ASSERT(ret);
1074
1075         ret = 0;
1076         leaf = path->nodes[0];
1077         path->slots[0]--;
1078         goto again;
1079 }
1080
1081 static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1082                                              struct btrfs_root *root,
1083                                              struct btrfs_path *path,
1084                                              struct btrfs_delayed_node *node)
1085 {
1086         int ret;
1087
1088         mutex_lock(&node->mutex);
1089         if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
1090                 mutex_unlock(&node->mutex);
1091                 return 0;
1092         }
1093
1094         ret = __btrfs_update_delayed_inode(trans, root, path, node);
1095         mutex_unlock(&node->mutex);
1096         return ret;
1097 }
1098
1099 static inline int
1100 __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1101                                    struct btrfs_path *path,
1102                                    struct btrfs_delayed_node *node)
1103 {
1104         int ret;
1105
1106         ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1107         if (ret)
1108                 return ret;
1109
1110         ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1111         if (ret)
1112                 return ret;
1113
1114         ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1115         return ret;
1116 }
1117
1118 /*
1119  * Called when committing the transaction.
1120  * Returns 0 on success.
1121  * Returns < 0 on error, in which case the transaction is aborted and any
1122  * outstanding delayed items are cleaned up.
1123  */
1124 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
1125 {
1126         struct btrfs_fs_info *fs_info = trans->fs_info;
1127         struct btrfs_delayed_root *delayed_root;
1128         struct btrfs_delayed_node *curr_node, *prev_node;
1129         struct btrfs_path *path;
1130         struct btrfs_block_rsv *block_rsv;
1131         int ret = 0;
1132         bool count = (nr > 0);
1133
1134         if (TRANS_ABORTED(trans))
1135                 return -EIO;
1136
1137         path = btrfs_alloc_path();
1138         if (!path)
1139                 return -ENOMEM;
1140
1141         block_rsv = trans->block_rsv;
1142         trans->block_rsv = &fs_info->delayed_block_rsv;
1143
1144         delayed_root = fs_info->delayed_root;
1145
1146         curr_node = btrfs_first_delayed_node(delayed_root);
1147         while (curr_node && (!count || nr--)) {
1148                 ret = __btrfs_commit_inode_delayed_items(trans, path,
1149                                                          curr_node);
1150                 if (ret) {
1151                         btrfs_release_delayed_node(curr_node);
1152                         curr_node = NULL;
1153                         btrfs_abort_transaction(trans, ret);
1154                         break;
1155                 }
1156
1157                 prev_node = curr_node;
1158                 curr_node = btrfs_next_delayed_node(curr_node);
1159                 btrfs_release_delayed_node(prev_node);
1160         }
1161
1162         if (curr_node)
1163                 btrfs_release_delayed_node(curr_node);
1164         btrfs_free_path(path);
1165         trans->block_rsv = block_rsv;
1166
1167         return ret;
1168 }
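
/*
 * Illustrative sketch, not from the btrfs sources, of the reference
 * discipline in the loop above: every node is pinned while it is flushed,
 * the next node is taken before the current one is released, and an error
 * aborts the transaction rather than leaving half-flushed state behind.
 *
 *      node = btrfs_first_delayed_node(delayed_root);
 *      while (node) {
 *              ret = __btrfs_commit_inode_delayed_items(trans, path, node);
 *              if (ret) {
 *                      btrfs_release_delayed_node(node);
 *                      btrfs_abort_transaction(trans, ret);
 *                      break;
 *              }
 *              prev = node;
 *              node = btrfs_next_delayed_node(prev);
 *              btrfs_release_delayed_node(prev);
 *      }
 */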
1169
1170 int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
1171 {
1172         return __btrfs_run_delayed_items(trans, -1);
1173 }
1174
1175 int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
1176 {
1177         return __btrfs_run_delayed_items(trans, nr);
1178 }
1179
1180 int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1181                                      struct btrfs_inode *inode)
1182 {
1183         struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1184         struct btrfs_path *path;
1185         struct btrfs_block_rsv *block_rsv;
1186         int ret;
1187
1188         if (!delayed_node)
1189                 return 0;
1190
1191         mutex_lock(&delayed_node->mutex);
1192         if (!delayed_node->count) {
1193                 mutex_unlock(&delayed_node->mutex);
1194                 btrfs_release_delayed_node(delayed_node);
1195                 return 0;
1196         }
1197         mutex_unlock(&delayed_node->mutex);
1198
1199         path = btrfs_alloc_path();
1200         if (!path) {
1201                 btrfs_release_delayed_node(delayed_node);
1202                 return -ENOMEM;
1203         }
1204
1205         block_rsv = trans->block_rsv;
1206         trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1207
1208         ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1209
1210         btrfs_release_delayed_node(delayed_node);
1211         btrfs_free_path(path);
1212         trans->block_rsv = block_rsv;
1213
1214         return ret;
1215 }
1216
1217 int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode)
1218 {
1219         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1220         struct btrfs_trans_handle *trans;
1221         struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1222         struct btrfs_path *path;
1223         struct btrfs_block_rsv *block_rsv;
1224         int ret;
1225
1226         if (!delayed_node)
1227                 return 0;
1228
1229         mutex_lock(&delayed_node->mutex);
1230         if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1231                 mutex_unlock(&delayed_node->mutex);
1232                 btrfs_release_delayed_node(delayed_node);
1233                 return 0;
1234         }
1235         mutex_unlock(&delayed_node->mutex);
1236
1237         trans = btrfs_join_transaction(delayed_node->root);
1238         if (IS_ERR(trans)) {
1239                 ret = PTR_ERR(trans);
1240                 goto out;
1241         }
1242
1243         path = btrfs_alloc_path();
1244         if (!path) {
1245                 ret = -ENOMEM;
1246                 goto trans_out;
1247         }
1248
1249         block_rsv = trans->block_rsv;
1250         trans->block_rsv = &fs_info->delayed_block_rsv;
1251
1252         mutex_lock(&delayed_node->mutex);
1253         if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
1254                 ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
1255                                                    path, delayed_node);
1256         else
1257                 ret = 0;
1258         mutex_unlock(&delayed_node->mutex);
1259
1260         btrfs_free_path(path);
1261         trans->block_rsv = block_rsv;
1262 trans_out:
1263         btrfs_end_transaction(trans);
1264         btrfs_btree_balance_dirty(fs_info);
1265 out:
1266         btrfs_release_delayed_node(delayed_node);
1267
1268         return ret;
1269 }
1270
1271 void btrfs_remove_delayed_node(struct btrfs_inode *inode)
1272 {
1273         struct btrfs_delayed_node *delayed_node;
1274
1275         delayed_node = READ_ONCE(inode->delayed_node);
1276         if (!delayed_node)
1277                 return;
1278
1279         inode->delayed_node = NULL;
1280         btrfs_release_delayed_node(delayed_node);
1281 }
1282
1283 struct btrfs_async_delayed_work {
1284         struct btrfs_delayed_root *delayed_root;
1285         int nr;
1286         struct btrfs_work work;
1287 };
1288
1289 static void btrfs_async_run_delayed_root(struct btrfs_work *work)
1290 {
1291         struct btrfs_async_delayed_work *async_work;
1292         struct btrfs_delayed_root *delayed_root;
1293         struct btrfs_trans_handle *trans;
1294         struct btrfs_path *path;
1295         struct btrfs_delayed_node *delayed_node = NULL;
1296         struct btrfs_root *root;
1297         struct btrfs_block_rsv *block_rsv;
1298         int total_done = 0;
1299
1300         async_work = container_of(work, struct btrfs_async_delayed_work, work);
1301         delayed_root = async_work->delayed_root;
1302
1303         path = btrfs_alloc_path();
1304         if (!path)
1305                 goto out;
1306
1307         do {
1308                 if (atomic_read(&delayed_root->items) <
1309                     BTRFS_DELAYED_BACKGROUND / 2)
1310                         break;
1311
1312                 delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
1313                 if (!delayed_node)
1314                         break;
1315
1316                 root = delayed_node->root;
1317
1318                 trans = btrfs_join_transaction(root);
1319                 if (IS_ERR(trans)) {
1320                         btrfs_release_path(path);
1321                         btrfs_release_prepared_delayed_node(delayed_node);
1322                         total_done++;
1323                         continue;
1324                 }
1325
1326                 block_rsv = trans->block_rsv;
1327                 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1328
1329                 __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1330
1331                 trans->block_rsv = block_rsv;
1332                 btrfs_end_transaction(trans);
1333                 btrfs_btree_balance_dirty_nodelay(root->fs_info);
1334
1335                 btrfs_release_path(path);
1336                 btrfs_release_prepared_delayed_node(delayed_node);
1337                 total_done++;
1338
1339         } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK)
1340                  || total_done < async_work->nr);
1341
1342         btrfs_free_path(path);
1343 out:
1344         wake_up(&delayed_root->wait);
1345         kfree(async_work);
1346 }
1347
1348
1349 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1350                                      struct btrfs_fs_info *fs_info, int nr)
1351 {
1352         struct btrfs_async_delayed_work *async_work;
1353
1354         async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
1355         if (!async_work)
1356                 return -ENOMEM;
1357
1358         async_work->delayed_root = delayed_root;
1359         btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
1360                         NULL);
1361         async_work->nr = nr;
1362
1363         btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
1364         return 0;
1365 }
1366
1367 void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
1368 {
1369         WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
1370 }
1371
1372 static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
1373 {
1374         int val = atomic_read(&delayed_root->items_seq);
1375
1376         if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
1377                 return 1;
1378
1379         if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1380                 return 1;
1381
1382         return 0;
1383 }
1384
1385 void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
1386 {
1387         struct btrfs_delayed_root *delayed_root = fs_info->delayed_root;
1388
1389         if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) ||
1390                 btrfs_workqueue_normal_congested(fs_info->delayed_workers))
1391                 return;
1392
1393         if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
1394                 int seq;
1395                 int ret;
1396
1397                 seq = atomic_read(&delayed_root->items_seq);
1398
1399                 ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
1400                 if (ret)
1401                         return;
1402
1403                 wait_event_interruptible(delayed_root->wait,
1404                                          could_end_wait(delayed_root, seq));
1405                 return;
1406         }
1407
1408         btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
1409 }
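
/*
 * Illustrative summary, not from the btrfs sources, of the thresholds used
 * above in terms of the constants defined at the top of this file:
 *
 *      items < 128 (BTRFS_DELAYED_BACKGROUND):
 *              do nothing
 *      128 <= items < 512 (BTRFS_DELAYED_WRITEBACK):
 *              queue async work for one batch of BTRFS_DELAYED_BATCH (16)
 *              items and return immediately
 *      items >= 512:
 *              queue async work for everything (nr == 0) and wait until
 *              either items_seq has advanced by at least 16 or the total
 *              drops below 128 again
 */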
1410
1411 /* Will return 0 or -ENOMEM */
1412 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1413                                    const char *name, int name_len,
1414                                    struct btrfs_inode *dir,
1415                                    struct btrfs_disk_key *disk_key, u8 type,
1416                                    u64 index)
1417 {
1418         struct btrfs_fs_info *fs_info = trans->fs_info;
1419         const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info);
1420         struct btrfs_delayed_node *delayed_node;
1421         struct btrfs_delayed_item *delayed_item;
1422         struct btrfs_dir_item *dir_item;
1423         bool reserve_leaf_space;
1424         u32 data_len;
1425         int ret;
1426
1427         delayed_node = btrfs_get_or_create_delayed_node(dir);
1428         if (IS_ERR(delayed_node))
1429                 return PTR_ERR(delayed_node);
1430
1431         delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len,
1432                                                 delayed_node,
1433                                                 BTRFS_DELAYED_INSERTION_ITEM);
1434         if (!delayed_item) {
1435                 ret = -ENOMEM;
1436                 goto release_node;
1437         }
1438
1439         delayed_item->index = index;
1440
1441         dir_item = (struct btrfs_dir_item *)delayed_item->data;
1442         dir_item->location = *disk_key;
1443         btrfs_set_stack_dir_transid(dir_item, trans->transid);
1444         btrfs_set_stack_dir_data_len(dir_item, 0);
1445         btrfs_set_stack_dir_name_len(dir_item, name_len);
1446         btrfs_set_stack_dir_type(dir_item, type);
1447         memcpy((char *)(dir_item + 1), name, name_len);
1448
1449         data_len = delayed_item->data_len + sizeof(struct btrfs_item);
1450
1451         mutex_lock(&delayed_node->mutex);
1452
1453         if (delayed_node->index_item_leaves == 0 ||
1454             delayed_node->curr_index_batch_size + data_len > leaf_data_size) {
1455                 delayed_node->curr_index_batch_size = data_len;
1456                 reserve_leaf_space = true;
1457         } else {
1458                 delayed_node->curr_index_batch_size += data_len;
1459                 reserve_leaf_space = false;
1460         }
1461
1462         if (reserve_leaf_space) {
1463                 ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item);
1464                 /*
1465                  * Space was reserved for a dir index item insertion when we
1466                  * started the transaction, so getting a failure here should be
1467                  * impossible.
1468                  */
1469                 if (WARN_ON(ret)) {
1470                         mutex_unlock(&delayed_node->mutex);
1471                         btrfs_release_delayed_item(delayed_item);
1472                         goto release_node;
1473                 }
1474
1475                 delayed_node->index_item_leaves++;
1476         } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
1477                 const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
1478
1479                 /*
1480                  * Adding the new dir index item does not require touching another
1481                  * leaf, so we can release 1 unit of metadata that was previously
1482                  * reserved when starting the transaction. This applies only to
1483                  * the case where we had a transaction start and excludes the
1484                  * transaction join case (when replaying log trees).
1485                  */
1486                 trace_btrfs_space_reservation(fs_info, "transaction",
1487                                               trans->transid, bytes, 0);
1488                 btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL);
1489                 ASSERT(trans->bytes_reserved >= bytes);
1490                 trans->bytes_reserved -= bytes;
1491         }
1492
1493         ret = __btrfs_add_delayed_item(delayed_node, delayed_item);
1494         if (unlikely(ret)) {
1495                 btrfs_err(trans->fs_info,
1496                           "error adding delayed dir index item (name: %.*s) into the insertion tree of the delayed node (root id: %llu, inode id: %llu, errno: %d)",
1497                           name_len, name, delayed_node->root->root_key.objectid,
1498                           delayed_node->inode_id, ret);
1499                 BUG();
1500         }
1501         mutex_unlock(&delayed_node->mutex);
1502
1503 release_node:
1504         btrfs_release_delayed_node(delayed_node);
1505         return ret;
1506 }
1507
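/*
 * Try to cancel a pending dir index insertion with the given index.
 *
 * Return 0 if a matching insertion item was found and dropped (adjusting the
 * reserved leaf space accounting), or 1 if no such item exists and the caller
 * has to queue a deletion item instead.
 */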
1508 static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
1509                                                struct btrfs_delayed_node *node,
1510                                                u64 index)
1511 {
1512         struct btrfs_delayed_item *item;
1513
1514         mutex_lock(&node->mutex);
1515         item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
1516         if (!item) {
1517                 mutex_unlock(&node->mutex);
1518                 return 1;
1519         }
1520
1521         /*
1522          * For delayed items to insert, we track reserved metadata bytes based
1523          * on the number of leaves that we will use.
1524          * See btrfs_insert_delayed_dir_index() and
1525          * btrfs_delayed_item_reserve_metadata().
1526          */
1527         ASSERT(item->bytes_reserved == 0);
1528         ASSERT(node->index_item_leaves > 0);
1529
1530         /*
1531          * If there's only one leaf reserved, we can decrement this item from the
1532          * current batch, otherwise we can not because we don't know which leaf
1533          * it belongs to. With the current limit on delayed items, we rarely
1534          * accumulate enough dir index items to fill more than one leaf (even
1535          * when using a leaf size of 4K).
1536          */
1537         if (node->index_item_leaves == 1) {
1538                 const u32 data_len = item->data_len + sizeof(struct btrfs_item);
1539
1540                 ASSERT(node->curr_index_batch_size >= data_len);
1541                 node->curr_index_batch_size -= data_len;
1542         }
1543
1544         btrfs_release_delayed_item(item);
1545
1546         /* If we now have no more dir index items, we can release all leaves. */
1547         if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) {
1548                 btrfs_delayed_item_release_leaves(node, node->index_item_leaves);
1549                 node->index_item_leaves = 0;
1550         }
1551
1552         mutex_unlock(&node->mutex);
1553         return 0;
1554 }
1555
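/*
 * Delay the deletion of a dir index item: if the index still has a pending
 * insertion item we simply drop that item, otherwise we queue a deletion item
 * that removes the index from the subvolume tree when the delayed node is
 * run. Returns 0 on success or a negative errno.
 */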
1556 int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1557                                    struct btrfs_inode *dir, u64 index)
1558 {
1559         struct btrfs_delayed_node *node;
1560         struct btrfs_delayed_item *item;
1561         int ret;
1562
1563         node = btrfs_get_or_create_delayed_node(dir);
1564         if (IS_ERR(node))
1565                 return PTR_ERR(node);
1566
1567         ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index);
1568         if (!ret)
1569                 goto end;
1570
1571         item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
1572         if (!item) {
1573                 ret = -ENOMEM;
1574                 goto end;
1575         }
1576
1577         item->index = index;
1578
1579         ret = btrfs_delayed_item_reserve_metadata(trans, item);
1580         /*
1581          * We reserved enough space when we started the transaction, so a
1582          * metadata reservation failure here should be impossible.
1583          */
1584         if (ret < 0) {
1585                 btrfs_err(trans->fs_info,
1586 "metadata reservation failed for delayed dir item deltiona, should have been reserved");
1587                 btrfs_release_delayed_item(item);
1588                 goto end;
1589         }
1590
1591         mutex_lock(&node->mutex);
1592         ret = __btrfs_add_delayed_item(node, item);
1593         if (unlikely(ret)) {
1594                 btrfs_err(trans->fs_info,
1595                           "error adding delayed dir index item (index: %llu) into the deletion tree of the delayed node (root id: %llu, inode id: %llu, errno: %d)",
1596                           index, node->root->root_key.objectid,
1597                           node->inode_id, ret);
1598                 btrfs_delayed_item_release_metadata(dir->root, item);
1599                 btrfs_release_delayed_item(item);
1600         }
1601         mutex_unlock(&node->mutex);
1602 end:
1603         btrfs_release_delayed_node(node);
1604         return ret;
1605 }
1606
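/*
 * Copy the cached dir index counter from the delayed node into the in-memory
 * inode. Returns -ENOENT if the inode has no delayed node and -EINVAL if the
 * delayed node does not have index_cnt set.
 */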
1607 int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
1608 {
1609         struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1610
1611         if (!delayed_node)
1612                 return -ENOENT;
1613
1614         /*
1615          * Since we hold the i_mutex of this directory, no new directory
1616          * index can be added to the delayed node and index_cnt can not be
1617          * updated now, so we don't need to lock the delayed node.
1618          */
1619         if (!delayed_node->index_cnt) {
1620                 btrfs_release_delayed_node(delayed_node);
1621                 return -EINVAL;
1622         }
1623
1624         inode->index_cnt = delayed_node->index_cnt;
1625         btrfs_release_delayed_node(delayed_node);
1626         return 0;
1627 }
1628
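/*
 * Collect the pending insertion and deletion items of the directory for a
 * readdir, taking an extra reference on each item. The shared inode lock held
 * by the caller is dropped and re-taken in exclusive mode, because only one
 * readdir can walk the delayed items at a time (they are linked through
 * item->readdir_list); btrfs_readdir_put_delayed_items() downgrades it back.
 *
 * Returns false if the directory has no delayed node, true otherwise.
 */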
1629 bool btrfs_readdir_get_delayed_items(struct inode *inode,
1630                                      struct list_head *ins_list,
1631                                      struct list_head *del_list)
1632 {
1633         struct btrfs_delayed_node *delayed_node;
1634         struct btrfs_delayed_item *item;
1635
1636         delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
1637         if (!delayed_node)
1638                 return false;
1639
1640         /*
1641          * We can only do one readdir with delayed items at a time because of
1642          * item->readdir_list.
1643          */
1644         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
1645         btrfs_inode_lock(inode, 0);
1646
1647         mutex_lock(&delayed_node->mutex);
1648         item = __btrfs_first_delayed_insertion_item(delayed_node);
1649         while (item) {
1650                 refcount_inc(&item->refs);
1651                 list_add_tail(&item->readdir_list, ins_list);
1652                 item = __btrfs_next_delayed_item(item);
1653         }
1654
1655         item = __btrfs_first_delayed_deletion_item(delayed_node);
1656         while (item) {
1657                 refcount_inc(&item->refs);
1658                 list_add_tail(&item->readdir_list, del_list);
1659                 item = __btrfs_next_delayed_item(item);
1660         }
1661         mutex_unlock(&delayed_node->mutex);
1662         /*
1663          * This delayed node is still cached in the btrfs inode, so refs
1664          * must be > 1 now, and we don't need to check whether it is going
1665          * to be freed or not.
1666          *
1667          * Besides that, this function is used for readdir and we do not
1668          * insert/delete delayed items during that period, so we also don't
1669          * need to requeue or dequeue this delayed node.
1670          */
1671         refcount_dec(&delayed_node->refs);
1672
1673         return true;
1674 }
1675
1676 void btrfs_readdir_put_delayed_items(struct inode *inode,
1677                                      struct list_head *ins_list,
1678                                      struct list_head *del_list)
1679 {
1680         struct btrfs_delayed_item *curr, *next;
1681
1682         list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1683                 list_del(&curr->readdir_list);
1684                 if (refcount_dec_and_test(&curr->refs))
1685                         kfree(curr);
1686         }
1687
1688         list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1689                 list_del(&curr->readdir_list);
1690                 if (refcount_dec_and_test(&curr->refs))
1691                         kfree(curr);
1692         }
1693
1694         /*
1695          * The VFS is going to do up_read(), so we need to downgrade back to a
1696          * read lock.
1697          */
1698         downgrade_write(&inode->i_rwsem);
1699 }
1700
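/*
 * Return 1 if the given dir index is queued for deletion in @del_list, 0
 * otherwise. The list is sorted by index, so the scan stops as soon as an
 * entry with a higher index is found. This lets readdir skip entries that
 * are about to be removed.
 */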
1701 int btrfs_should_delete_dir_index(struct list_head *del_list,
1702                                   u64 index)
1703 {
1704         struct btrfs_delayed_item *curr;
1705         int ret = 0;
1706
1707         list_for_each_entry(curr, del_list, readdir_list) {
1708                 if (curr->index > index)
1709                         break;
1710                 if (curr->index == index) {
1711                         ret = 1;
1712                         break;
1713                 }
1714         }
1715         return ret;
1716 }
1717
1718 /*
1719  * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
1720  * and emit the directory entries through the given dir context.
1721  */
1722 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
1723                                     struct list_head *ins_list)
1724 {
1725         struct btrfs_dir_item *di;
1726         struct btrfs_delayed_item *curr, *next;
1727         struct btrfs_key location;
1728         char *name;
1729         int name_len;
1730         int over = 0;
1731         unsigned char d_type;
1732
1733         if (list_empty(ins_list))
1734                 return 0;
1735
1736         /*
1737          * The data of a delayed item can not change, so we don't need to
1738          * lock the items. And since we hold the i_mutex of the directory,
1739          * nobody can delete any directory index now.
1740          */
1741         list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1742                 list_del(&curr->readdir_list);
1743
1744                 if (curr->index < ctx->pos) {
1745                         if (refcount_dec_and_test(&curr->refs))
1746                                 kfree(curr);
1747                         continue;
1748                 }
1749
1750                 ctx->pos = curr->index;
1751
1752                 di = (struct btrfs_dir_item *)curr->data;
1753                 name = (char *)(di + 1);
1754                 name_len = btrfs_stack_dir_name_len(di);
1755
1756                 d_type = fs_ftype_to_dtype(di->type);
1757                 btrfs_disk_key_to_cpu(&location, &di->location);
1758
1759                 over = !dir_emit(ctx, name, name_len,
1760                                location.objectid, d_type);
1761
1762                 if (refcount_dec_and_test(&curr->refs))
1763                         kfree(curr);
1764
1765                 if (over)
1766                         return 1;
1767                 ctx->pos++;
1768         }
1769         return 0;
1770 }
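
/*
 * A rough sketch of how the readdir helpers above fit together in a readdir
 * implementation (simplified; locking, return codes and error handling
 * omitted, roughly mirroring the readdir path in inode.c):
 *
 *      bool put;
 *      LIST_HEAD(ins_list);
 *      LIST_HEAD(del_list);
 *
 *      put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
 *      // walk the on-disk dir index items and, for each index found:
 *      //      if (btrfs_should_delete_dir_index(&del_list, index))
 *      //              skip it, it is queued for deletion
 *      // then emit the entries that only exist as delayed items:
 *      btrfs_readdir_delayed_dir_index(ctx, &ins_list);
 *      if (put)
 *              btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
 */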
1771
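/*
 * Copy the current in-memory inode state into the given btrfs_inode_item
 * (stored in a delayed node) using the stack item accessors, so it can later
 * be written to the fs tree when the delayed node is run.
 */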
1772 static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1773                                   struct btrfs_inode_item *inode_item,
1774                                   struct inode *inode)
1775 {
1776         u64 flags;
1777
1778         btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
1779         btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
1780         btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
1781         btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
1782         btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
1783         btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1784         btrfs_set_stack_inode_generation(inode_item,
1785                                          BTRFS_I(inode)->generation);
1786         btrfs_set_stack_inode_sequence(inode_item,
1787                                        inode_peek_iversion(inode));
1788         btrfs_set_stack_inode_transid(inode_item, trans->transid);
1789         btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1790         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
1791                                           BTRFS_I(inode)->ro_flags);
1792         btrfs_set_stack_inode_flags(inode_item, flags);
1793         btrfs_set_stack_inode_block_group(inode_item, 0);
1794
1795         btrfs_set_stack_timespec_sec(&inode_item->atime,
1796                                      inode->i_atime.tv_sec);
1797         btrfs_set_stack_timespec_nsec(&inode_item->atime,
1798                                       inode->i_atime.tv_nsec);
1799
1800         btrfs_set_stack_timespec_sec(&inode_item->mtime,
1801                                      inode->i_mtime.tv_sec);
1802         btrfs_set_stack_timespec_nsec(&inode_item->mtime,
1803                                       inode->i_mtime.tv_nsec);
1804
1805         btrfs_set_stack_timespec_sec(&inode_item->ctime,
1806                                      inode->i_ctime.tv_sec);
1807         btrfs_set_stack_timespec_nsec(&inode_item->ctime,
1808                                       inode->i_ctime.tv_nsec);
1809
1810         btrfs_set_stack_timespec_sec(&inode_item->otime,
1811                                      BTRFS_I(inode)->i_otime.tv_sec);
1812         btrfs_set_stack_timespec_nsec(&inode_item->otime,
1813                                      BTRFS_I(inode)->i_otime.tv_nsec);
1814 }
1815
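/*
 * Fill the VFS inode from the inode item cached in its delayed node, if that
 * item is dirty (i.e. a delayed inode update is pending). Returns -ENOENT if
 * there is no delayed node or no pending inode update cached in it.
 */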
1816 int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1817 {
1818         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
1819         struct btrfs_delayed_node *delayed_node;
1820         struct btrfs_inode_item *inode_item;
1821
1822         delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
1823         if (!delayed_node)
1824                 return -ENOENT;
1825
1826         mutex_lock(&delayed_node->mutex);
1827         if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1828                 mutex_unlock(&delayed_node->mutex);
1829                 btrfs_release_delayed_node(delayed_node);
1830                 return -ENOENT;
1831         }
1832
1833         inode_item = &delayed_node->inode_item;
1834
1835         i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
1836         i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
1837         btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
1838         btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
1839                         round_up(i_size_read(inode), fs_info->sectorsize));
1840         inode->i_mode = btrfs_stack_inode_mode(inode_item);
1841         set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1842         inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1843         BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1844         BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
1845
1846         inode_set_iversion_queried(inode,
1847                                    btrfs_stack_inode_sequence(inode_item));
1848         inode->i_rdev = 0;
1849         *rdev = btrfs_stack_inode_rdev(inode_item);
1850         btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
1851                                 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
1852
1853         inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
1854         inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
1855
1856         inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
1857         inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
1858
1859         inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
1860         inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
1861
1862         BTRFS_I(inode)->i_otime.tv_sec =
1863                 btrfs_stack_timespec_sec(&inode_item->otime);
1864         BTRFS_I(inode)->i_otime.tv_nsec =
1865                 btrfs_stack_timespec_nsec(&inode_item->otime);
1866
1867         inode->i_generation = BTRFS_I(inode)->generation;
1868         BTRFS_I(inode)->index_cnt = (u64)-1;
1869
1870         mutex_unlock(&delayed_node->mutex);
1871         btrfs_release_delayed_node(delayed_node);
1872         return 0;
1873 }
1874
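/*
 * Record an inode update in the delayed node: copy the current inode state
 * into the node's inode item and mark it dirty. Metadata is reserved only the
 * first time the node's inode item becomes dirty; later updates just refresh
 * the cached item.
 */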
1875 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1876                                struct btrfs_root *root,
1877                                struct btrfs_inode *inode)
1878 {
1879         struct btrfs_delayed_node *delayed_node;
1880         int ret = 0;
1881
1882         delayed_node = btrfs_get_or_create_delayed_node(inode);
1883         if (IS_ERR(delayed_node))
1884                 return PTR_ERR(delayed_node);
1885
1886         mutex_lock(&delayed_node->mutex);
1887         if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1888                 fill_stack_inode_item(trans, &delayed_node->inode_item,
1889                                       &inode->vfs_inode);
1890                 goto release_node;
1891         }
1892
1893         ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
1894         if (ret)
1895                 goto release_node;
1896
1897         fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
1898         set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
1899         delayed_node->count++;
1900         atomic_inc(&root->fs_info->delayed_root->items);
1901 release_node:
1902         mutex_unlock(&delayed_node->mutex);
1903         btrfs_release_delayed_node(delayed_node);
1904         return ret;
1905 }
1906
1907 int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode)
1908 {
1909         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1910         struct btrfs_delayed_node *delayed_node;
1911
1912         /*
1913          * We don't do delayed inode updates during log recovery because it
1914          * leads to enospc problems.  This means we also can't do delayed
1915          * inode refs.
1916          */
1917         if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
1918                 return -EAGAIN;
1919
1920         delayed_node = btrfs_get_or_create_delayed_node(inode);
1921         if (IS_ERR(delayed_node))
1922                 return PTR_ERR(delayed_node);
1923
1924         /*
1925          * We don't reserve space for the inode ref deletion because:
1926          * - We ONLY do async inode ref deletion for an inode that has only
1927          *   one link (i_nlink == 1), which means there is only one inode ref.
1928          *   And in most cases, the inode ref and the inode item are in the
1929          *   same leaf, and we will deal with them at the same time.
1930          *   Since we are sure we will reserve the space for the inode item,
1931          *   it is unnecessary to reserve space for the inode ref deletion.
1932          * - If the inode ref and the inode item are not in the same leaf,
1933          *   we also don't need to worry about an enospc problem, because we
1934          *   reserve much more space for the inode update than it needs.
1935          * - At the worst, we can steal some space from the global reservation.
1936          *   It is very rare.
1937          */
1938         mutex_lock(&delayed_node->mutex);
1939         if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1940                 goto release_node;
1941
1942         set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1943         delayed_node->count++;
1944         atomic_inc(&fs_info->delayed_root->items);
1945 release_node:
1946         mutex_unlock(&delayed_node->mutex);
1947         btrfs_release_delayed_node(delayed_node);
1948         return 0;
1949 }
1950
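/*
 * Drop everything attached to a delayed node without running it: all pending
 * insertion and deletion items, the leaf space reserved for dir index
 * insertions, a pending inode ref deletion and a dirty inode item, releasing
 * the metadata reservations that were made for them.
 */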
1951 static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1952 {
1953         struct btrfs_root *root = delayed_node->root;
1954         struct btrfs_fs_info *fs_info = root->fs_info;
1955         struct btrfs_delayed_item *curr_item, *prev_item;
1956
1957         mutex_lock(&delayed_node->mutex);
1958         curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
1959         while (curr_item) {
1960                 prev_item = curr_item;
1961                 curr_item = __btrfs_next_delayed_item(prev_item);
1962                 btrfs_release_delayed_item(prev_item);
1963         }
1964
1965         if (delayed_node->index_item_leaves > 0) {
1966                 btrfs_delayed_item_release_leaves(delayed_node,
1967                                           delayed_node->index_item_leaves);
1968                 delayed_node->index_item_leaves = 0;
1969         }
1970
1971         curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
1972         while (curr_item) {
1973                 btrfs_delayed_item_release_metadata(root, curr_item);
1974                 prev_item = curr_item;
1975                 curr_item = __btrfs_next_delayed_item(prev_item);
1976                 btrfs_release_delayed_item(prev_item);
1977         }
1978
1979         btrfs_release_delayed_iref(delayed_node);
1980
1981         if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1982                 btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
1983                 btrfs_release_delayed_inode(delayed_node);
1984         }
1985         mutex_unlock(&delayed_node->mutex);
1986 }
1987
1988 void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
1989 {
1990         struct btrfs_delayed_node *delayed_node;
1991
1992         delayed_node = btrfs_get_delayed_node(inode);
1993         if (!delayed_node)
1994                 return;
1995
1996         __btrfs_kill_delayed_node(delayed_node);
1997         btrfs_release_delayed_node(delayed_node);
1998 }
1999
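/*
 * Drop the delayed items of every delayed node of the given root. The radix
 * tree is scanned in batches of 8 nodes, and nodes whose refcount already hit
 * zero are skipped since they are about to be removed from the tree anyway.
 */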
2000 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
2001 {
2002         u64 inode_id = 0;
2003         struct btrfs_delayed_node *delayed_nodes[8];
2004         int i, n;
2005
2006         while (1) {
2007                 spin_lock(&root->inode_lock);
2008                 n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
2009                                            (void **)delayed_nodes, inode_id,
2010                                            ARRAY_SIZE(delayed_nodes));
2011                 if (!n) {
2012                         spin_unlock(&root->inode_lock);
2013                         break;
2014                 }
2015
2016                 inode_id = delayed_nodes[n - 1]->inode_id + 1;
2017                 for (i = 0; i < n; i++) {
2018                         /*
2019                          * Don't increase refs in case the node is dead and
2020                          * about to be removed from the tree in the loop below
2021                          */
2022                         if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
2023                                 delayed_nodes[i] = NULL;
2024                 }
2025                 spin_unlock(&root->inode_lock);
2026
2027                 for (i = 0; i < n; i++) {
2028                         if (!delayed_nodes[i])
2029                                 continue;
2030                         __btrfs_kill_delayed_node(delayed_nodes[i]);
2031                         btrfs_release_delayed_node(delayed_nodes[i]);
2032                 }
2033         }
2034 }
2035
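/*
 * Drop all delayed nodes that are still queued on the delayed root. This is
 * meant for error handling / cleanup paths where the delayed items will never
 * be run.
 */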
2036 void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info)
2037 {
2038         struct btrfs_delayed_node *curr_node, *prev_node;
2039
2040         curr_node = btrfs_first_delayed_node(fs_info->delayed_root);
2041         while (curr_node) {
2042                 __btrfs_kill_delayed_node(curr_node);
2043
2044                 prev_node = curr_node;
2045                 curr_node = btrfs_next_delayed_node(curr_node);
2046                 btrfs_release_delayed_node(prev_node);
2047         }
2048 }
2049
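/*
 * Collect the directory's pending insertion and deletion items into the given
 * lists for logging, taking an extra reference on each item. Items that are
 * already on a log list (because another task is logging the same directory)
 * are skipped; see the comment in the loop below.
 */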
2050 void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
2051                                  struct list_head *ins_list,
2052                                  struct list_head *del_list)
2053 {
2054         struct btrfs_delayed_node *node;
2055         struct btrfs_delayed_item *item;
2056
2057         node = btrfs_get_delayed_node(inode);
2058         if (!node)
2059                 return;
2060
2061         mutex_lock(&node->mutex);
2062         item = __btrfs_first_delayed_insertion_item(node);
2063         while (item) {
2064                 /*
2065                  * It's possible that the item is already in a log list. This
2066                  * can happen in case two tasks are trying to log the same
2067                  * directory. For example if we have tasks A and task B:
2068                  *
2069                  * Task A collected the delayed items into a log list while
2070                  * under the inode's log_mutex (at btrfs_log_inode()), but it
2071                  * only releases the items after logging the inodes they point
2072                  * to (if they are new inodes), which happens after unlocking
2073                  * the log mutex;
2074                  *
2075                  * Task B enters btrfs_log_inode() and acquires the log_mutex
2076                  * of the same directory inode, before task B releases the
2077                  * delayed items. This can happen for example when logging some
2078                  * inode we need to trigger logging of its parent directory, so
2079                  * logging two files that have the same parent directory can
2080                  * lead to this.
2081                  *
2082                  * If this happens, just ignore delayed items already in a log
2083                  * list. All the tasks logging the directory are under a log
2084                  * transaction and whichever finishes first can not sync the log
2085                  * before the other completes and leaves the log transaction.
2086                  */
2087                 if (!item->logged && list_empty(&item->log_list)) {
2088                         refcount_inc(&item->refs);
2089                         list_add_tail(&item->log_list, ins_list);
2090                 }
2091                 item = __btrfs_next_delayed_item(item);
2092         }
2093
2094         item = __btrfs_first_delayed_deletion_item(node);
2095         while (item) {
2096                 /* The log list may be non-empty, for the same reason mentioned above. */
2097                 if (!item->logged && list_empty(&item->log_list)) {
2098                         refcount_inc(&item->refs);
2099                         list_add_tail(&item->log_list, del_list);
2100                 }
2101                 item = __btrfs_next_delayed_item(item);
2102         }
2103         mutex_unlock(&node->mutex);
2104
2105         /*
2106          * We are called during inode logging, which means the inode is in use
2107          * and can not be evicted before we finish logging the inode. So we never
2108          * have the last reference on the delayed inode.
2109          * Also, we don't use btrfs_release_delayed_node() because that would
2110          * requeue the delayed inode (change its order in the list of prepared
2111          * nodes) and we don't want to do such change because we don't create or
2112          * delete delayed items.
2113          */
2114         ASSERT(refcount_read(&node->refs) > 1);
2115         refcount_dec(&node->refs);
2116 }
2117
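/*
 * Mark the items collected by btrfs_log_get_delayed_items() as logged and
 * drop the references that were taken on them.
 */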
2118 void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
2119                                  struct list_head *ins_list,
2120                                  struct list_head *del_list)
2121 {
2122         struct btrfs_delayed_node *node;
2123         struct btrfs_delayed_item *item;
2124         struct btrfs_delayed_item *next;
2125
2126         node = btrfs_get_delayed_node(inode);
2127         if (!node)
2128                 return;
2129
2130         mutex_lock(&node->mutex);
2131
2132         list_for_each_entry_safe(item, next, ins_list, log_list) {
2133                 item->logged = true;
2134                 list_del_init(&item->log_list);
2135                 if (refcount_dec_and_test(&item->refs))
2136                         kfree(item);
2137         }
2138
2139         list_for_each_entry_safe(item, next, del_list, log_list) {
2140                 item->logged = true;
2141                 list_del_init(&item->log_list);
2142                 if (refcount_dec_and_test(&item->refs))
2143                         kfree(item);
2144         }
2145
2146         mutex_unlock(&node->mutex);
2147
2148         /*
2149          * We are called during inode logging, which means the inode is in use
2150          * and can not be evicted before we finish logging the inode. So we never
2151          * have the last reference on the delayed inode.
2152          * Also, we don't use btrfs_release_delayed_node() because that would
2153          * requeue the delayed inode (change its order in the list of prepared
2154          * nodes) and we don't want to do such change because we don't create or
2155          * delete delayed items.
2156          */
2157         ASSERT(refcount_read(&node->refs) > 1);
2158         refcount_dec(&node->refs);
2159 }