// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"

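/*
 * Busy extents are extents that have been freed in a transaction that has
 * not yet been committed to stable storage.  Each AG tracks its busy
 * extents in an rbtree (pag->pagb_tree) indexed by starting block number,
 * and each transaction or CIL context keeps an immutable list of the
 * entries it owns.  Reusing a busy extent for user data requires forcing
 * the log, while metadata allocations may instead trim or remove the busy
 * range (see xfs_extent_busy_reuse() below).
 */
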
static void
xfs_extent_busy_insert_list(
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags,
        struct list_head        *busy_list)
{
        struct xfs_extent_busy  *new;
        struct xfs_extent_busy  *busyp;
        struct rb_node          **rbp;
        struct rb_node          *parent = NULL;

        new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
        new->agno = pag->pag_agno;
        new->bno = bno;
        new->length = len;
        INIT_LIST_HEAD(&new->list);
        new->flags = flags;

        /* Trace before the insert so that failed inserts are visible. */
        trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);

        spin_lock(&pag->pagb_lock);
        rbp = &pag->pagb_tree.rb_node;
        while (*rbp) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

                if (new->bno < busyp->bno) {
                        rbp = &(*rbp)->rb_left;
                        ASSERT(new->bno + new->length <= busyp->bno);
                } else if (new->bno > busyp->bno) {
                        rbp = &(*rbp)->rb_right;
                        ASSERT(bno >= busyp->bno + busyp->length);
                } else {
                        ASSERT(0);
                }
        }

        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &pag->pagb_tree);

        list_add(&new->list, busy_list);
        spin_unlock(&pag->pagb_lock);
}

void
xfs_extent_busy_insert(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags)
{
        xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
}

void
xfs_extent_busy_insert_discard(
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        struct list_head        *busy_list)
{
        xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
                        busy_list);
}

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The per-AG busy extent tree lock is taken internally, so the
 * caller must not already hold it.  This function returns 0 for no
 * overlapping busy extent, -1 for an overlapping but not exact busy extent,
 * and 1 for an exact match.  A non-zero return thus indicates an overlap
 * that will require a synchronous transaction, while still letting the
 * caller distinguish a partial from an exact match.
 */
int
xfs_extent_busy_search(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        struct rb_node          *rbp;
        struct xfs_extent_busy  *busyp;
        int                     match = 0;

        /* find closest start bno overlap */
        spin_lock(&pag->pagb_lock);
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
                if (bno < busyp->bno) {
                        /* may overlap, but exact start block is lower */
                        if (bno + len > busyp->bno)
                                match = -1;
                        rbp = rbp->rb_left;
                } else if (bno > busyp->bno) {
                        /* may overlap, but exact start block is higher */
                        if (bno < busyp->bno + busyp->length)
                                match = -1;
                        rbp = rbp->rb_right;
                } else {
                        /* bno matches busyp, length determines exact match */
                        match = (busyp->length == len) ? 1 : -1;
                        break;
                }
        }
        spin_unlock(&pag->pagb_lock);
        return match;
}
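
/*
 * Illustrative sketch, not part of the original source: how a caller might
 * decode the tristate return value of xfs_extent_busy_search().  The helper
 * and its name are hypothetical.
 */
static inline void
xfs_extent_busy_search_example(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        bool                    *need_sync,
        bool                    *exact)
{
        int                     match;

        match = xfs_extent_busy_search(mp, pag, bno, len);
        *need_sync = match != 0;        /* any overlap needs a sync trans */
        *exact = match > 0;             /* 1 means [bno, len] matched exactly */
}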

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata) __releases(&pag->pagb_lock)
                                          __acquires(&pag->pagb_lock)
{
        xfs_agblock_t           fend = fbno + flen;
        xfs_agblock_t           bbno = busyp->bno;
        xfs_agblock_t           bend = bbno + busyp->length;

        /*
         * This extent is currently being discarded.  Give the thread
         * performing the discard a chance to mark the extent unbusy
         * and retry.
         */
        if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
                spin_unlock(&pag->pagb_lock);
                delay(1);
                spin_lock(&pag->pagb_lock);
                return false;
        }

        /*
         * If there is a busy extent overlapping a user allocation, we have
         * no choice but to force the log and retry the search.
         *
         * Fortunately this does not happen during normal operation, but
         * only if the filesystem is very low on space and has to dip into
         * the AGFL for normal allocations.
         */
        if (userdata)
                goto out_force_log;

        if (bbno < fbno && bend > fend) {
                /*
                 * Case 1:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +---------+
                 *        fbno   fend
                 */

                /*
                 * We would have to split the busy extent to be able to track
                 * it correctly, which we cannot do because we would have to
                 * modify the list of busy extents attached to the transaction
                 * or CIL context, which is immutable.
                 *
                 * Force out the log to clear the busy extent and retry the
                 * search.
                 */
                goto out_force_log;
        } else if (bbno >= fbno && bend <= fend) {
                /*
                 * Case 2:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +-----------------+
                 *    fbno           fend
                 *
                 * Case 3:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 4:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 5:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +-----------------------------------+
                 *    fbno                             fend
                 *
                 */

                /*
                 * The busy extent is fully covered by the extent we are
                 * allocating, and can simply be removed from the rbtree.
                 * However we cannot remove it from the immutable list
                 * tracking busy extents in the transaction or CIL context,
                 * so set the length to zero to mark it invalid.
                 *
                 * We also need to restart the busy extent search from the
                 * tree root, because erasing the node can rearrange the
                 * tree topology.
                 */
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
                busyp->length = 0;
                return false;
        } else if (fend < bend) {
                /*
                 * Case 6:
                 *              bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *             +---------+
                 *             fbno   fend
                 *
                 * Case 7:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +------------------+
                 *    fbno            fend
                 *
                 */
                busyp->bno = fend;
                busyp->length = bend - fend;
        } else if (bbno < fbno) {
                /*
                 * Case 8:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +-------------+
                 *        fbno       fend
                 *
                 * Case 9:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +----------------------+
                 *        fbno                fend
                 */
                busyp->length = fbno - busyp->bno;
        } else {
                ASSERT(0);
        }

        trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
        return true;

out_force_log:
        spin_unlock(&pag->pagb_lock);
        xfs_log_force(mp, XFS_LOG_SYNC);
        trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
        spin_lock(&pag->pagb_lock);
        return false;
}

/*
 * For a given extent [fbno, flen], make sure we can reuse it safely by
 * trimming or removing any busy extents that overlap it.  Takes the per-AG
 * busy extent lock internally; the search restarts whenever an update has
 * to drop the lock or change the tree topology.
 */
void
xfs_extent_busy_reuse(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
{
        struct rb_node          *rbp;

        ASSERT(flen > 0);
        spin_lock(&pag->pagb_lock);
restart:
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fbno + flen <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
                                                  userdata))
                        goto restart;
        }
        spin_unlock(&pag->pagb_lock);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than
 * args->minlen on return, no suitable extent could be found, and the higher
 * level code needs to force out the log and retry the allocation.
 *
 * Returns true and stores the current busy generation for the AG in
 * *busy_gen if the extent was trimmed against a busy extent.  This value
 * can be used to wait for at least one of the currently busy extents to be
 * cleared.  Note that the busy list is not guaranteed to be empty after the
 * generation changes and waiters are woken; the state of a specific extent
 * must always be confirmed with another call to xfs_extent_busy_trim()
 * before it can be used.
 */
bool
xfs_extent_busy_trim(
        struct xfs_alloc_arg    *args,
        xfs_agblock_t           *bno,
        xfs_extlen_t            *len,
        unsigned                *busy_gen)
{
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        struct rb_node          *rbp;
        bool                    ret = false;

        ASSERT(*len > 0);

        spin_lock(&args->pag->pagb_lock);
        fbno = *bno;
        flen = *len;
        rbp = args->pag->pagb_tree.rb_node;
        while (rbp && flen >= args->minlen) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   fend = fbno + flen;
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fend <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (bbno <= fbno) {
                        /* start overlap */

                        /*
                         * Case 1:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +---------+
                         *        fbno   fend
                         *
                         * Case 2:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-------------+
                         *    fbno       fend
                         *
                         * Case 3:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +-------------+
                         *        fbno       fend
                         *
                         * Case 4:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-----------------+
                         *    fbno           fend
                         *
                         * No unbusy region in extent, return failure.
                         */
                        if (fend <= bend)
                                goto fail;

                        /*
                         * Case 5:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +----------------------+
                         *        fbno                fend
                         *
                         * Case 6:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *                       +-------+
                         *                       fbno fend
                         */
                        fbno = bend;
                } else if (bend >= fend) {
                        /* end overlap */

                        /*
                         * Case 7:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +------------------+
                         *    fbno            fend
                         *
                         * Case 8:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *    +-------+
                         *    fbno fend
                         */
                        fend = bbno;
                } else {
                        /* middle overlap */

                        /*
                         * Case 9:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +-----------------------------------+
                         *    fbno                             fend
                         *
                         * Can be trimmed to:
                         *    +-------+        OR         +-------+
                         *    fbno fend                   fbno fend
                         *
                         * Backward allocation leads to significant
                         * fragmentation of directories, which degrades
                         * directory performance, therefore we always want to
                         * choose the option that produces forward allocation
                         * patterns.
                         * Preferring the lower bno extent will make the next
                         * request use "fend" as the start of the next
                         * allocation; if the segment is no longer busy at
                         * that point, we'll get a contiguous allocation, but
                         * even if it is still busy, we will get a forward
                         * allocation.
                         * We try to avoid choosing the segment at "bend",
                         * because that can lead to the next allocation
                         * taking the segment at "fbno", which would be a
                         * backward allocation.  We only use the segment at
                         * "fbno" if it is much larger than the current
                         * requested size, because in that case there's a
                         * good chance subsequent allocations will be
                         * contiguous.
                         */
                        if (bbno - fbno >= args->maxlen) {
                                /* left candidate fits perfectly */
                                fend = bbno;
                        } else if (fend - bend >= args->maxlen * 4) {
                                /* right candidate has enough free space */
                                fbno = bend;
                        } else if (bbno - fbno >= args->minlen) {
                                /* left candidate fits the minimum requirement */
                                fend = bbno;
                        } else {
                                goto fail;
                        }
                }

                flen = fend - fbno;
        }
out:

        if (fbno != *bno || flen != *len) {
                trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
                                          fbno, flen);
                *bno = fbno;
                *len = flen;
                *busy_gen = args->pag->pagb_gen;
                ret = true;
        }
        spin_unlock(&args->pag->pagb_lock);
        return ret;
fail:
        /*
         * Return a zero extent length as a failure indication.  All callers
         * re-check whether the trimmed extent satisfies the minlen
         * requirement.
         */
        flen = 0;
        goto out;
}
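
/*
 * Illustrative sketch, not part of the original source: the trim-then-flush
 * pattern that the comment above xfs_extent_busy_trim() describes.  The
 * helper, its name, and the -ENOSPC convention are hypothetical; the real
 * retry loops live in the allocator paths in xfs_alloc.c.
 */
static inline int
xfs_extent_busy_trim_example(
        struct xfs_alloc_arg    *args,
        xfs_agblock_t           *bno,
        xfs_extlen_t            *len,
        uint32_t                alloc_flags)
{
        unsigned                busy_gen;
        bool                    busy;

        busy = xfs_extent_busy_trim(args, bno, len, &busy_gen);
        if (*len >= args->minlen)
                return 0;       /* the trimmed extent is still usable */
        if (!busy)
                return -ENOSPC; /* too small, and not because of busy extents */

        /*
         * Everything usable overlapped busy extents: wait for at least one
         * of them to clear (or be told to commit first), after which the
         * caller retries the search with the original [bno, len].
         */
        return xfs_extent_busy_flush(args->tp, args->pag, busy_gen,
                        alloc_flags);
}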

STATIC void
xfs_extent_busy_clear_one(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp)
{
        if (busyp->length) {
                trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
                                                busyp->length);
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
        }

        list_del_init(&busyp->list);
        kmem_free(busyp);
}

static void
xfs_extent_busy_put_pag(
        struct xfs_perag        *pag,
        bool                    wakeup)
                __releases(pag->pagb_lock)
{
        if (wakeup) {
                pag->pagb_gen++;
                wake_up_all(&pag->pagb_wait);
        }

        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, marking
 * them as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
        struct xfs_mount        *mp,
        struct list_head        *list,
        bool                    do_discard)
{
        struct xfs_extent_busy  *busyp, *n;
        struct xfs_perag        *pag = NULL;
        xfs_agnumber_t          agno = NULLAGNUMBER;
        bool                    wakeup = false;

        list_for_each_entry_safe(busyp, n, list, list) {
                if (busyp->agno != agno) {
                        if (pag)
                                xfs_extent_busy_put_pag(pag, wakeup);
                        agno = busyp->agno;
                        pag = xfs_perag_get(mp, agno);
                        spin_lock(&pag->pagb_lock);
                        wakeup = false;
                }

                if (do_discard && busyp->length &&
                    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
                        busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
                } else {
                        xfs_extent_busy_clear_one(mp, pag, busyp);
                        wakeup = true;
                }
        }

        if (pag)
                xfs_extent_busy_put_pag(pag, wakeup);
}
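
/*
 * Note (assumption based on the flags used above): extents marked
 * XFS_EXTENT_BUSY_DISCARDED stay in the rbtree until the discard I/O
 * completes; xfs_extent_busy_update_extent() backs off from them, and the
 * discard completion path is expected to call xfs_extent_busy_clear() again
 * with do_discard unset to free them for good.
 */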

/*
 * Flush out all busy extents for this AG.
 *
 * If the current transaction is holding busy extents, the caller may not want
 * to wait for committed busy extents to resolve. If we are being told just to
 * try a flush or progress has been made since we last skipped a busy extent,
 * return immediately to allow the caller to try again.
 *
 * If we are freeing extents, we might actually be holding the only free extents
 * in the transaction busy list and the log force won't resolve that situation.
 * In this case, we must return -EAGAIN to avoid a deadlock by informing the
 * caller it needs to commit the busy extents it holds before retrying the
 * extent free operation.
 */
int
xfs_extent_busy_flush(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        unsigned                busy_gen,
        uint32_t                alloc_flags)
{
        DEFINE_WAIT             (wait);
        int                     error;

        error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
        if (error)
                return error;

        /* Avoid deadlocks on uncommitted busy extents. */
        if (!list_empty(&tp->t_busy)) {
                if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
                        return 0;

                if (busy_gen != READ_ONCE(pag->pagb_gen))
                        return 0;

                if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
                        return -EAGAIN;
        }

        /* Wait for committed busy extents to resolve. */
        do {
                prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
                if (busy_gen != READ_ONCE(pag->pagb_gen))
                        break;
                schedule();
        } while (1);

        finish_wait(&pag->pagb_wait, &wait);
        return 0;
}

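/*
 * The sleepers above and below pair with xfs_extent_busy_put_pag(): clearing
 * any busy extent bumps pag->pagb_gen and wakes pag->pagb_wait, so
 * xfs_extent_busy_flush() only has to observe a generation change to know
 * that at least one busy extent has been resolved, while
 * xfs_extent_busy_wait_all() keeps sleeping until the whole per-AG tree is
 * empty.
 */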
void
xfs_extent_busy_wait_all(
        struct xfs_mount        *mp)
{
        struct xfs_perag        *pag;
        DEFINE_WAIT             (wait);
        xfs_agnumber_t          agno;

        for_each_perag(mp, agno, pag) {
                do {
                        prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
                        if (RB_EMPTY_ROOT(&pag->pagb_tree))
                                break;
                        schedule();
                } while (1);
                finish_wait(&pag->pagb_wait, &wait);
        }
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
        void                    *priv,
        const struct list_head  *l1,
        const struct list_head  *l2)
{
        struct xfs_extent_busy  *b1 =
                container_of(l1, struct xfs_extent_busy, list);
        struct xfs_extent_busy  *b2 =
                container_of(l2, struct xfs_extent_busy, list);
        s32 diff;

        diff = b1->agno - b2->agno;
        if (!diff)
                diff = b1->bno - b2->bno;
        return diff;
}
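
/*
 * This comparator is meant for list_sort(); the xfs_extent_busy_sort()
 * wrapper in xfs_extent_busy.h is believed to be the usual entry point:
 *
 *        list_sort(NULL, list, xfs_extent_busy_ag_cmp);
 *
 * Sorting by AG first lets xfs_extent_busy_clear() walk the list with a
 * single perag lookup and lock round trip per AG.
 */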