xfs: fix low space alloc deadlock

author Dave Chinner <dchinner@redhat.com>

Fri, 10 Feb 2023 17:07:06 +0000 (04:07 +1100)

committer Dave Chinner <dchinner@redhat.com>

Fri, 10 Feb 2023 17:07:06 +0000 (04:07 +1100)
author Dave Chinner <dchinner@redhat.com>
Fri, 10 Feb 2023 17:07:06 +0000 (04:07 +1100)
committer Dave Chinner <dchinner@redhat.com>
Fri, 10 Feb 2023 17:07:06 +0000 (04:07 +1100)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c

index f8ff81c..ffe6345 100644 (file)
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3164,10 +3164,13 @@ xfs_alloc_vextent(
         xfs_alloctype_t         type;   /* input allocation type */
         int                     bump_rotor = 0;
         xfs_agnumber_t          rotorstep = xfs_rotorstep; /* inode32 agf stepper */
+       xfs_agnumber_t          minimum_agno = 0;
  
         mp = args->mp;
         type = args->otype = args->type;
         args->agbno = NULLAGBLOCK;
+       if (args->tp->t_firstblock != NULLFSBLOCK)
+               minimum_agno = XFS_FSB_TO_AGNO(mp, args->tp->t_firstblock);
         /*
          * Just fix this up, for the case where the last a.g. is shorter
          * (or there's only one a.g.) and the caller couldn't easily figure
@@ -3201,6 +3204,13 @@ xfs_alloc_vextent(
                  */
                 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
                 args->pag = xfs_perag_get(mp, args->agno);
+
+               if (minimum_agno > args->agno) {
+                       trace_xfs_alloc_vextent_skip_deadlock(args);
+                       error = 0;
+                       break;
+               }
+
                 error = xfs_alloc_fix_freelist(args, 0);
                 if (error) {
                         trace_xfs_alloc_vextent_nofix(args);
@@ -3232,6 +3242,8 @@ xfs_alloc_vextent(
         case XFS_ALLOCTYPE_FIRST_AG:
                 /*
                  * Rotate through the allocation groups looking for a winner.
+                * If we are blocking, we must obey minimum_agno contraints for
+                * avoiding ABBA deadlocks on AGF locking.
                  */
                 if (type == XFS_ALLOCTYPE_FIRST_AG) {
                         /*
@@ -3239,7 +3251,7 @@ xfs_alloc_vextent(
                          */
                         args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
                         args->type = XFS_ALLOCTYPE_THIS_AG;
-                       sagno = 0;
+                       sagno = minimum_agno;
                         flags = 0;
                 } else {
                         /*
@@ -3248,6 +3260,7 @@ xfs_alloc_vextent(
                         args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
                         flags = XFS_ALLOC_FLAG_TRYLOCK;
                 }
+
                 /*
                  * Loop over allocation groups twice; first time with
                  * trylock set, second time without.
@@ -3276,19 +3289,21 @@ xfs_alloc_vextent(
                         if (args->agno == sagno &&
                             type == XFS_ALLOCTYPE_START_BNO)
                                 args->type = XFS_ALLOCTYPE_THIS_AG;
+
                         /*
-                       * For the first allocation, we can try any AG to get
-                       * space.  However, if we already have allocated a
-                       * block, we don't want to try AGs whose number is below
-                       * sagno. Otherwise, we may end up with out-of-order
-                       * locking of AGF, which might cause deadlock.
-                       */
+                        * If we are try-locking, we can't deadlock on AGF
+                        * locks, so we can wrap all the way back to the first
+                        * AG. Otherwise, wrap back to the start AG so we can't
+                        * deadlock, and let the end of scan handler decide what
+                        * to do next.
+                        */
                         if (++(args->agno) == mp->m_sb.sb_agcount) {
-                               if (args->tp->t_firstblock != NULLFSBLOCK)
-                                       args->agno = sagno;
-                               else
+                               if (flags & XFS_ALLOC_FLAG_TRYLOCK)
                                         args->agno = 0;
+                               else
+                                       args->agno = sagno;
                         }
+
                         /*
                          * Reached the starting a.g., must either be done
                          * or switch to non-trylock mode.
@@ -3300,7 +3315,14 @@ xfs_alloc_vextent(
                                         break;
                                 }
  
+                               /*
+                                * Blocking pass next, so we must obey minimum
+                                * agno constraints to avoid ABBA AGF deadlocks.
+                                */
                                 flags = 0;
+                               if (minimum_agno > sagno)
+                                       sagno = minimum_agno;
+
                                 if (type == XFS_ALLOCTYPE_START_BNO) {
                                         args->agbno = XFS_FSB_TO_AGBNO(mp,
                                                 args->fsbno);
@@ -3322,9 +3344,9 @@ xfs_alloc_vextent(
                 ASSERT(0);
                 /* NOTREACHED */
         }
-       if (args->agbno == NULLAGBLOCK)
+       if (args->agbno == NULLAGBLOCK) {
                 args->fsbno = NULLFSBLOCK;
-       else {
+       } else {
                 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
  #ifdef DEBUG
                 ASSERT(args->len >= args->minlen);
@@ -3335,6 +3357,29 @@ xfs_alloc_vextent(
  #endif
  
         }
+
+       /*
+        * We end up here with a locked AGF. If we failed, the caller is likely
+        * going to try to allocate again with different parameters, and that
+        * can widen the AGs that are searched for free space. If we have to do
+        * BMBT block allocation, we have to do a new allocation.
+        *
+        * Hence leaving this function with the AGF locked opens up potential
+        * ABBA AGF deadlocks because a future allocation attempt in this
+        * transaction may attempt to lock a lower number AGF.
+        *
+        * We can't release the AGF until the transaction is commited, so at
+        * this point we must update the "firstblock" tracker to point at this
+        * AG if the tracker is empty or points to a lower AG. This allows the
+        * next allocation attempt to be modified appropriately to avoid
+        * deadlocks.
+        */
+       if (args->agbp &&
+           (args->tp->t_firstblock == NULLFSBLOCK ||
+            args->pag->pag_agno > minimum_agno)) {
+               args->tp->t_firstblock = XFS_AGB_TO_FSB(mp,
+                                       args->pag->pag_agno, 0);
+       }
         xfs_perag_put(args->pag);
         return 0;
  error0:
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c

index c8c6538..de6d585 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3413,21 +3413,7 @@ xfs_bmap_process_allocated_extent(
         xfs_fileoff_t           orig_offset,
         xfs_extlen_t            orig_length)
  {
-       int                     nullfb;
-
-       nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
-
-       /*
-        * check the allocation happened at the same or higher AG than
-        * the first block that was allocated.
-        */
-       ASSERT(nullfb ||
-               XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
-               XFS_FSB_TO_AGNO(args->mp, args->fsbno));
-
         ap->blkno = args->fsbno;
-       if (nullfb)
-               ap->tp->t_firstblock = args->fsbno;
         ap->length = args->len;
         /*
          * If the extent size hint is active, we tried to round the
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index 6b0e9ae..adddaeb 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1877,6 +1877,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
  DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
  DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
  DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_skip_deadlock);
  DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
  DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
  DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
author	Dave Chinner <dchinner@redhat.com>
	Fri, 10 Feb 2023 17:07:06 +0000 (04:07 +1100)
committer	Dave Chinner <dchinner@redhat.com>
	Fri, 10 Feb 2023 17:07:06 +0000 (04:07 +1100)
fs/xfs/libxfs/xfs_alloc.c		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.c		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history