xfs: use a separate frextents counter for rt extent reservations
authorDarrick J. Wong <djwong@kernel.org>
Mon, 11 Apr 2022 20:49:42 +0000 (06:49 +1000)
committerDave Chinner <david@fromorbit.com>
Mon, 11 Apr 2022 20:49:42 +0000 (06:49 +1000)
As mentioned in the previous commit, the kernel misuses sb_frextents in
the incore mount to reflect both incore reservations made by running
transactions as well as the actual count of free rt extents on disk.
This results in the superblock being written to the log with an
underestimate of the number of rt extents that are marked free in the
rtbitmap.

Teaching XFS to recompute frextents after log recovery avoids
operational problems in the current mount, but it doesn't solve the
problem of us writing undercounted frextents which are then recovered by
an older kernel that doesn't have that fix.

Create an incore percpu counter to mirror the ondisk frextents.  This
new counter will track transaction reservations and the only time we
will touch the incore super counter (i.e. the one that gets logged) is
when those transactions commit updates to the rt bitmap.  This is in
contrast to the lazysbcount counters (e.g. fdblocks), where we know that
log recovery will always fix any incorrect counter that we log.
As a bonus, we only take m_sb_lock at transaction commit time.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
fs/xfs/libxfs/xfs_sb.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_super.c
fs/xfs/xfs_trans.c

index f4e84aa1d50a4b267d844d78613eda67eb6c61a3..8dd7186ef9df191bd3f4075781e5e448547bde2f 100644 (file)
@@ -911,6 +911,11 @@ xfs_log_sb(
         * reservations that have been taken out percpu counters. If we have an
         * unclean shutdown, this will be corrected by log recovery rebuilding
         * the counters from the AGF block counts.
+        *
+        * Do not update sb_frextents here because it is not part of the lazy
+        * sb counters, despite having a percpu counter. It is always kept
+        * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+        * and hence we don't need to update it here.
         */
        if (xfs_has_lazysbcount(mp)) {
                mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
index 68f74549fa2280612c19f0e05873e9b8372f66e3..a0d7aa7fbbff099f9d98976326604f812fc8c3a0 100644 (file)
@@ -349,10 +349,7 @@ xfs_fs_counts(
        cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
        cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
                                                xfs_fdblocks_unavailable(mp);
-
-       spin_lock(&mp->m_sb_lock);
-       cnt->freertx = mp->m_sb.sb_frextents;
-       spin_unlock(&mp->m_sb_lock);
+       cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
 }
 
 /*
index bffd6eb0b298eedb54f820014302dd1bd2bca765..5269354b1b69d432aa1f7cfb7852da3055dbc3b8 100644 (file)
@@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file(
        struct xfs_inode        *ip)
 {
        struct xfs_mount        *mp = ip->i_mount;
-       uint64_t                freertx;
 
        if (!XFS_IS_REALTIME_INODE(ip))
                return false;
 
-       freertx = READ_ONCE(mp->m_sb.sb_frextents);
-       return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
+       if (__percpu_counter_compare(&mp->m_frextents,
+                               mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+                               XFS_FDBLOCKS_BATCH) < 0)
+               return true;
+
+       return false;
 }
 #else
 # define xfs_inodegc_want_queue_rt_file(ip)    (false)
index 53e130f803b149239e8fa67ed8c5e06e73a113fe..0c0bcbd4949ddb85677f5a1393f46764f03430c9 100644 (file)
@@ -1110,24 +1110,33 @@ xfs_fs_writable(
        return true;
 }
 
+/* Adjust m_fdblocks or m_frextents. */
 int
-xfs_mod_fdblocks(
+xfs_mod_freecounter(
        struct xfs_mount        *mp,
+       struct percpu_counter   *counter,
        int64_t                 delta,
        bool                    rsvd)
 {
        int64_t                 lcounter;
        long long               res_used;
+       uint64_t                set_aside = 0;
        s32                     batch;
-       uint64_t                set_aside;
+       bool                    has_resv_pool;
+
+       ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+       has_resv_pool = (counter == &mp->m_fdblocks);
+       if (rsvd)
+               ASSERT(has_resv_pool);
 
        if (delta > 0) {
                /*
                 * If the reserve pool is depleted, put blocks back into it
                 * first. Most of the time the pool is full.
                 */
-               if (likely(mp->m_resblks == mp->m_resblks_avail)) {
-                       percpu_counter_add(&mp->m_fdblocks, delta);
+               if (likely(!has_resv_pool ||
+                          mp->m_resblks == mp->m_resblks_avail)) {
+                       percpu_counter_add(counter, delta);
                        return 0;
                }
 
@@ -1139,7 +1148,7 @@ xfs_mod_fdblocks(
                } else {
                        delta -= res_used;
                        mp->m_resblks_avail = mp->m_resblks;
-                       percpu_counter_add(&mp->m_fdblocks, delta);
+                       percpu_counter_add(counter, delta);
                }
                spin_unlock(&mp->m_sb_lock);
                return 0;
@@ -1153,7 +1162,7 @@ xfs_mod_fdblocks(
         * then make everything serialise as we are real close to
         * ENOSPC.
         */
-       if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+       if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
                                     XFS_FDBLOCKS_BATCH) < 0)
                batch = 1;
        else
@@ -1170,9 +1179,10 @@ xfs_mod_fdblocks(
         * problems (i.e. transaction abort, pagecache discards, etc.) than
         * slightly premature -ENOSPC.
         */
-       set_aside = xfs_fdblocks_unavailable(mp);
-       percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
-       if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+       if (has_resv_pool)
+               set_aside = xfs_fdblocks_unavailable(mp);
+       percpu_counter_add_batch(counter, delta, batch);
+       if (__percpu_counter_compare(counter, set_aside,
                                     XFS_FDBLOCKS_BATCH) >= 0) {
                /* we had space! */
                return 0;
@@ -1183,8 +1193,8 @@ xfs_mod_fdblocks(
         * that took us to ENOSPC.
         */
        spin_lock(&mp->m_sb_lock);
-       percpu_counter_add(&mp->m_fdblocks, -delta);
-       if (!rsvd)
+       percpu_counter_add(counter, -delta);
+       if (!has_resv_pool || !rsvd)
                goto fdblocks_enospc;
 
        lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1201,24 +1211,6 @@ fdblocks_enospc:
        return -ENOSPC;
 }
 
-int
-xfs_mod_frextents(
-       struct xfs_mount        *mp,
-       int64_t                 delta)
-{
-       int64_t                 lcounter;
-       int                     ret = 0;
-
-       spin_lock(&mp->m_sb_lock);
-       lcounter = mp->m_sb.sb_frextents + delta;
-       if (lcounter < 0)
-               ret = -ENOSPC;
-       else
-               mp->m_sb.sb_frextents = lcounter;
-       spin_unlock(&mp->m_sb_lock);
-       return ret;
-}
-
 /*
  * Used to free the superblock along various error paths.
  */
index f6dc19de8322a6f104672bb61d53b8983e9b8566..a6b8efb2df5291dfab6602cd7a6b889553b03ee0 100644 (file)
@@ -183,6 +183,8 @@ typedef struct xfs_mount {
        struct percpu_counter   m_icount;       /* allocated inodes counter */
        struct percpu_counter   m_ifree;        /* free inodes counter */
        struct percpu_counter   m_fdblocks;     /* free block counter */
+       struct percpu_counter   m_frextents;    /* free rt extent counter */
+
        /*
         * Count of data device blocks reserved for delayed allocations,
         * including indlen blocks.  Does not include allocated CoW staging
@@ -494,9 +496,20 @@ xfs_fdblocks_unavailable(
        return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
 }
 
-extern int     xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
-                                bool reserved);
-extern int     xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+               int64_t delta, bool rsvd);
+
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+       return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+       return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+}
 
 extern int     xfs_readsb(xfs_mount_t *, int);
 extern void    xfs_freesb(xfs_mount_t *);
index 76f50e75f99c99445d36a1c8ba017ecc205b35f0..997e4a9d27d3f7a13875254a5ff706904ea3499f 100644 (file)
@@ -1318,6 +1318,7 @@ xfs_rtalloc_reinit_frextents(
        spin_lock(&mp->m_sb_lock);
        mp->m_sb.sb_frextents = val;
        spin_unlock(&mp->m_sb_lock);
+       percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
        return 0;
 }
 
index 54be9d64093edacfaea509ff2d1e73cf61a58493..3a508864629448d91adc7584d84d3ae169429e54 100644 (file)
@@ -843,9 +843,11 @@ xfs_fs_statfs(
 
        if (XFS_IS_REALTIME_MOUNT(mp) &&
            (ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+               s64     freertx;
+
                statp->f_blocks = sbp->sb_rblocks;
-               statp->f_bavail = statp->f_bfree =
-                       sbp->sb_frextents * sbp->sb_rextsize;
+               freertx = percpu_counter_sum_positive(&mp->m_frextents);
+               statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
        }
 
        return 0;
@@ -1015,8 +1017,14 @@ xfs_init_percpu_counters(
        if (error)
                goto free_fdblocks;
 
+       error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+       if (error)
+               goto free_delalloc;
+
        return 0;
 
+free_delalloc:
+       percpu_counter_destroy(&mp->m_delalloc_blks);
 free_fdblocks:
        percpu_counter_destroy(&mp->m_fdblocks);
 free_ifree:
@@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters(
        percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
        percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
        percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+       percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
 }
 
 static void
@@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters(
        ASSERT(xfs_is_shutdown(mp) ||
               percpu_counter_sum(&mp->m_delalloc_blks) == 0);
        percpu_counter_destroy(&mp->m_delalloc_blks);
+       percpu_counter_destroy(&mp->m_frextents);
 }
 
 static int
index 0ac717aad380075848af3fc8157e0e5b3c9ed631..6d9df2e9b267439e7688e7b020f650372c28065f 100644 (file)
@@ -498,10 +498,31 @@ xfs_trans_apply_sb_deltas(
                        be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
        }
 
-       if (tp->t_frextents_delta)
-               be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
-       if (tp->t_res_frextents_delta)
-               be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+       /*
+        * Updating frextents requires careful handling because it does not
+        * behave like the lazysb counters because we cannot rely on log
+        * recovery in older kenels to recompute the value from the rtbitmap.
+        * This means that the ondisk frextents must be consistent with the
+        * rtbitmap.
+        *
+        * Therefore, log the frextents change to the ondisk superblock and
+        * update the incore superblock so that future calls to xfs_log_sb
+        * write the correct value ondisk.
+        *
+        * Don't touch m_frextents because it includes incore reservations,
+        * and those are handled by the unreserve function.
+        */
+       if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+               struct xfs_mount        *mp = tp->t_mountp;
+               int64_t                 rtxdelta;
+
+               rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+               spin_lock(&mp->m_sb_lock);
+               be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+               mp->m_sb.sb_frextents += rtxdelta;
+               spin_unlock(&mp->m_sb_lock);
+       }
 
        if (tp->t_dblocks_delta) {
                be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -614,7 +635,12 @@ xfs_trans_unreserve_and_mod_sb(
        if (ifreedelta)
                percpu_counter_add(&mp->m_ifree, ifreedelta);
 
-       if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+       if (rtxdelta) {
+               error = xfs_mod_frextents(mp, rtxdelta);
+               ASSERT(!error);
+       }
+
+       if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
                return;
 
        /* apply remaining deltas */
@@ -622,7 +648,12 @@ xfs_trans_unreserve_and_mod_sb(
        mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
        mp->m_sb.sb_icount += idelta;
        mp->m_sb.sb_ifree += ifreedelta;
-       mp->m_sb.sb_frextents += rtxdelta;
+       /*
+        * Do not touch sb_frextents here because we are dealing with incore
+        * reservation.  sb_frextents is not part of the lazy sb counters so it
+        * must be consistent with the ondisk rtbitmap and must never include
+        * incore reservations.
+        */
        mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
        mp->m_sb.sb_agcount += tp->t_agcount_delta;
        mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;