xfs: fix ordering violation between cache flushes and tail updates

author Dave Chinner <dchinner@redhat.com>

Tue, 27 Jul 2021 23:23:48 +0000 (16:23 -0700)

committer Darrick J. Wong <djwong@kernel.org>

Thu, 29 Jul 2021 16:27:28 +0000 (09:27 -0700)
author Dave Chinner <dchinner@redhat.com>
Tue, 27 Jul 2021 23:23:48 +0000 (16:23 -0700)
committer Darrick J. Wong <djwong@kernel.org>
Thu, 29 Jul 2021 16:27:28 +0000 (09:27 -0700)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c

index 82f5996d3889b59d81d04b673737d1b8164c8055..e8c6c96d4f7c83364c881fb76d5467c1d71ab430 100644 (file)
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -489,12 +489,17 @@ out_error:
  
  /*
   * Flush iclog to disk if this is the last reference to the given iclog and the
- * it is in the WANT_SYNC state.
+ * it is in the WANT_SYNC state.  If the caller passes in a non-zero
+ * @old_tail_lsn and the current log tail does not match, there may be metadata
+ * on disk that must be persisted before this iclog is written.  To satisfy that
+ * requirement, set the XLOG_ICL_NEED_FLUSH flag as a condition for writing this
+ * iclog with the new log tail value.
   */
  int
  xlog_state_release_iclog(
         struct xlog             *log,
-       struct xlog_in_core     *iclog)
+       struct xlog_in_core     *iclog,
+       xfs_lsn_t               old_tail_lsn)
  {
         xfs_lsn_t               tail_lsn;
         lockdep_assert_held(&log->l_icloglock);
@@ -503,6 +508,19 @@ xlog_state_release_iclog(
         if (iclog->ic_state == XLOG_STATE_IOERROR)
                 return -EIO;
  
+       /*
+        * Grabbing the current log tail needs to be atomic w.r.t. the writing
+        * of the tail LSN into the iclog so we guarantee that the log tail does
+        * not move between deciding if a cache flush is required and writing
+        * the LSN into the iclog below.
+        */
+       if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+               tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+               if (old_tail_lsn && tail_lsn != old_tail_lsn)
+                       iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+       }
+
         if (!atomic_dec_and_test(&iclog->ic_refcnt))
                 return 0;
  
@@ -511,8 +529,6 @@ xlog_state_release_iclog(
                 return 0;
         }
  
-       /* update tail before writing to iclog */
-       tail_lsn = xlog_assign_tail_lsn(log->l_mp);
         iclog->ic_state = XLOG_STATE_SYNCING;
         iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
         xlog_verify_tail_lsn(log, iclog, tail_lsn);
@@ -858,7 +874,7 @@ out_err:
          * iclog containing the unmount record is written.
          */
         iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
-       error = xlog_state_release_iclog(log, iclog);
+       error = xlog_state_release_iclog(log, iclog, 0);
         xlog_wait_on_iclog(iclog);
  
         if (tic) {
@@ -2302,7 +2318,7 @@ xlog_write_copy_finish(
         return 0;
  
  release_iclog:
-       error = xlog_state_release_iclog(log, iclog);
+       error = xlog_state_release_iclog(log, iclog, 0);
         spin_unlock(&log->l_icloglock);
         return error;
  }
@@ -2521,7 +2537,7 @@ next_lv:
                 ASSERT(optype & XLOG_COMMIT_TRANS);
                 *commit_iclog = iclog;
         } else {
-               error = xlog_state_release_iclog(log, iclog);
+               error = xlog_state_release_iclog(log, iclog, 0);
         }
         spin_unlock(&log->l_icloglock);
  
@@ -2959,7 +2975,7 @@ restart:
                  * reference to the iclog.
                  */
                 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
-                       error = xlog_state_release_iclog(log, iclog);
+                       error = xlog_state_release_iclog(log, iclog, 0);
                 spin_unlock(&log->l_icloglock);
                 if (error)
                         return error;
@@ -3195,7 +3211,7 @@ xfs_log_force(
                         atomic_inc(&iclog->ic_refcnt);
                         lsn = be64_to_cpu(iclog->ic_header.h_lsn);
                         xlog_state_switch_iclogs(log, iclog, 0);
-                       if (xlog_state_release_iclog(log, iclog))
+                       if (xlog_state_release_iclog(log, iclog, 0))
                                 goto out_error;
  
                         if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
@@ -3275,7 +3291,7 @@ xlog_force_lsn(
                 }
                 atomic_inc(&iclog->ic_refcnt);
                 xlog_state_switch_iclogs(log, iclog, 0);
-               if (xlog_state_release_iclog(log, iclog))
+               if (xlog_state_release_iclog(log, iclog, 0))
                         goto out_error;
                 if (log_flushed)
                         *log_flushed = 1;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c

index b128aaa9b870d5c709ed8fbe6868113e1491d6f9..4c44bc3786c0f07945aa9631a115bb66e670ca33 100644 (file)
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -654,8 +654,9 @@ xlog_cil_push_work(
         struct xfs_trans_header thdr;
         struct xfs_log_iovec    lhdr;
         struct xfs_log_vec      lvhdr = { NULL };
+       xfs_lsn_t               preflush_tail_lsn;
         xfs_lsn_t               commit_lsn;
-       xfs_lsn_t               push_seq;
+       xfs_csn_t               push_seq;
         struct bio              bio;
         DECLARE_COMPLETION_ONSTACK(bdev_flush);
  
@@ -730,7 +731,15 @@ xlog_cil_push_work(
          * because we hold the flush lock exclusively. Hence we can now issue
          * a cache flush to ensure all the completed metadata in the journal we
          * are about to overwrite is on stable storage.
+        *
+        * Because we are issuing this cache flush before we've written the
+        * tail lsn to the iclog, we can have metadata IO completions move the
+        * tail forwards between the completion of this flush and the iclog
+        * being written. In this case, we need to re-issue the cache flush
+        * before the iclog write. To detect whether the log tail moves, sample
+        * the tail LSN *before* we issue the flush.
          */
+       preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
         xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
                                 &bdev_flush);
  
@@ -941,7 +950,7 @@ restart:
          * storage.
          */
         commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
-       xlog_state_release_iclog(log, commit_iclog);
+       xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
         spin_unlock(&log->l_icloglock);
         return;
  
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h

index 4c41bbfa33b0df55acdbf1f83e5b6660cc94e3cf..7cbde0b4f9901d9a0435ed670a308c4c4a1765bf 100644 (file)
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -497,7 +497,8 @@ int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
  void   xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
  void   xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
  
-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+               xfs_lsn_t log_tail_lsn);
  
  /*
   * When we crack an atomic LSN, we sample it first so that the value will not
author	Dave Chinner <dchinner@redhat.com>
	Tue, 27 Jul 2021 23:23:48 +0000 (16:23 -0700)
committer	Darrick J. Wong <djwong@kernel.org>
	Thu, 29 Jul 2021 16:27:28 +0000 (09:27 -0700)
fs/xfs/xfs_log.c		patch | blob | history
fs/xfs/xfs_log_cil.c		patch | blob | history
fs/xfs/xfs_log_priv.h		patch | blob | history