/*
* Flush iclog to disk if this is the last reference to the given iclog and the
- * it is in the WANT_SYNC state.
+ * it is in the WANT_SYNC state. If the caller passes in a non-zero
+ * @old_tail_lsn and the current log tail does not match, there may be metadata
+ * on disk that must be persisted before this iclog is written. To satisfy that
+ * requirement, set the XLOG_ICL_NEED_FLUSH flag as a condition for writing this
+ * iclog with the new log tail value.
*/
int
xlog_state_release_iclog(
struct xlog *log,
- struct xlog_in_core *iclog)
+ struct xlog_in_core *iclog,
+ xfs_lsn_t old_tail_lsn)
{
xfs_lsn_t tail_lsn;
lockdep_assert_held(&log->l_icloglock);
if (iclog->ic_state == XLOG_STATE_IOERROR)
return -EIO;
+ /*
+ * Grabbing the current log tail needs to be atomic w.r.t. the writing
+ * of the tail LSN into the iclog so we guarantee that the log tail does
+ * not move between deciding if a cache flush is required and writing
+ * the LSN into the iclog below.
+ */
+ if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+ if (old_tail_lsn && tail_lsn != old_tail_lsn)
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+ }
+
if (!atomic_dec_and_test(&iclog->ic_refcnt))
return 0;
return 0;
}
- /* update tail before writing to iclog */
- tail_lsn = xlog_assign_tail_lsn(log->l_mp);
iclog->ic_state = XLOG_STATE_SYNCING;
iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
xlog_verify_tail_lsn(log, iclog, tail_lsn);
* iclog containing the unmount record is written.
*/
iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
xlog_wait_on_iclog(iclog);
if (tic) {
return 0;
release_iclog:
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
return error;
}
ASSERT(optype & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
} else {
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
}
spin_unlock(&log->l_icloglock);
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
- error = xlog_state_release_iclog(log, iclog);
+ error = xlog_state_release_iclog(log, iclog, 0);
spin_unlock(&log->l_icloglock);
if (error)
return error;
atomic_inc(&iclog->ic_refcnt);
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ if (xlog_state_release_iclog(log, iclog, 0))
goto out_error;
if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
}
atomic_inc(&iclog->ic_refcnt);
xlog_state_switch_iclogs(log, iclog, 0);
- if (xlog_state_release_iclog(log, iclog))
+ if (xlog_state_release_iclog(log, iclog, 0))
goto out_error;
if (log_flushed)
*log_flushed = 1;
struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL };
+ xfs_lsn_t preflush_tail_lsn;
xfs_lsn_t commit_lsn;
- xfs_lsn_t push_seq;
+ xfs_csn_t push_seq;
struct bio bio;
DECLARE_COMPLETION_ONSTACK(bdev_flush);
* because we hold the flush lock exclusively. Hence we can now issue
* a cache flush to ensure all the completed metadata in the journal we
* are about to overwrite is on stable storage.
+ *
+ * Because we are issuing this cache flush before we've written the
+ * tail lsn to the iclog, we can have metadata IO completions move the
+ * tail forwards between the completion of this flush and the iclog
+ * being written. In this case, we need to re-issue the cache flush
+ * before the iclog write. To detect whether the log tail moves, sample
+ * the tail LSN *before* we issue the flush.
*/
+ preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
&bdev_flush);
* storage.
*/
commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA;
- xlog_state_release_iclog(log, commit_iclog);
+ xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn);
spin_unlock(&log->l_icloglock);
return;