Btrfs: fix data loss in the fast fsync path

author Filipe Manana <fdmanana@suse.com>

Sun, 1 Mar 2015 20:36:00 +0000 (20:36 +0000)

committer Chris Mason <clm@fb.com>

Fri, 6 Mar 2015 01:28:32 +0000 (17:28 -0800)
author Filipe Manana <fdmanana@suse.com>
Sun, 1 Mar 2015 20:36:00 +0000 (20:36 +0000)
committer Chris Mason <clm@fb.com>
Fri, 6 Mar 2015 01:28:32 +0000 (17:28 -0800)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index b476e56..6351947 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
         mutex_unlock(&inode->i_mutex);
  
         /*
-        * we want to make sure fsync finds this change
-        * but we haven't joined a transaction running right now.
-        *
-        * Later on, someone is sure to update the inode and get the
-        * real transid recorded.
-        *
-        * We set last_trans now to the fs_info generation + 1,
-        * this will either be one more than the running transaction
-        * or the generation used for the next transaction if there isn't
-        * one running right now.
-        *
          * We also have to set last_sub_trans to the current log transid,
          * otherwise subsequent syncs to a file that's been synced in this
          * transaction will appear to have already occured.
          */
-       BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
         BTRFS_I(inode)->last_sub_trans = root->log_transid;
         if (num_written > 0) {
                 err = generic_write_sync(file, pos, num_written);
@@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         atomic_inc(&root->log_batch);
  
         /*
-        * check the transaction that last modified this inode
-        * and see if its already been committed
-        */
-       if (!BTRFS_I(inode)->last_trans) {
-               mutex_unlock(&inode->i_mutex);
-               goto out;
-       }
-
-       /*
-        * if the last transaction that changed this file was before
-        * the current transaction, we can bail out now without any
-        * syncing
+        * If the last transaction that changed this file was before the current
+        * transaction and we have the full sync flag set in our inode, we can
+        * bail out now without any syncing.
+        *
+        * Note that we can't bail out if the full sync flag isn't set. This is
+        * because when the full sync flag is set we start all ordered extents
+        * and wait for them to fully complete - when they complete they update
+        * the inode's last_trans field through:
+        *
+        *     btrfs_finish_ordered_io() ->
+        *         btrfs_update_inode_fallback() ->
+        *             btrfs_update_inode() ->
+        *                 btrfs_set_inode_last_trans()
+        *
+        * So we are sure that last_trans is up to date and can do this check to
+        * bail out safely. For the fast path, when the full sync flag is not
+        * set in our inode, we can not do it because we start only our ordered
+        * extents and don't wait for them to complete (that is when
+        * btrfs_finish_ordered_io runs), so here at this point their last_trans
+        * value might be less than or equals to fs_info->last_trans_committed,
+        * and setting a speculative last_trans for an inode when a buffered
+        * write is made (such as fs_info->generation + 1 for example) would not
+        * be reliable since after setting the value and before fsync is called
+        * any number of transactions can start and commit (transaction kthread
+        * commits the current transaction periodically), and a transaction
+        * commit does not start nor waits for ordered extents to complete.
          */
         smp_mb();
         if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
-           BTRFS_I(inode)->last_trans <=
-           root->fs_info->last_trans_committed) {
-               BTRFS_I(inode)->last_trans = 0;
-
+           (full_sync && BTRFS_I(inode)->last_trans <=
+            root->fs_info->last_trans_committed)) {
                 /*
                  * We'v had everything committed since the last time we were
                  * modified so clear this flag in case it was set for whatever
author	Filipe Manana <fdmanana@suse.com>
	Sun, 1 Mar 2015 20:36:00 +0000 (20:36 +0000)
committer	Chris Mason <clm@fb.com>
	Fri, 6 Mar 2015 01:28:32 +0000 (17:28 -0800)