fs/ext4/fast_commit.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 /*
   4  * fs/ext4/fast_commit.c
   5  *
   6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7  *
   8  * Ext4 fast commits routines.
   9  */
  10 #include "ext4.h"
  11 #include "ext4_jbd2.h"
  12 #include "ext4_extents.h"
  13 #include "mballoc.h"
  14
  15 /*
  16  * Ext4 Fast Commits
  17  * -----------------
  18  *
  19  * Ext4 fast commits implement fine grained journalling for Ext4.
  20  *
  21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23  * TLV during the recovery phase. For the scenarios for which we currently
  24  * don't have replay code, fast commit falls back to full commits.
  25  * Fast commits record delta in one of the following three categories.
  26  *
  27  * (A) Directory entry updates:
  28  *
  29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
  30  * - EXT4_FC_TAG_LINK           - records directory entry link
  31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
  32  *
  33  * (B) File specific data range updates:
  34  *
  35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
  36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
  37  *
  38  * (C) Inode metadata (mtime / ctime etc):
  39  *
  40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
  41  *                                during recovery. Note that iblocks field is
  42  *                                not replayed and instead derived during
  43  *                                replay.
  44  * Commit Operation
  45  * ----------------
  46  * With fast commits, we maintain all the directory entry operations in the
  47  * order in which they are issued in an in-memory queue. This queue is flushed
  48  * to disk during the commit operation. We also maintain a list of inodes
  49  * that need to be committed during a fast commit in another in memory queue of
  50  * inodes. During the commit operation, we commit in the following order:
  51  *
  52  * [1] Lock inodes for any further data updates by setting COMMITTING state
  53  * [2] Submit data buffers of all the inodes
  54  * [3] Wait for [2] to complete
  55  * [4] Commit all the directory entry updates in the fast commit space
  56  * [5] Commit all the changed inode structures
  57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58  *     section for more details).
  59  * [7] Wait for [4], [5] and [6] to complete.
  60  *
  61  * All the inode updates must call ext4_fc_start_update() before starting an
  62  * update. If such an ongoing update is present, fast commit waits for it to
  63  * complete. The completion of such an update is marked by
  64  * ext4_fc_stop_update().
  65  *
  66  * Fast Commit Ineligibility
  67  * -------------------------
  68  *
  69  * Not all operations are supported by fast commits today (e.g. extended
  70  * attributes). Fast commit ineligibility is marked by calling
  71  * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
  72  * back to a full commit.
  73  *
  74  * Atomicity of commits
  75  * --------------------
  76  * In order to guarantee atomicity during the commit operation, fast commit
  77  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  78  * tag contains CRC of the contents and TID of the transaction after which
  79  * this fast commit should be applied. Recovery code replays fast commit
  80  * logs only if there's at least 1 valid tail present. For every fast commit
  81  * operation, there is 1 tail. This means, we may end up with multiple tails
  82  * in the fast commit space. Here's an example:
  83  *
  84  * - Create a new file A and remove existing file B
  85  * - fsync()
  86  * - Append contents to file A
  87  * - Truncate file A
  88  * - fsync()
  89  *
  90  * The fast commit space at the end of above operations would look like this:
  91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
  92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
  93  *
  94  * Replay code should thus check for all the valid tails in the FC area.
  95  *
  96  * Fast Commit Replay Idempotence
  97  * ------------------------------
  98  *
  99  * Fast commit tags are idempotent in nature provided the recovery code follows
 100  * certain rules. The guiding principle that the commit path follows while
 101  * committing is that it stores the result of a particular operation instead of
 102  * storing the procedure.
 103  *
 104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 105  * was associated with inode 10. During fast commit, instead of storing this
 106  * operation as a procedure "rename a to b", we store the resulting file system
 107  * state as a "series" of outcomes:
 108  *
 109  * - Link dirent b to inode 10
 110  * - Unlink dirent a
 111  * - Inode <10> with valid refcount
 112  *
 113  * Now when recovery code runs, it needs to "enforce" this state on the file
 114  * system. This is what guarantees idempotence of fast commit replay.
 115  *
 116  * Let's take an example of a procedure that is not idempotent and see how fast
 117  * commits make it idempotent. Consider following sequence of operations:
 118  *
 119  *     rm A;    mv B A;    read A
 120  *  (x)     (y)        (z)
 121  *
 122  * (x), (y) and (z) are the points at which we can crash. If we store this
 123  * sequence of operations as is then the replay is not idempotent. Let's say
 124  * while in replay, we crash at (z). During the second replay, file A (which was
 125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
 126  * file named A would be absent when we try to read A. So, this sequence of
 127  * operations is not idempotent. However, as mentioned above, instead of storing
 128  * the procedure fast commits store the outcome of each procedure. Thus the fast
 129  * commit log for above procedure would be as follows:
 130  *
 131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 132  * inode 11 before the replay)
 133  *
 134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 135  * (w)          (x)                    (y)          (z)
 136  *
 137  * If we crash at (z), we will have file A linked to inode 11. During the second
 138  * replay, we will remove file A (inode 11). But we will create it back and make
 139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
 140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 142  * similarly. Thus, by converting a non-idempotent procedure into a series of
 143  * idempotent outcomes, fast commits ensured idempotence during the replay.
 144  *
 145  * TODOs
 146  * -----
 147  *
 148  * 0) Fast commit replay path hardening: Fast commit replay code should use
 149  *    journal handles to make sure all the updates it does during the replay
 150  *    path are atomic. With that if we crash during fast commit replay, after
 151  *    trying to do recovery again, we will find a file system where fast commit
 152  *    area is invalid (because new full commit would be found). In order to deal
 153  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 154  *    superblock state is persisted before starting the replay, so that after
 155  *    the crash, fast commit recovery code can look at that flag and perform
 156  *    fast commit recovery even if that area is invalidated by later full
 157  *    commits.
 158  *
 159  * 1) Fast commit's commit path locks the entire file system during fast
 160  *    commit. This has significant performance penalty. Instead of that, we
 161  *    should use ext4_fc_start/stop_update functions to start inode level
 162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
 163  *    system locking during commit path.
 164  *
 165  * 2) Handle more ineligible cases.
 166  */
 167
 168 #include <trace/events/ext4.h>
/*
 * Slab cache for struct ext4_fc_dentry_update nodes: allocated from in
 * __track_dentry_update() and handed back in ext4_fc_del().
 */
static struct kmem_cache *ext4_fc_dentry_cachep;
 170
 171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 172 {
 173         BUFFER_TRACE(bh, "");
 174         if (uptodate) {
 175                 ext4_debug("%s: Block %lld up-to-date",
 176                            __func__, bh->b_blocknr);
 177                 set_buffer_uptodate(bh);
 178         } else {
 179                 ext4_debug("%s: Block %lld not up-to-date",
 180                            __func__, bh->b_blocknr);
 181                 clear_buffer_uptodate(bh);
 182         }
 183
 184         unlock_buffer(bh);
 185 }
 186
 187 static inline void ext4_fc_reset_inode(struct inode *inode)
 188 {
 189         struct ext4_inode_info *ei = EXT4_I(inode);
 190
 191         ei->i_fc_lblk_start = 0;
 192         ei->i_fc_lblk_len = 0;
 193 }
 194
 195 void ext4_fc_init_inode(struct inode *inode)
 196 {
 197         struct ext4_inode_info *ei = EXT4_I(inode);
 198
 199         ext4_fc_reset_inode(inode);
 200         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 201         INIT_LIST_HEAD(&ei->i_fc_list);
 202         INIT_LIST_HEAD(&ei->i_fc_dilist);
 203         init_waitqueue_head(&ei->i_fc_wait);
 204         atomic_set(&ei->i_fc_updates, 0);
 205 }
 206
 207 /* This function must be called with sbi->s_fc_lock held. */
 208 static void ext4_fc_wait_committing_inode(struct inode *inode)
 209 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 210 {
 211         wait_queue_head_t *wq;
 212         struct ext4_inode_info *ei = EXT4_I(inode);
 213
 214 #if (BITS_PER_LONG < 64)
 215         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 216                         EXT4_STATE_FC_COMMITTING);
 217         wq = bit_waitqueue(&ei->i_state_flags,
 218                                 EXT4_STATE_FC_COMMITTING);
 219 #else
 220         DEFINE_WAIT_BIT(wait, &ei->i_flags,
 221                         EXT4_STATE_FC_COMMITTING);
 222         wq = bit_waitqueue(&ei->i_flags,
 223                                 EXT4_STATE_FC_COMMITTING);
 224 #endif
 225         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 226         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 227         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 228         schedule();
 229         finish_wait(wq, &wait.wq_entry);
 230 }
 231
 232 /*
 233  * Inform Ext4's fast about start of an inode update
 234  *
 235  * This function is called by the high level call VFS callbacks before
 236  * performing any inode update. This function blocks if there's an ongoing
 237  * fast commit on the inode in question.
 238  */
 239 void ext4_fc_start_update(struct inode *inode)
 240 {
 241         struct ext4_inode_info *ei = EXT4_I(inode);
 242
 243         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 244             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 245                 return;
 246
 247 restart:
 248         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 249         if (list_empty(&ei->i_fc_list))
 250                 goto out;
 251
 252         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 253                 ext4_fc_wait_committing_inode(inode);
 254                 goto restart;
 255         }
 256 out:
 257         atomic_inc(&ei->i_fc_updates);
 258         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 259 }
 260
 261 /*
 262  * Stop inode update and wake up waiting fast commits if any.
 263  */
 264 void ext4_fc_stop_update(struct inode *inode)
 265 {
 266         struct ext4_inode_info *ei = EXT4_I(inode);
 267
 268         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 269             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 270                 return;
 271
 272         if (atomic_dec_and_test(&ei->i_fc_updates))
 273                 wake_up_all(&ei->i_fc_wait);
 274 }
 275
 276 /*
 277  * Remove inode from fast commit list. If the inode is being committed
 278  * we wait until inode commit is done.
 279  */
 280 void ext4_fc_del(struct inode *inode)
 281 {
 282         struct ext4_inode_info *ei = EXT4_I(inode);
 283         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 284         struct ext4_fc_dentry_update *fc_dentry;
 285
 286         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 287             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 288                 return;
 289
 290 restart:
 291         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 292         if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
 293                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 294                 return;
 295         }
 296
 297         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 298                 ext4_fc_wait_committing_inode(inode);
 299                 goto restart;
 300         }
 301
 302         if (!list_empty(&ei->i_fc_list))
 303                 list_del_init(&ei->i_fc_list);
 304
 305         /*
 306          * Since this inode is getting removed, let's also remove all FC
 307          * dentry create references, since it is not needed to log it anyways.
 308          */
 309         if (list_empty(&ei->i_fc_dilist)) {
 310                 spin_unlock(&sbi->s_fc_lock);
 311                 return;
 312         }
 313
 314         fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
 315         WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
 316         list_del_init(&fc_dentry->fcd_list);
 317         list_del_init(&fc_dentry->fcd_dilist);
 318
 319         WARN_ON(!list_empty(&ei->i_fc_dilist));
 320         spin_unlock(&sbi->s_fc_lock);
 321
 322         if (fc_dentry->fcd_name.name &&
 323                 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
 324                 kfree(fc_dentry->fcd_name.name);
 325         kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
 326
 327         return;
 328 }
 329
 330 /*
 331  * Mark file system as fast commit ineligible, and record latest
 332  * ineligible transaction tid. This means until the recorded
 333  * transaction, commit operation would result in a full jbd2 commit.
 334  */
 335 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
 336 {
 337         struct ext4_sb_info *sbi = EXT4_SB(sb);
 338         tid_t tid;
 339
 340         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 341             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 342                 return;
 343
 344         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 345         if (handle && !IS_ERR(handle))
 346                 tid = handle->h_transaction->t_tid;
 347         else {
 348                 read_lock(&sbi->s_journal->j_state_lock);
 349                 tid = sbi->s_journal->j_running_transaction ?
 350                                 sbi->s_journal->j_running_transaction->t_tid : 0;
 351                 read_unlock(&sbi->s_journal->j_state_lock);
 352         }
 353         spin_lock(&sbi->s_fc_lock);
 354         if (sbi->s_fc_ineligible_tid < tid)
 355                 sbi->s_fc_ineligible_tid = tid;
 356         spin_unlock(&sbi->s_fc_lock);
 357         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 358         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 359 }
 360
 361 /*
 362  * Generic fast commit tracking function. If this is the first time this we are
 363  * called after a full commit, we initialize fast commit fields and then call
 364  * __fc_track_fn() with update = 0. If we have already been called after a full
 365  * commit, we pass update = 1. Based on that, the track function can determine
 366  * if it needs to track a field for the first time or if it needs to just
 367  * update the previously tracked value.
 368  *
 369  * If enqueue is set, this function enqueues the inode in fast commit list.
 370  */
 371 static int ext4_fc_track_template(
 372         handle_t *handle, struct inode *inode,
 373         int (*__fc_track_fn)(struct inode *, void *, bool),
 374         void *args, int enqueue)
 375 {
 376         bool update = false;
 377         struct ext4_inode_info *ei = EXT4_I(inode);
 378         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 379         tid_t tid = 0;
 380         int ret;
 381
 382         tid = handle->h_transaction->t_tid;
 383         mutex_lock(&ei->i_fc_lock);
 384         if (tid == ei->i_sync_tid) {
 385                 update = true;
 386         } else {
 387                 ext4_fc_reset_inode(inode);
 388                 ei->i_sync_tid = tid;
 389         }
 390         ret = __fc_track_fn(inode, args, update);
 391         mutex_unlock(&ei->i_fc_lock);
 392
 393         if (!enqueue)
 394                 return ret;
 395
 396         spin_lock(&sbi->s_fc_lock);
 397         if (list_empty(&EXT4_I(inode)->i_fc_list))
 398                 list_add_tail(&EXT4_I(inode)->i_fc_list,
 399                                 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 400                                  sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
 401                                 &sbi->s_fc_q[FC_Q_STAGING] :
 402                                 &sbi->s_fc_q[FC_Q_MAIN]);
 403         spin_unlock(&sbi->s_fc_lock);
 404
 405         return ret;
 406 }
 407
/* Arguments handed to __track_dentry_update() through its void *arg. */
struct __track_dentry_update_args {
        /* dentry whose name and parent inode number get recorded */
        struct dentry *dentry;
        /* EXT4_FC_TAG_* operation, stored in the node's fcd_op */
        int op;
};
 412
 413 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 414 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 415 {
 416         struct ext4_fc_dentry_update *node;
 417         struct ext4_inode_info *ei = EXT4_I(inode);
 418         struct __track_dentry_update_args *dentry_update =
 419                 (struct __track_dentry_update_args *)arg;
 420         struct dentry *dentry = dentry_update->dentry;
 421         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 422
 423         mutex_unlock(&ei->i_fc_lock);
 424         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 425         if (!node) {
 426                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
 427                 mutex_lock(&ei->i_fc_lock);
 428                 return -ENOMEM;
 429         }
 430
 431         node->fcd_op = dentry_update->op;
 432         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 433         node->fcd_ino = inode->i_ino;
 434         if (dentry->d_name.len > DNAME_INLINE_LEN) {
 435                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 436                 if (!node->fcd_name.name) {
 437                         kmem_cache_free(ext4_fc_dentry_cachep, node);
 438                         ext4_fc_mark_ineligible(inode->i_sb,
 439                                 EXT4_FC_REASON_NOMEM, NULL);
 440                         mutex_lock(&ei->i_fc_lock);
 441                         return -ENOMEM;
 442                 }
 443                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 444                         dentry->d_name.len);
 445         } else {
 446                 memcpy(node->fcd_iname, dentry->d_name.name,
 447                         dentry->d_name.len);
 448                 node->fcd_name.name = node->fcd_iname;
 449         }
 450         node->fcd_name.len = dentry->d_name.len;
 451         INIT_LIST_HEAD(&node->fcd_dilist);
 452         spin_lock(&sbi->s_fc_lock);
 453         if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 454                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
 455                 list_add_tail(&node->fcd_list,
 456                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
 457         else
 458                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 459
 460         /*
 461          * This helps us keep a track of all fc_dentry updates which is part of
 462          * this ext4 inode. So in case the inode is getting unlinked, before
 463          * even we get a chance to fsync, we could remove all fc_dentry
 464          * references while evicting the inode in ext4_fc_del().
 465          * Also with this, we don't need to loop over all the inodes in
 466          * sbi->s_fc_q to get the corresponding inode in
 467          * ext4_fc_commit_dentry_updates().
 468          */
 469         if (dentry_update->op == EXT4_FC_TAG_CREAT) {
 470                 WARN_ON(!list_empty(&ei->i_fc_dilist));
 471                 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
 472         }
 473         spin_unlock(&sbi->s_fc_lock);
 474         mutex_lock(&ei->i_fc_lock);
 475
 476         return 0;
 477 }
 478
 479 void __ext4_fc_track_unlink(handle_t *handle,
 480                 struct inode *inode, struct dentry *dentry)
 481 {
 482         struct __track_dentry_update_args args;
 483         int ret;
 484
 485         args.dentry = dentry;
 486         args.op = EXT4_FC_TAG_UNLINK;
 487
 488         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 489                                         (void *)&args, 0);
 490         trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
 491 }
 492
 493 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 494 {
 495         struct inode *inode = d_inode(dentry);
 496         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 497
 498         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 499             (sbi->s_mount_state & EXT4_FC_REPLAY))
 500                 return;
 501
 502         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 503                 return;
 504
 505         __ext4_fc_track_unlink(handle, inode, dentry);
 506 }
 507
 508 void __ext4_fc_track_link(handle_t *handle,
 509         struct inode *inode, struct dentry *dentry)
 510 {
 511         struct __track_dentry_update_args args;
 512         int ret;
 513
 514         args.dentry = dentry;
 515         args.op = EXT4_FC_TAG_LINK;
 516
 517         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 518                                         (void *)&args, 0);
 519         trace_ext4_fc_track_link(handle, inode, dentry, ret);
 520 }
 521
 522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 523 {
 524         struct inode *inode = d_inode(dentry);
 525         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 526
 527         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 528             (sbi->s_mount_state & EXT4_FC_REPLAY))
 529                 return;
 530
 531         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 532                 return;
 533
 534         __ext4_fc_track_link(handle, inode, dentry);
 535 }
 536
 537 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 538                           struct dentry *dentry)
 539 {
 540         struct __track_dentry_update_args args;
 541         int ret;
 542
 543         args.dentry = dentry;
 544         args.op = EXT4_FC_TAG_CREAT;
 545
 546         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 547                                         (void *)&args, 0);
 548         trace_ext4_fc_track_create(handle, inode, dentry, ret);
 549 }
 550
 551 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 552 {
 553         struct inode *inode = d_inode(dentry);
 554         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 555
 556         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 557             (sbi->s_mount_state & EXT4_FC_REPLAY))
 558                 return;
 559
 560         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 561                 return;
 562
 563         __ext4_fc_track_create(handle, inode, dentry);
 564 }
 565
 566 /* __track_fn for inode tracking */
 567 static int __track_inode(struct inode *inode, void *arg, bool update)
 568 {
 569         if (update)
 570                 return -EEXIST;
 571
 572         EXT4_I(inode)->i_fc_lblk_len = 0;
 573
 574         return 0;
 575 }
 576
 577 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 578 {
 579         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 580         int ret;
 581
 582         if (S_ISDIR(inode->i_mode))
 583                 return;
 584
 585         if (ext4_should_journal_data(inode)) {
 586                 ext4_fc_mark_ineligible(inode->i_sb,
 587                                         EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
 588                 return;
 589         }
 590
 591         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 592             (sbi->s_mount_state & EXT4_FC_REPLAY))
 593                 return;
 594
 595         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 596                 return;
 597
 598         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 599         trace_ext4_fc_track_inode(handle, inode, ret);
 600 }
 601
/* Arguments handed to __track_range() through its void *arg. */
struct __track_range_args {
        /* First and last logical block of the range, both inclusive. */
        ext4_lblk_t start, end;
};
 605
 606 /* __track_fn for tracking data updates */
 607 static int __track_range(struct inode *inode, void *arg, bool update)
 608 {
 609         struct ext4_inode_info *ei = EXT4_I(inode);
 610         ext4_lblk_t oldstart;
 611         struct __track_range_args *__arg =
 612                 (struct __track_range_args *)arg;
 613
 614         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 615                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 616                 return -ECANCELED;
 617         }
 618
 619         oldstart = ei->i_fc_lblk_start;
 620
 621         if (update && ei->i_fc_lblk_len > 0) {
 622                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 623                 ei->i_fc_lblk_len =
 624                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 625                                 ei->i_fc_lblk_start + 1;
 626         } else {
 627                 ei->i_fc_lblk_start = __arg->start;
 628                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 629         }
 630
 631         return 0;
 632 }
 633
 634 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 635                          ext4_lblk_t end)
 636 {
 637         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 638         struct __track_range_args args;
 639         int ret;
 640
 641         if (S_ISDIR(inode->i_mode))
 642                 return;
 643
 644         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 645             (sbi->s_mount_state & EXT4_FC_REPLAY))
 646                 return;
 647
 648         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 649                 return;
 650
 651         args.start = start;
 652         args.end = end;
 653
 654         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 655
 656         trace_ext4_fc_track_range(handle, inode, start, end, ret);
 657 }
 658
 659 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 660 {
 661         int write_flags = REQ_SYNC;
 662         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 663
 664         /* Add REQ_FUA | REQ_PREFLUSH only its tail */
 665         if (test_opt(sb, BARRIER) && is_tail)
 666                 write_flags |= REQ_FUA | REQ_PREFLUSH;
 667         lock_buffer(bh);
 668         set_buffer_dirty(bh);
 669         set_buffer_uptodate(bh);
 670         bh->b_end_io = ext4_end_buffer_io_sync;
 671         submit_bh(REQ_OP_WRITE, write_flags, bh);
 672         EXT4_SB(sb)->s_fc_bh = NULL;
 673 }
 674
 675 /* Ext4 commit path routines */
 676
 677 /* memzero and update CRC */
 678 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 679                                 u32 *crc)
 680 {
 681         void *ret;
 682
 683         ret = memset(dst, 0, len);
 684         if (crc)
 685                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 686         return ret;
 687 }
 688
 689 /*
 690  * Allocate len bytes on a fast commit buffer.
 691  *
 692  * During the commit time this function is used to manage fast commit
 693  * block space. We don't split a fast commit log onto different
 694  * blocks. So this function makes sure that if there's not enough space
 695  * on the current block, the remaining space in the current block is
 696  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 697  * new block is from jbd2 and CRC is updated to reflect the padding
 698  * we added.
 699  */
 700 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 701 {
 702         struct ext4_fc_tl *tl;
 703         struct ext4_sb_info *sbi = EXT4_SB(sb);
 704         struct buffer_head *bh;
 705         int bsize = sbi->s_journal->j_blocksize;
 706         int ret, off = sbi->s_fc_bytes % bsize;
 707         int pad_len;
 708
 709         /*
 710          * After allocating len, we should have space at least for a 0 byte
 711          * padding.
 712          */
 713         if (len + sizeof(struct ext4_fc_tl) > bsize)
 714                 return NULL;
 715
 716         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 717                 /*
 718                  * Only allocate from current buffer if we have enough space for
 719                  * this request AND we have space to add a zero byte padding.
 720                  */
 721                 if (!sbi->s_fc_bh) {
 722                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 723                         if (ret)
 724                                 return NULL;
 725                         sbi->s_fc_bh = bh;
 726                 }
 727                 sbi->s_fc_bytes += len;
 728                 return sbi->s_fc_bh->b_data + off;
 729         }
 730         /* Need to add PAD tag */
 731         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 732         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 733         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 734         tl->fc_len = cpu_to_le16(pad_len);
 735         if (crc)
 736                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 737         if (pad_len > 0)
 738                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 739         ext4_fc_submit_bh(sb, false);
 740
 741         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 742         if (ret)
 743                 return NULL;
 744         sbi->s_fc_bh = bh;
 745         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 746         return sbi->s_fc_bh->b_data;
 747 }
 748
 749 /* memcpy to fc reserved space and update CRC */
 750 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 751                                 int len, u32 *crc)
 752 {
 753         if (crc)
 754                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 755         return memcpy(dst, src, len);
 756 }
 757
 758 /*
 759  * Complete a fast commit by writing tail tag.
 760  *
 761  * Writing tail tag marks the end of a fast commit. In order to guarantee
 762  * atomicity, after writing tail tag, even if there's space remaining
 763  * in the block, next commit shouldn't use it. That's why tail tag
 764  * has the length as that of the remaining space on the block.
 765  */
 766 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 767 {
 768         struct ext4_sb_info *sbi = EXT4_SB(sb);
 769         struct ext4_fc_tl tl;
 770         struct ext4_fc_tail tail;
 771         int off, bsize = sbi->s_journal->j_blocksize;
 772         u8 *dst;
 773
 774         /*
 775          * ext4_fc_reserve_space takes care of allocating an extra block if
 776          * there's no enough space on this block for accommodating this tail.
 777          */
 778         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 779         if (!dst)
 780                 return -ENOSPC;
 781
 782         off = sbi->s_fc_bytes % bsize;
 783
 784         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 785         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 786         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 787
 788         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 789         dst += sizeof(tl);
 790         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 791         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 792         dst += sizeof(tail.fc_tid);
 793         tail.fc_crc = cpu_to_le32(crc);
 794         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 795
 796         ext4_fc_submit_bh(sb, true);
 797
 798         return 0;
 799 }
 800
 801 /*
 802  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 803  * Returns false if there's not enough space.
 804  */
 805 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 806                            u32 *crc)
 807 {
 808         struct ext4_fc_tl tl;
 809         u8 *dst;
 810
 811         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 812         if (!dst)
 813                 return false;
 814
 815         tl.fc_tag = cpu_to_le16(tag);
 816         tl.fc_len = cpu_to_le16(len);
 817
 818         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 819         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 820
 821         return true;
 822 }
 823
 824 /* Same as above, but adds dentry tlv. */
 825 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 826                                    struct ext4_fc_dentry_update *fc_dentry)
 827 {
 828         struct ext4_fc_dentry_info fcd;
 829         struct ext4_fc_tl tl;
 830         int dlen = fc_dentry->fcd_name.len;
 831         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 832                                         crc);
 833
 834         if (!dst)
 835                 return false;
 836
 837         fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
 838         fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
 839         tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
 840         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 841         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 842         dst += sizeof(tl);
 843         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 844         dst += sizeof(fcd);
 845         ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
 846
 847         return true;
 848 }
 849
 850 /*
 851  * Writes inode in the fast commit space under TLV with tag @tag.
 852  * Returns 0 on success, error on failure.
 853  */
 854 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 855 {
 856         struct ext4_inode_info *ei = EXT4_I(inode);
 857         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 858         int ret;
 859         struct ext4_iloc iloc;
 860         struct ext4_fc_inode fc_inode;
 861         struct ext4_fc_tl tl;
 862         u8 *dst;
 863
 864         ret = ext4_get_inode_loc(inode, &iloc);
 865         if (ret)
 866                 return ret;
 867
 868         if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 869                 inode_len = EXT4_INODE_SIZE(inode->i_sb);
 870         else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 871                 inode_len += ei->i_extra_isize;
 872
 873         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 874         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 875         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 876
 877         dst = ext4_fc_reserve_space(inode->i_sb,
 878                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 879         if (!dst)
 880                 return -ECANCELED;
 881
 882         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 883                 return -ECANCELED;
 884         dst += sizeof(tl);
 885         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 886                 return -ECANCELED;
 887         dst += sizeof(fc_inode);
 888         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 889                                         inode_len, crc))
 890                 return -ECANCELED;
 891
 892         return 0;
 893 }
 894
 895 /*
 896  * Writes updated data ranges for the inode in question. Updates CRC.
 897  * Returns 0 on success, error otherwise.
 898  */
 899 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 900 {
 901         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 902         struct ext4_inode_info *ei = EXT4_I(inode);
 903         struct ext4_map_blocks map;
 904         struct ext4_fc_add_range fc_ext;
 905         struct ext4_fc_del_range lrange;
 906         struct ext4_extent *ex;
 907         int ret;
 908
 909         mutex_lock(&ei->i_fc_lock);
 910         if (ei->i_fc_lblk_len == 0) {
 911                 mutex_unlock(&ei->i_fc_lock);
 912                 return 0;
 913         }
 914         old_blk_size = ei->i_fc_lblk_start;
 915         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 916         ei->i_fc_lblk_len = 0;
 917         mutex_unlock(&ei->i_fc_lock);
 918
 919         cur_lblk_off = old_blk_size;
 920         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 921                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 922
 923         while (cur_lblk_off <= new_blk_size) {
 924                 map.m_lblk = cur_lblk_off;
 925                 map.m_len = new_blk_size - cur_lblk_off + 1;
 926                 ret = ext4_map_blocks(NULL, inode, &map, 0);
 927                 if (ret < 0)
 928                         return -ECANCELED;
 929
 930                 if (map.m_len == 0) {
 931                         cur_lblk_off++;
 932                         continue;
 933                 }
 934
 935                 if (ret == 0) {
 936                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
 937                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 938                         lrange.fc_len = cpu_to_le32(map.m_len);
 939                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 940                                             sizeof(lrange), (u8 *)&lrange, crc))
 941                                 return -ENOSPC;
 942                 } else {
 943                         unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 944                                 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 945
 946                         /* Limit the number of blocks in one extent */
 947                         map.m_len = min(max, map.m_len);
 948
 949                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 950                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
 951                         ex->ee_block = cpu_to_le32(map.m_lblk);
 952                         ex->ee_len = cpu_to_le16(map.m_len);
 953                         ext4_ext_store_pblock(ex, map.m_pblk);
 954                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
 955                                 ext4_ext_mark_unwritten(ex);
 956                         else
 957                                 ext4_ext_mark_initialized(ex);
 958                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 959                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
 960                                 return -ENOSPC;
 961                 }
 962
 963                 cur_lblk_off += map.m_len;
 964         }
 965
 966         return 0;
 967 }
 968
 969
 970 /* Submit data for all the fast commit inodes */
 971 static int ext4_fc_submit_inode_data_all(journal_t *journal)
 972 {
 973         struct super_block *sb = (struct super_block *)(journal->j_private);
 974         struct ext4_sb_info *sbi = EXT4_SB(sb);
 975         struct ext4_inode_info *ei;
 976         int ret = 0;
 977
 978         spin_lock(&sbi->s_fc_lock);
 979         list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 980                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 981                 while (atomic_read(&ei->i_fc_updates)) {
 982                         DEFINE_WAIT(wait);
 983
 984                         prepare_to_wait(&ei->i_fc_wait, &wait,
 985                                                 TASK_UNINTERRUPTIBLE);
 986                         if (atomic_read(&ei->i_fc_updates)) {
 987                                 spin_unlock(&sbi->s_fc_lock);
 988                                 schedule();
 989                                 spin_lock(&sbi->s_fc_lock);
 990                         }
 991                         finish_wait(&ei->i_fc_wait, &wait);
 992                 }
 993                 spin_unlock(&sbi->s_fc_lock);
 994                 ret = jbd2_submit_inode_data(ei->jinode);
 995                 if (ret)
 996                         return ret;
 997                 spin_lock(&sbi->s_fc_lock);
 998         }
 999         spin_unlock(&sbi->s_fc_lock);
1000
1001         return ret;
1002 }
1003
1004 /* Wait for completion of data for all the fast commit inodes */
1005 static int ext4_fc_wait_inode_data_all(journal_t *journal)
1006 {
1007         struct super_block *sb = (struct super_block *)(journal->j_private);
1008         struct ext4_sb_info *sbi = EXT4_SB(sb);
1009         struct ext4_inode_info *pos, *n;
1010         int ret = 0;
1011
1012         spin_lock(&sbi->s_fc_lock);
1013         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1014                 if (!ext4_test_inode_state(&pos->vfs_inode,
1015                                            EXT4_STATE_FC_COMMITTING))
1016                         continue;
1017                 spin_unlock(&sbi->s_fc_lock);
1018
1019                 ret = jbd2_wait_inode_data(journal, pos->jinode);
1020                 if (ret)
1021                         return ret;
1022                 spin_lock(&sbi->s_fc_lock);
1023         }
1024         spin_unlock(&sbi->s_fc_lock);
1025
1026         return 0;
1027 }
1028
1029 /* Commit all the directory entry updates */
1030 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1031 __acquires(&sbi->s_fc_lock)
1032 __releases(&sbi->s_fc_lock)
1033 {
1034         struct super_block *sb = (struct super_block *)(journal->j_private);
1035         struct ext4_sb_info *sbi = EXT4_SB(sb);
1036         struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1037         struct inode *inode;
1038         struct ext4_inode_info *ei;
1039         int ret;
1040
1041         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1042                 return 0;
1043         list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1044                                  &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1045                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1046                         spin_unlock(&sbi->s_fc_lock);
1047                         if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1048                                 ret = -ENOSPC;
1049                                 goto lock_and_exit;
1050                         }
1051                         spin_lock(&sbi->s_fc_lock);
1052                         continue;
1053                 }
1054                 /*
1055                  * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1056                  * corresponding inode pointer
1057                  */
1058                 WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1059                 ei = list_first_entry(&fc_dentry->fcd_dilist,
1060                                 struct ext4_inode_info, i_fc_dilist);
1061                 inode = &ei->vfs_inode;
1062                 WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1063
1064                 spin_unlock(&sbi->s_fc_lock);
1065
1066                 /*
1067                  * We first write the inode and then the create dirent. This
1068                  * allows the recovery code to create an unnamed inode first
1069                  * and then link it to a directory entry. This allows us
1070                  * to use namei.c routines almost as is and simplifies
1071                  * the recovery code.
1072                  */
1073                 ret = ext4_fc_write_inode(inode, crc);
1074                 if (ret)
1075                         goto lock_and_exit;
1076
1077                 ret = ext4_fc_write_inode_data(inode, crc);
1078                 if (ret)
1079                         goto lock_and_exit;
1080
1081                 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1082                         ret = -ENOSPC;
1083                         goto lock_and_exit;
1084                 }
1085
1086                 spin_lock(&sbi->s_fc_lock);
1087         }
1088         return 0;
1089 lock_and_exit:
1090         spin_lock(&sbi->s_fc_lock);
1091         return ret;
1092 }
1093
1094 static int ext4_fc_perform_commit(journal_t *journal)
1095 {
1096         struct super_block *sb = (struct super_block *)(journal->j_private);
1097         struct ext4_sb_info *sbi = EXT4_SB(sb);
1098         struct ext4_inode_info *iter;
1099         struct ext4_fc_head head;
1100         struct inode *inode;
1101         struct blk_plug plug;
1102         int ret = 0;
1103         u32 crc = 0;
1104
1105         ret = ext4_fc_submit_inode_data_all(journal);
1106         if (ret)
1107                 return ret;
1108
1109         ret = ext4_fc_wait_inode_data_all(journal);
1110         if (ret)
1111                 return ret;
1112
1113         /*
1114          * If file system device is different from journal device, issue a cache
1115          * flush before we start writing fast commit blocks.
1116          */
1117         if (journal->j_fs_dev != journal->j_dev)
1118                 blkdev_issue_flush(journal->j_fs_dev);
1119
1120         blk_start_plug(&plug);
1121         if (sbi->s_fc_bytes == 0) {
1122                 /*
1123                  * Add a head tag only if this is the first fast commit
1124                  * in this TID.
1125                  */
1126                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1127                 head.fc_tid = cpu_to_le32(
1128                         sbi->s_journal->j_running_transaction->t_tid);
1129                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1130                         (u8 *)&head, &crc)) {
1131                         ret = -ENOSPC;
1132                         goto out;
1133                 }
1134         }
1135
1136         spin_lock(&sbi->s_fc_lock);
1137         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1138         if (ret) {
1139                 spin_unlock(&sbi->s_fc_lock);
1140                 goto out;
1141         }
1142
1143         list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1144                 inode = &iter->vfs_inode;
1145                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1146                         continue;
1147
1148                 spin_unlock(&sbi->s_fc_lock);
1149                 ret = ext4_fc_write_inode_data(inode, &crc);
1150                 if (ret)
1151                         goto out;
1152                 ret = ext4_fc_write_inode(inode, &crc);
1153                 if (ret)
1154                         goto out;
1155                 spin_lock(&sbi->s_fc_lock);
1156         }
1157         spin_unlock(&sbi->s_fc_lock);
1158
1159         ret = ext4_fc_write_tail(sb, crc);
1160
1161 out:
1162         blk_finish_plug(&plug);
1163         return ret;
1164 }
1165
1166 static void ext4_fc_update_stats(struct super_block *sb, int status,
1167                                  u64 commit_time, int nblks, tid_t commit_tid)
1168 {
1169         struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1170
1171         jbd_debug(1, "Fast commit ended with status = %d for tid %u",
1172                         status, commit_tid);
1173         if (status == EXT4_FC_STATUS_OK) {
1174                 stats->fc_num_commits++;
1175                 stats->fc_numblks += nblks;
1176                 if (likely(stats->s_fc_avg_commit_time))
1177                         stats->s_fc_avg_commit_time =
1178                                 (commit_time +
1179                                  stats->s_fc_avg_commit_time * 3) / 4;
1180                 else
1181                         stats->s_fc_avg_commit_time = commit_time;
1182         } else if (status == EXT4_FC_STATUS_FAILED ||
1183                    status == EXT4_FC_STATUS_INELIGIBLE) {
1184                 if (status == EXT4_FC_STATUS_FAILED)
1185                         stats->fc_failed_commits++;
1186                 stats->fc_ineligible_commits++;
1187         } else {
1188                 stats->fc_skipped_commits++;
1189         }
1190         trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1191 }
1192
1193 /*
1194  * The main commit entry point. Performs a fast commit for transaction
1195  * commit_tid if needed. If it's not possible to perform a fast commit
1196  * due to various reasons, we fall back to full commit. Returns 0
1197  * on success, error otherwise.
1198  */
1199 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1200 {
1201         struct super_block *sb = (struct super_block *)(journal->j_private);
1202         struct ext4_sb_info *sbi = EXT4_SB(sb);
1203         int nblks = 0, ret, bsize = journal->j_blocksize;
1204         int subtid = atomic_read(&sbi->s_fc_subtid);
1205         int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1206         ktime_t start_time, commit_time;
1207
1208         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1209                 return jbd2_complete_transaction(journal, commit_tid);
1210
1211         trace_ext4_fc_commit_start(sb, commit_tid);
1212
1213         start_time = ktime_get();
1214
1215 restart_fc:
1216         ret = jbd2_fc_begin_commit(journal, commit_tid);
1217         if (ret == -EALREADY) {
1218                 /* There was an ongoing commit, check if we need to restart */
1219                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1220                         commit_tid > journal->j_commit_sequence)
1221                         goto restart_fc;
1222                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1223                                 commit_tid);
1224                 return 0;
1225         } else if (ret) {
1226                 /*
1227                  * Commit couldn't start. Just update stats and perform a
1228                  * full commit.
1229                  */
1230                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1231                                 commit_tid);
1232                 return jbd2_complete_transaction(journal, commit_tid);
1233         }
1234
1235         /*
1236          * After establishing journal barrier via jbd2_fc_begin_commit(), check
1237          * if we are fast commit ineligible.
1238          */
1239         if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1240                 status = EXT4_FC_STATUS_INELIGIBLE;
1241                 goto fallback;
1242         }
1243
1244         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1245         ret = ext4_fc_perform_commit(journal);
1246         if (ret < 0) {
1247                 status = EXT4_FC_STATUS_FAILED;
1248                 goto fallback;
1249         }
1250         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1251         ret = jbd2_fc_wait_bufs(journal, nblks);
1252         if (ret < 0) {
1253                 status = EXT4_FC_STATUS_FAILED;
1254                 goto fallback;
1255         }
1256         atomic_inc(&sbi->s_fc_subtid);
1257         ret = jbd2_fc_end_commit(journal);
1258         /*
1259          * weight the commit time higher than the average time so we
1260          * don't react too strongly to vast changes in the commit time
1261          */
1262         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1263         ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1264         return ret;
1265
1266 fallback:
1267         ret = jbd2_fc_end_commit_fallback(journal);
1268         ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1269         return ret;
1270 }
1271
1272 /*
1273  * Fast commit cleanup routine. This is called after every fast commit and
1274  * full commit. full is true if we are called after a full commit.
1275  */
1276 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1277 {
1278         struct super_block *sb = journal->j_private;
1279         struct ext4_sb_info *sbi = EXT4_SB(sb);
1280         struct ext4_inode_info *iter, *iter_n;
1281         struct ext4_fc_dentry_update *fc_dentry;
1282
1283         if (full && sbi->s_fc_bh)
1284                 sbi->s_fc_bh = NULL;
1285
1286         trace_ext4_fc_cleanup(journal, full, tid);
1287         jbd2_fc_release_bufs(journal);
1288
1289         spin_lock(&sbi->s_fc_lock);
1290         list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1291                                  i_fc_list) {
1292                 list_del_init(&iter->i_fc_list);
1293                 ext4_clear_inode_state(&iter->vfs_inode,
1294                                        EXT4_STATE_FC_COMMITTING);
1295                 if (iter->i_sync_tid <= tid)
1296                         ext4_fc_reset_inode(&iter->vfs_inode);
1297                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1298                 smp_mb();
1299 #if (BITS_PER_LONG < 64)
1300                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1301 #else
1302                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1303 #endif
1304         }
1305
1306         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1307                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1308                                              struct ext4_fc_dentry_update,
1309                                              fcd_list);
1310                 list_del_init(&fc_dentry->fcd_list);
1311                 list_del_init(&fc_dentry->fcd_dilist);
1312                 spin_unlock(&sbi->s_fc_lock);
1313
1314                 if (fc_dentry->fcd_name.name &&
1315                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1316                         kfree(fc_dentry->fcd_name.name);
1317                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1318                 spin_lock(&sbi->s_fc_lock);
1319         }
1320
1321         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1322                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1323         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1324                                 &sbi->s_fc_q[FC_Q_MAIN]);
1325
1326         if (tid >= sbi->s_fc_ineligible_tid) {
1327                 sbi->s_fc_ineligible_tid = 0;
1328                 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1329         }
1330
1331         if (full)
1332                 sbi->s_fc_bytes = 0;
1333         spin_unlock(&sbi->s_fc_lock);
1334         trace_ext4_fc_stats(sb);
1335 }
1336
1337 /* Ext4 Replay Path Routines */
1338
/* Decoded form of a dentry TLV, shared by the dentry replay routines */
struct dentry_info_args {
        int parent_ino;         /* inode number of the parent directory */
        int dname_len;          /* length of the name at dname, in bytes */
        int ino;                /* inode number the entry refers to */
        int inode_len;
        char *dname;            /* entry name */
};
1344
1345 static inline void tl_to_darg(struct dentry_info_args *darg,
1346                               struct  ext4_fc_tl *tl, u8 *val)
1347 {
1348         struct ext4_fc_dentry_info fcd;
1349
1350         memcpy(&fcd, val, sizeof(fcd));
1351
1352         darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1353         darg->ino = le32_to_cpu(fcd.fc_ino);
1354         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1355         darg->dname_len = le16_to_cpu(tl->fc_len) -
1356                 sizeof(struct ext4_fc_dentry_info);
1357 }
1358
1359 /* Unlink replay function */
1360 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1361                                  u8 *val)
1362 {
1363         struct inode *inode, *old_parent;
1364         struct qstr entry;
1365         struct dentry_info_args darg;
1366         int ret = 0;
1367
1368         tl_to_darg(&darg, tl, val);
1369
1370         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1371                         darg.parent_ino, darg.dname_len);
1372
1373         entry.name = darg.dname;
1374         entry.len = darg.dname_len;
1375         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1376
1377         if (IS_ERR(inode)) {
1378                 jbd_debug(1, "Inode %d not found", darg.ino);
1379                 return 0;
1380         }
1381
1382         old_parent = ext4_iget(sb, darg.parent_ino,
1383                                 EXT4_IGET_NORMAL);
1384         if (IS_ERR(old_parent)) {
1385                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1386                 iput(inode);
1387                 return 0;
1388         }
1389
1390         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1391         /* -ENOENT ok coz it might not exist anymore. */
1392         if (ret == -ENOENT)
1393                 ret = 0;
1394         iput(old_parent);
1395         iput(inode);
1396         return ret;
1397 }
1398
1399 static int ext4_fc_replay_link_internal(struct super_block *sb,
1400                                 struct dentry_info_args *darg,
1401                                 struct inode *inode)
1402 {
1403         struct inode *dir = NULL;
1404         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1405         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1406         int ret = 0;
1407
1408         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1409         if (IS_ERR(dir)) {
1410                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1411                 dir = NULL;
1412                 goto out;
1413         }
1414
1415         dentry_dir = d_obtain_alias(dir);
1416         if (IS_ERR(dentry_dir)) {
1417                 jbd_debug(1, "Failed to obtain dentry");
1418                 dentry_dir = NULL;
1419                 goto out;
1420         }
1421
1422         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1423         if (!dentry_inode) {
1424                 jbd_debug(1, "Inode dentry not created.");
1425                 ret = -ENOMEM;
1426                 goto out;
1427         }
1428
1429         ret = __ext4_link(dir, inode, dentry_inode);
1430         /*
1431          * It's possible that link already existed since data blocks
1432          * for the dir in question got persisted before we crashed OR
1433          * we replayed this tag and crashed before the entire replay
1434          * could complete.
1435          */
1436         if (ret && ret != -EEXIST) {
1437                 jbd_debug(1, "Failed to link\n");
1438                 goto out;
1439         }
1440
1441         ret = 0;
1442 out:
1443         if (dentry_dir) {
1444                 d_drop(dentry_dir);
1445                 dput(dentry_dir);
1446         } else if (dir) {
1447                 iput(dir);
1448         }
1449         if (dentry_inode) {
1450                 d_drop(dentry_inode);
1451                 dput(dentry_inode);
1452         }
1453
1454         return ret;
1455 }
1456
1457 /* Link replay function */
1458 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1459                                u8 *val)
1460 {
1461         struct inode *inode;
1462         struct dentry_info_args darg;
1463         int ret = 0;
1464
1465         tl_to_darg(&darg, tl, val);
1466         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1467                         darg.parent_ino, darg.dname_len);
1468
1469         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1470         if (IS_ERR(inode)) {
1471                 jbd_debug(1, "Inode not found.");
1472                 return 0;
1473         }
1474
1475         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1476         iput(inode);
1477         return ret;
1478 }
1479
1480 /*
1481  * Record all the modified inodes during replay. We use this later to setup
1482  * block bitmaps correctly.
1483  */
1484 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1485 {
1486         struct ext4_fc_replay_state *state;
1487         int i;
1488
1489         state = &EXT4_SB(sb)->s_fc_replay_state;
1490         for (i = 0; i < state->fc_modified_inodes_used; i++)
1491                 if (state->fc_modified_inodes[i] == ino)
1492                         return 0;
1493         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1494                 state->fc_modified_inodes = krealloc(
1495                                 state->fc_modified_inodes,
1496                                 sizeof(int) * (state->fc_modified_inodes_size +
1497                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1498                                 GFP_KERNEL);
1499                 if (!state->fc_modified_inodes)
1500                         return -ENOMEM;
1501                 state->fc_modified_inodes_size +=
1502                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1503         }
1504         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1505         return 0;
1506 }
1507
1508 /*
1509  * Inode replay function
1510  */
1511 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1512                                 u8 *val)
1513 {
1514         struct ext4_fc_inode fc_inode;
1515         struct ext4_inode *raw_inode;
1516         struct ext4_inode *raw_fc_inode;
1517         struct inode *inode = NULL;
1518         struct ext4_iloc iloc;
1519         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1520         struct ext4_extent_header *eh;
1521
1522         memcpy(&fc_inode, val, sizeof(fc_inode));
1523
1524         ino = le32_to_cpu(fc_inode.fc_ino);
1525         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1526
1527         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1528         if (!IS_ERR(inode)) {
1529                 ext4_ext_clear_bb(inode);
1530                 iput(inode);
1531         }
1532         inode = NULL;
1533
1534         ret = ext4_fc_record_modified_inode(sb, ino);
1535         if (ret)
1536                 goto out;
1537
1538         raw_fc_inode = (struct ext4_inode *)
1539                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1540         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1541         if (ret)
1542                 goto out;
1543
1544         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1545         raw_inode = ext4_raw_inode(&iloc);
1546
1547         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1548         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1549                 inode_len - offsetof(struct ext4_inode, i_generation));
1550         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1551                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1552                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1553                         memset(eh, 0, sizeof(*eh));
1554                         eh->eh_magic = EXT4_EXT_MAGIC;
1555                         eh->eh_max = cpu_to_le16(
1556                                 (sizeof(raw_inode->i_block) -
1557                                  sizeof(struct ext4_extent_header))
1558                                  / sizeof(struct ext4_extent));
1559                 }
1560         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1561                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1562                         sizeof(raw_inode->i_block));
1563         }
1564
1565         /* Immediately update the inode on disk. */
1566         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1567         if (ret)
1568                 goto out;
1569         ret = sync_dirty_buffer(iloc.bh);
1570         if (ret)
1571                 goto out;
1572         ret = ext4_mark_inode_used(sb, ino);
1573         if (ret)
1574                 goto out;
1575
1576         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1577         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1578         if (IS_ERR(inode)) {
1579                 jbd_debug(1, "Inode not found.");
1580                 return -EFSCORRUPTED;
1581         }
1582
1583         /*
1584          * Our allocator could have made different decisions than before
1585          * crashing. This should be fixed but until then, we calculate
1586          * the number of blocks the inode.
1587          */
1588         if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1589                 ext4_ext_replay_set_iblocks(inode);
1590
1591         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1592         ext4_reset_inode_seed(inode);
1593
1594         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1595         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1596         sync_dirty_buffer(iloc.bh);
1597         brelse(iloc.bh);
1598 out:
1599         iput(inode);
1600         if (!ret)
1601                 blkdev_issue_flush(sb->s_bdev);
1602
1603         return 0;
1604 }
1605
1606 /*
1607  * Dentry create replay function.
1608  *
1609  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1610  * inode for which we are trying to create a dentry here, should already have
1611  * been replayed before we start here.
1612  */
1613 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1614                                  u8 *val)
1615 {
1616         int ret = 0;
1617         struct inode *inode = NULL;
1618         struct inode *dir = NULL;
1619         struct dentry_info_args darg;
1620
1621         tl_to_darg(&darg, tl, val);
1622
1623         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1624                         darg.parent_ino, darg.dname_len);
1625
1626         /* This takes care of update group descriptor and other metadata */
1627         ret = ext4_mark_inode_used(sb, darg.ino);
1628         if (ret)
1629                 goto out;
1630
1631         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1632         if (IS_ERR(inode)) {
1633                 jbd_debug(1, "inode %d not found.", darg.ino);
1634                 inode = NULL;
1635                 ret = -EINVAL;
1636                 goto out;
1637         }
1638
1639         if (S_ISDIR(inode->i_mode)) {
1640                 /*
1641                  * If we are creating a directory, we need to make sure that the
1642                  * dot and dot dot dirents are setup properly.
1643                  */
1644                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1645                 if (IS_ERR(dir)) {
1646                         jbd_debug(1, "Dir %d not found.", darg.ino);
1647                         goto out;
1648                 }
1649                 ret = ext4_init_new_dir(NULL, dir, inode);
1650                 iput(dir);
1651                 if (ret) {
1652                         ret = 0;
1653                         goto out;
1654                 }
1655         }
1656         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1657         if (ret)
1658                 goto out;
1659         set_nlink(inode, 1);
1660         ext4_mark_inode_dirty(NULL, inode);
1661 out:
1662         if (inode)
1663                 iput(inode);
1664         return ret;
1665 }
1666
1667 /*
1668  * Record physical disk regions which are in use as per fast commit area,
1669  * and used by inodes during replay phase. Our simple replay phase
1670  * allocator excludes these regions from allocation.
1671  */
1672 int ext4_fc_record_regions(struct super_block *sb, int ino,
1673                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1674 {
1675         struct ext4_fc_replay_state *state;
1676         struct ext4_fc_alloc_region *region;
1677
1678         state = &EXT4_SB(sb)->s_fc_replay_state;
1679         /*
1680          * during replay phase, the fc_regions_valid may not same as
1681          * fc_regions_used, update it when do new additions.
1682          */
1683         if (replay && state->fc_regions_used != state->fc_regions_valid)
1684                 state->fc_regions_used = state->fc_regions_valid;
1685         if (state->fc_regions_used == state->fc_regions_size) {
1686                 state->fc_regions_size +=
1687                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1688                 state->fc_regions = krealloc(
1689                                         state->fc_regions,
1690                                         state->fc_regions_size *
1691                                         sizeof(struct ext4_fc_alloc_region),
1692                                         GFP_KERNEL);
1693                 if (!state->fc_regions)
1694                         return -ENOMEM;
1695         }
1696         region = &state->fc_regions[state->fc_regions_used++];
1697         region->ino = ino;
1698         region->lblk = lblk;
1699         region->pblk = pblk;
1700         region->len = len;
1701
1702         if (replay)
1703                 state->fc_regions_valid++;
1704
1705         return 0;
1706 }
1707
1708 /* Replay add range tag */
1709 static int ext4_fc_replay_add_range(struct super_block *sb,
1710                                     struct ext4_fc_tl *tl, u8 *val)
1711 {
1712         struct ext4_fc_add_range fc_add_ex;
1713         struct ext4_extent newex, *ex;
1714         struct inode *inode;
1715         ext4_lblk_t start, cur;
1716         int remaining, len;
1717         ext4_fsblk_t start_pblk;
1718         struct ext4_map_blocks map;
1719         struct ext4_ext_path *path = NULL;
1720         int ret;
1721
1722         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1723         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1724
1725         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1726                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1727                 ext4_ext_get_actual_len(ex));
1728
1729         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1730         if (IS_ERR(inode)) {
1731                 jbd_debug(1, "Inode not found.");
1732                 return 0;
1733         }
1734
1735         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1736         if (ret)
1737                 goto out;
1738
1739         start = le32_to_cpu(ex->ee_block);
1740         start_pblk = ext4_ext_pblock(ex);
1741         len = ext4_ext_get_actual_len(ex);
1742
1743         cur = start;
1744         remaining = len;
1745         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1746                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1747                   inode->i_ino);
1748
1749         while (remaining > 0) {
1750                 map.m_lblk = cur;
1751                 map.m_len = remaining;
1752                 map.m_pblk = 0;
1753                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1754
1755                 if (ret < 0)
1756                         goto out;
1757
1758                 if (ret == 0) {
1759                         /* Range is not mapped */
1760                         path = ext4_find_extent(inode, cur, NULL, 0);
1761                         if (IS_ERR(path))
1762                                 goto out;
1763                         memset(&newex, 0, sizeof(newex));
1764                         newex.ee_block = cpu_to_le32(cur);
1765                         ext4_ext_store_pblock(
1766                                 &newex, start_pblk + cur - start);
1767                         newex.ee_len = cpu_to_le16(map.m_len);
1768                         if (ext4_ext_is_unwritten(ex))
1769                                 ext4_ext_mark_unwritten(&newex);
1770                         down_write(&EXT4_I(inode)->i_data_sem);
1771                         ret = ext4_ext_insert_extent(
1772                                 NULL, inode, &path, &newex, 0);
1773                         up_write((&EXT4_I(inode)->i_data_sem));
1774                         ext4_ext_drop_refs(path);
1775                         kfree(path);
1776                         if (ret)
1777                                 goto out;
1778                         goto next;
1779                 }
1780
1781                 if (start_pblk + cur - start != map.m_pblk) {
1782                         /*
1783                          * Logical to physical mapping changed. This can happen
1784                          * if this range was removed and then reallocated to
1785                          * map to new physical blocks during a fast commit.
1786                          */
1787                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1788                                         ext4_ext_is_unwritten(ex),
1789                                         start_pblk + cur - start);
1790                         if (ret)
1791                                 goto out;
1792                         /*
1793                          * Mark the old blocks as free since they aren't used
1794                          * anymore. We maintain an array of all the modified
1795                          * inodes. In case these blocks are still used at either
1796                          * a different logical range in the same inode or in
1797                          * some different inode, we will mark them as allocated
1798                          * at the end of the FC replay using our array of
1799                          * modified inodes.
1800                          */
1801                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1802                         goto next;
1803                 }
1804
1805                 /* Range is mapped and needs a state change */
1806                 jbd_debug(1, "Converting from %ld to %d %lld",
1807                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1808                         ext4_ext_is_unwritten(ex), map.m_pblk);
1809                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1810                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1811                 if (ret)
1812                         goto out;
1813                 /*
1814                  * We may have split the extent tree while toggling the state.
1815                  * Try to shrink the extent tree now.
1816                  */
1817                 ext4_ext_replay_shrink_inode(inode, start + len);
1818 next:
1819                 cur += map.m_len;
1820                 remaining -= map.m_len;
1821         }
1822         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1823                                         sb->s_blocksize_bits);
1824 out:
1825         iput(inode);
1826         return 0;
1827 }
1828
1829 /* Replay DEL_RANGE tag */
1830 static int
1831 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1832                          u8 *val)
1833 {
1834         struct inode *inode;
1835         struct ext4_fc_del_range lrange;
1836         struct ext4_map_blocks map;
1837         ext4_lblk_t cur, remaining;
1838         int ret;
1839
1840         memcpy(&lrange, val, sizeof(lrange));
1841         cur = le32_to_cpu(lrange.fc_lblk);
1842         remaining = le32_to_cpu(lrange.fc_len);
1843
1844         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1845                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1846
1847         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1848         if (IS_ERR(inode)) {
1849                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1850                 return 0;
1851         }
1852
1853         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1854         if (ret)
1855                 goto out;
1856
1857         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1858                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1859                         le32_to_cpu(lrange.fc_len));
1860         while (remaining > 0) {
1861                 map.m_lblk = cur;
1862                 map.m_len = remaining;
1863
1864                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1865                 if (ret < 0)
1866                         goto out;
1867                 if (ret > 0) {
1868                         remaining -= ret;
1869                         cur += ret;
1870                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1871                 } else {
1872                         remaining -= map.m_len;
1873                         cur += map.m_len;
1874                 }
1875         }
1876
1877         down_write(&EXT4_I(inode)->i_data_sem);
1878         ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1879                                 le32_to_cpu(lrange.fc_lblk) +
1880                                 le32_to_cpu(lrange.fc_len) - 1);
1881         up_write(&EXT4_I(inode)->i_data_sem);
1882         if (ret)
1883                 goto out;
1884         ext4_ext_replay_shrink_inode(inode,
1885                 i_size_read(inode) >> sb->s_blocksize_bits);
1886         ext4_mark_inode_dirty(NULL, inode);
1887 out:
1888         iput(inode);
1889         return 0;
1890 }
1891
1892 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1893 {
1894         struct ext4_fc_replay_state *state;
1895         struct inode *inode;
1896         struct ext4_ext_path *path = NULL;
1897         struct ext4_map_blocks map;
1898         int i, ret, j;
1899         ext4_lblk_t cur, end;
1900
1901         state = &EXT4_SB(sb)->s_fc_replay_state;
1902         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1903                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1904                         EXT4_IGET_NORMAL);
1905                 if (IS_ERR(inode)) {
1906                         jbd_debug(1, "Inode %d not found.",
1907                                 state->fc_modified_inodes[i]);
1908                         continue;
1909                 }
1910                 cur = 0;
1911                 end = EXT_MAX_BLOCKS;
1912                 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1913                         iput(inode);
1914                         continue;
1915                 }
1916                 while (cur < end) {
1917                         map.m_lblk = cur;
1918                         map.m_len = end - cur;
1919
1920                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1921                         if (ret < 0)
1922                                 break;
1923
1924                         if (ret > 0) {
1925                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1926                                 if (!IS_ERR(path)) {
1927                                         for (j = 0; j < path->p_depth; j++)
1928                                                 ext4_mb_mark_bb(inode->i_sb,
1929                                                         path[j].p_block, 1, 1);
1930                                         ext4_ext_drop_refs(path);
1931                                         kfree(path);
1932                                 }
1933                                 cur += ret;
1934                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1935                                                         map.m_len, 1);
1936                         } else {
1937                                 cur = cur + (map.m_len ? map.m_len : 1);
1938                         }
1939                 }
1940                 iput(inode);
1941         }
1942 }
1943
1944 /*
1945  * Check if block is in excluded regions for block allocation. The simple
1946  * allocator that runs during replay phase is calls this function to see
1947  * if it is okay to use a block.
1948  */
1949 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1950 {
1951         int i;
1952         struct ext4_fc_replay_state *state;
1953
1954         state = &EXT4_SB(sb)->s_fc_replay_state;
1955         for (i = 0; i < state->fc_regions_valid; i++) {
1956                 if (state->fc_regions[i].ino == 0 ||
1957                         state->fc_regions[i].len == 0)
1958                         continue;
1959                 if (in_range(blk, state->fc_regions[i].pblk,
1960                                         state->fc_regions[i].len))
1961                         return true;
1962         }
1963         return false;
1964 }
1965
1966 /* Cleanup function called after replay */
1967 void ext4_fc_replay_cleanup(struct super_block *sb)
1968 {
1969         struct ext4_sb_info *sbi = EXT4_SB(sb);
1970
1971         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1972         kfree(sbi->s_fc_replay_state.fc_regions);
1973         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1974 }
1975
1976 /*
1977  * Recovery Scan phase handler
1978  *
1979  * This function is called during the scan phase and is responsible
1980  * for doing following things:
1981  * - Make sure the fast commit area has valid tags for replay
1982  * - Count number of tags that need to be replayed by the replay handler
1983  * - Verify CRC
1984  * - Create a list of excluded blocks for allocation during replay phase
1985  *
1986  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1987  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1988  * to indicate that scan has finished and JBD2 can now start replay phase.
1989  * It returns a negative error to indicate that there was an error. At the end
1990  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1991  * to indicate the number of tags that need to replayed during the replay phase.
1992  */
1993 static int ext4_fc_replay_scan(journal_t *journal,
1994                                 struct buffer_head *bh, int off,
1995                                 tid_t expected_tid)
1996 {
1997         struct super_block *sb = journal->j_private;
1998         struct ext4_sb_info *sbi = EXT4_SB(sb);
1999         struct ext4_fc_replay_state *state;
2000         int ret = JBD2_FC_REPLAY_CONTINUE;
2001         struct ext4_fc_add_range ext;
2002         struct ext4_fc_tl tl;
2003         struct ext4_fc_tail tail;
2004         __u8 *start, *end, *cur, *val;
2005         struct ext4_fc_head head;
2006         struct ext4_extent *ex;
2007
2008         state = &sbi->s_fc_replay_state;
2009
2010         start = (u8 *)bh->b_data;
2011         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2012
2013         if (state->fc_replay_expected_off == 0) {
2014                 state->fc_cur_tag = 0;
2015                 state->fc_replay_num_tags = 0;
2016                 state->fc_crc = 0;
2017                 state->fc_regions = NULL;
2018                 state->fc_regions_valid = state->fc_regions_used =
2019                         state->fc_regions_size = 0;
2020                 /* Check if we can stop early */
2021                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2022                         != EXT4_FC_TAG_HEAD)
2023                         return 0;
2024         }
2025
2026         if (off != state->fc_replay_expected_off) {
2027                 ret = -EFSCORRUPTED;
2028                 goto out_err;
2029         }
2030
2031         state->fc_replay_expected_off++;
2032         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2033                 memcpy(&tl, cur, sizeof(tl));
2034                 val = cur + sizeof(tl);
2035                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
2036                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
2037                 switch (le16_to_cpu(tl.fc_tag)) {
2038                 case EXT4_FC_TAG_ADD_RANGE:
2039                         memcpy(&ext, val, sizeof(ext));
2040                         ex = (struct ext4_extent *)&ext.fc_ex;
2041                         ret = ext4_fc_record_regions(sb,
2042                                 le32_to_cpu(ext.fc_ino),
2043                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2044                                 ext4_ext_get_actual_len(ex), 0);
2045                         if (ret < 0)
2046                                 break;
2047                         ret = JBD2_FC_REPLAY_CONTINUE;
2048                         fallthrough;
2049                 case EXT4_FC_TAG_DEL_RANGE:
2050                 case EXT4_FC_TAG_LINK:
2051                 case EXT4_FC_TAG_UNLINK:
2052                 case EXT4_FC_TAG_CREAT:
2053                 case EXT4_FC_TAG_INODE:
2054                 case EXT4_FC_TAG_PAD:
2055                         state->fc_cur_tag++;
2056                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2057                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
2058                         break;
2059                 case EXT4_FC_TAG_TAIL:
2060                         state->fc_cur_tag++;
2061                         memcpy(&tail, val, sizeof(tail));
2062                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2063                                                 sizeof(tl) +
2064                                                 offsetof(struct ext4_fc_tail,
2065                                                 fc_crc));
2066                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2067                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2068                                 state->fc_replay_num_tags = state->fc_cur_tag;
2069                                 state->fc_regions_valid =
2070                                         state->fc_regions_used;
2071                         } else {
2072                                 ret = state->fc_replay_num_tags ?
2073                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2074                         }
2075                         state->fc_crc = 0;
2076                         break;
2077                 case EXT4_FC_TAG_HEAD:
2078                         memcpy(&head, val, sizeof(head));
2079                         if (le32_to_cpu(head.fc_features) &
2080                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2081                                 ret = -EOPNOTSUPP;
2082                                 break;
2083                         }
2084                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2085                                 ret = JBD2_FC_REPLAY_STOP;
2086                                 break;
2087                         }
2088                         state->fc_cur_tag++;
2089                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2090                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2091                         break;
2092                 default:
2093                         ret = state->fc_replay_num_tags ?
2094                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2095                 }
2096                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2097                         break;
2098         }
2099
2100 out_err:
2101         trace_ext4_fc_replay_scan(sb, ret, off);
2102         return ret;
2103 }
2104
2105 /*
2106  * Main recovery path entry point.
2107  * The meaning of return codes is similar as above.
2108  */
2109 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2110                                 enum passtype pass, int off, tid_t expected_tid)
2111 {
2112         struct super_block *sb = journal->j_private;
2113         struct ext4_sb_info *sbi = EXT4_SB(sb);
2114         struct ext4_fc_tl tl;
2115         __u8 *start, *end, *cur, *val;
2116         int ret = JBD2_FC_REPLAY_CONTINUE;
2117         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2118         struct ext4_fc_tail tail;
2119
2120         if (pass == PASS_SCAN) {
2121                 state->fc_current_pass = PASS_SCAN;
2122                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2123         }
2124
2125         if (state->fc_current_pass != pass) {
2126                 state->fc_current_pass = pass;
2127                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2128         }
2129         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2130                 jbd_debug(1, "Replay stops\n");
2131                 ext4_fc_set_bitmaps_and_counters(sb);
2132                 return 0;
2133         }
2134
2135 #ifdef CONFIG_EXT4_DEBUG
2136         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2137                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2138                 return JBD2_FC_REPLAY_STOP;
2139         }
2140 #endif
2141
2142         start = (u8 *)bh->b_data;
2143         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2144
2145         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2146                 memcpy(&tl, cur, sizeof(tl));
2147                 val = cur + sizeof(tl);
2148
2149                 if (state->fc_replay_num_tags == 0) {
2150                         ret = JBD2_FC_REPLAY_STOP;
2151                         ext4_fc_set_bitmaps_and_counters(sb);
2152                         break;
2153                 }
2154                 jbd_debug(3, "Replay phase, tag:%s\n",
2155                                 tag2str(le16_to_cpu(tl.fc_tag)));
2156                 state->fc_replay_num_tags--;
2157                 switch (le16_to_cpu(tl.fc_tag)) {
2158                 case EXT4_FC_TAG_LINK:
2159                         ret = ext4_fc_replay_link(sb, &tl, val);
2160                         break;
2161                 case EXT4_FC_TAG_UNLINK:
2162                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2163                         break;
2164                 case EXT4_FC_TAG_ADD_RANGE:
2165                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2166                         break;
2167                 case EXT4_FC_TAG_CREAT:
2168                         ret = ext4_fc_replay_create(sb, &tl, val);
2169                         break;
2170                 case EXT4_FC_TAG_DEL_RANGE:
2171                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2172                         break;
2173                 case EXT4_FC_TAG_INODE:
2174                         ret = ext4_fc_replay_inode(sb, &tl, val);
2175                         break;
2176                 case EXT4_FC_TAG_PAD:
2177                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2178                                              le16_to_cpu(tl.fc_len), 0);
2179                         break;
2180                 case EXT4_FC_TAG_TAIL:
2181                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2182                                              le16_to_cpu(tl.fc_len), 0);
2183                         memcpy(&tail, val, sizeof(tail));
2184                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2185                         break;
2186                 case EXT4_FC_TAG_HEAD:
2187                         break;
2188                 default:
2189                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2190                                              le16_to_cpu(tl.fc_len), 0);
2191                         ret = -ECANCELED;
2192                         break;
2193                 }
2194                 if (ret < 0)
2195                         break;
2196                 ret = JBD2_FC_REPLAY_CONTINUE;
2197         }
2198         return ret;
2199 }
2200
2201 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2202 {
2203         /*
2204          * We set replay callback even if fast commit disabled because we may
2205          * could still have fast commit blocks that need to be replayed even if
2206          * fast commit has now been turned off.
2207          */
2208         journal->j_fc_replay_callback = ext4_fc_replay;
2209         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2210                 return;
2211         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2212 }
2213
/*
 * Human readable names for the reasons a fast commit was ineligible.
 * ext4_fc_info_show() prints entry i next to fc_ineligible_reason_count[i]
 * for i < EXT4_FC_REASON_MAX, so the order here must follow the reason
 * numbering (presumably the EXT4_FC_REASON_* enum -- confirm when adding
 * entries). Both the strings and the pointer table are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2226
2227 int ext4_fc_info_show(struct seq_file *seq, void *v)
2228 {
2229         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2230         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2231         int i;
2232
2233         if (v != SEQ_START_TOKEN)
2234                 return 0;
2235
2236         seq_printf(seq,
2237                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2238                    stats->fc_num_commits, stats->fc_ineligible_commits,
2239                    stats->fc_numblks,
2240                    div_u64(stats->s_fc_avg_commit_time, 1000));
2241         seq_puts(seq, "Ineligible reasons:\n");
2242         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2243                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2244                         stats->fc_ineligible_reason_count[i]);
2245
2246         return 0;
2247 }
2248
2249 int __init ext4_fc_init_dentry_cache(void)
2250 {
2251         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2252                                            SLAB_RECLAIM_ACCOUNT);
2253
2254         if (ext4_fc_dentry_cachep == NULL)
2255                 return -ENOMEM;
2256
2257         return 0;
2258 }
2259
2260 void ext4_fc_destroy_dentry_cache(void)
2261 {
2262         kmem_cache_destroy(ext4_fc_dentry_cachep);
2263 }