fs/ext4/fast_commit.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 /*
   4  * fs/ext4/fast_commit.c
   5  *
   6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7  *
   8  * Ext4 fast commits routines.
   9  */
  10 #include "ext4.h"
  11 #include "ext4_jbd2.h"
  12 #include "ext4_extents.h"
  13 #include "mballoc.h"
  14
  15 /*
  16  * Ext4 Fast Commits
  17  * -----------------
  18  *
  19  * Ext4 fast commits implement fine grained journalling for Ext4.
  20  *
  21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23  * TLV during the recovery phase. For the scenarios for which we currently
  24  * don't have replay code, fast commit falls back to full commits.
  25  * Fast commits record delta in one of the following three categories.
  26  *
  27  * (A) Directory entry updates:
  28  *
  29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
  30  * - EXT4_FC_TAG_LINK           - records directory entry link
  31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
  32  *
  33  * (B) File specific data range updates:
  34  *
  35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
  36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
  37  *
  38  * (C) Inode metadata (mtime / ctime etc):
  39  *
  40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
  41  *                                during recovery. Note that iblocks field is
  42  *                                not replayed and instead derived during
  43  *                                replay.
  44  * Commit Operation
  45  * ----------------
  46  * With fast commits, we maintain all the directory entry operations in the
  47  * order in which they are issued in an in-memory queue. This queue is flushed
  48  * to disk during the commit operation. We also maintain a list of inodes
  49  * that need to be committed during a fast commit in another in memory queue of
  50  * inodes. During the commit operation, we commit in the following order:
  51  *
  52  * [1] Lock inodes for any further data updates by setting COMMITTING state
  53  * [2] Submit data buffers of all the inodes
  54  * [3] Wait for [2] to complete
  55  * [4] Commit all the directory entry updates in the fast commit space
  56  * [5] Commit all the changed inode structures
  57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58  *     section for more details).
  59  * [7] Wait for [4], [5] and [6] to complete.
  60  *
  61  * All the inode updates must call ext4_fc_start_update() before starting an
  62  * update. If such an ongoing update is present, fast commit waits for it to
  63  * complete. The completion of such an update is marked by
  64  * ext4_fc_stop_update().
  65  *
  66  * Fast Commit Ineligibility
  67  * -------------------------
  68  * Not all operations are supported by fast commits today (e.g. extended
  69  * attributes). Fast commit ineligibility is marked by calling one of the
  70  * two following functions:
  71  *
  72  * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73  *   back to full commit. This is useful in case of transient errors.
  74  *
  75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76  *   the fast commits happening between ext4_fc_start_ineligible() and
  77  *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78  *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79  *   make one more fast commit to fall back to full commit after stop call so
  80  *   that it is guaranteed that the fast commit ineligible operation contained
  81  *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82  *   followed by at least 1 full commit.
  83  *
  84  * Atomicity of commits
  85  * --------------------
  86  * In order to guarantee atomicity during the commit operation, fast commit
  87  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88  * tag contains CRC of the contents and TID of the transaction after which
  89  * this fast commit should be applied. Recovery code replays fast commit
  90  * logs only if there's at least 1 valid tail present. For every fast commit
  91  * operation, there is 1 tail. This means, we may end up with multiple tails
  92  * in the fast commit space. Here's an example:
  93  *
  94  * - Create a new file A and remove existing file B
  95  * - fsync()
  96  * - Append contents to file A
  97  * - Truncate file A
  98  * - fsync()
  99  *
 100  * The fast commit space at the end of above operations would look like this:
 101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103  *
 104  * Replay code should thus check for all the valid tails in the FC area.
 105  *
 106  * Fast Commit Replay Idempotence
 107  * ------------------------------
 108  *
 109  * Fast commits tags are idempotent in nature provided the recovery code follows
 110  * certain rules. The guiding principle that the commit path follows while
 111  * committing is that it stores the result of a particular operation instead of
 112  * storing the procedure.
 113  *
 114  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115  * was associated with inode 10. During fast commit, instead of storing this
 116  * operation as a procedure "rename a to b", we store the resulting file system
 117  * state as a "series" of outcomes:
 118  *
 119  * - Link dirent b to inode 10
 120  * - Unlink dirent a
 121  * - Inode <10> with valid refcount
 122  *
 123  * Now when recovery code runs, it needs to "enforce" this state on the file
 124  * system. This is what guarantees idempotence of fast commit replay.
 125  *
 126  * Let's take an example of a procedure that is not idempotent and see how fast
 127  * commits make it idempotent. Consider following sequence of operations:
 128  *
 129  *     rm A;    mv B A;    read A
 130  *  (x)     (y)        (z)
 131  *
 132  * (x), (y) and (z) are the points at which we can crash. If we store this
 133  * sequence of operations as is then the replay is not idempotent. Let's say
 134  * while in replay, we crash at (z). During the second replay, file A (which was
 135  * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136  * file named A would be absent when we try to read A. So, this sequence of
 137  * operations is not idempotent. However, as mentioned above, instead of storing
 138  * the procedure fast commits store the outcome of each procedure. Thus the fast
 139  * commit log for above procedure would be as follows:
 140  *
 141  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142  * inode 11 before the replay)
 143  *
 144  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145  * (w)          (x)                    (y)          (z)
 146  *
 147  * If we crash at (z), we will have file A linked to inode 11. During the second
 148  * replay, we will remove file A (inode 11). But we will create it back and make
 149  * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152  * similarly. Thus, by converting a non-idempotent procedure into a series of
 153  * idempotent outcomes, fast commits ensured idempotence during the replay.
 154  *
 155  * TODOs
 156  * -----
 157  *
 158  * 0) Fast commit replay path hardening: Fast commit replay code should use
 159  *    journal handles to make sure all the updates it does during the replay
 160  *    path are atomic. With that if we crash during fast commit replay, after
 161  *    trying to do recovery again, we will find a file system where fast commit
 162  *    area is invalid (because new full commit would be found). In order to deal
 163  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164  *    superblock state is persisted before starting the replay, so that after
 165  *    the crash, fast commit recovery code can look at that flag and perform
 166  *    fast commit recovery even if that area is invalidated by later full
 167  *    commits.
 168  *
 169  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170  *    eligible update must be protected within ext4_fc_start_update() and
 171  *    ext4_fc_stop_update(). These routines are called at much higher
 172  *    routines. This can be made more fine grained by combining with
 173  *    ext4_journal_start().
 174  *
 175  * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176  *
 177  * 3) Handle more ineligible cases.
 178  */
 179
 180 #include <trace/events/ext4.h>
 181 static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184 {
 185         BUFFER_TRACE(bh, "");
 186         if (uptodate) {
 187                 ext4_debug("%s: Block %lld up-to-date",
 188                            __func__, bh->b_blocknr);
 189                 set_buffer_uptodate(bh);
 190         } else {
 191                 ext4_debug("%s: Block %lld not up-to-date",
 192                            __func__, bh->b_blocknr);
 193                 clear_buffer_uptodate(bh);
 194         }
 195
 196         unlock_buffer(bh);
 197 }
 198
 199 static inline void ext4_fc_reset_inode(struct inode *inode)
 200 {
 201         struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203         ei->i_fc_lblk_start = 0;
 204         ei->i_fc_lblk_len = 0;
 205 }
 206
 207 void ext4_fc_init_inode(struct inode *inode)
 208 {
 209         struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211         ext4_fc_reset_inode(inode);
 212         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213         INIT_LIST_HEAD(&ei->i_fc_list);
 214         init_waitqueue_head(&ei->i_fc_wait);
 215         atomic_set(&ei->i_fc_updates, 0);
 216 }
 217
 218 /* This function must be called with sbi->s_fc_lock held. */
 219 static void ext4_fc_wait_committing_inode(struct inode *inode)
 220 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221 {
 222         wait_queue_head_t *wq;
 223         struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225 #if (BITS_PER_LONG < 64)
 226         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227                         EXT4_STATE_FC_COMMITTING);
 228         wq = bit_waitqueue(&ei->i_state_flags,
 229                                 EXT4_STATE_FC_COMMITTING);
 230 #else
 231         DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232                         EXT4_STATE_FC_COMMITTING);
 233         wq = bit_waitqueue(&ei->i_flags,
 234                                 EXT4_STATE_FC_COMMITTING);
 235 #endif
 236         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239         schedule();
 240         finish_wait(wq, &wait.wq_entry);
 241 }
 242
 243 /*
 244  * Inform Ext4's fast about start of an inode update
 245  *
 246  * This function is called by the high level call VFS callbacks before
 247  * performing any inode update. This function blocks if there's an ongoing
 248  * fast commit on the inode in question.
 249  */
 250 void ext4_fc_start_update(struct inode *inode)
 251 {
 252         struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256                 return;
 257
 258 restart:
 259         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260         if (list_empty(&ei->i_fc_list))
 261                 goto out;
 262
 263         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264                 ext4_fc_wait_committing_inode(inode);
 265                 goto restart;
 266         }
 267 out:
 268         atomic_inc(&ei->i_fc_updates);
 269         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270 }
 271
 272 /*
 273  * Stop inode update and wake up waiting fast commits if any.
 274  */
 275 void ext4_fc_stop_update(struct inode *inode)
 276 {
 277         struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281                 return;
 282
 283         if (atomic_dec_and_test(&ei->i_fc_updates))
 284                 wake_up_all(&ei->i_fc_wait);
 285 }
 286
 287 /*
 288  * Remove inode from fast commit list. If the inode is being committed
 289  * we wait until inode commit is done.
 290  */
 291 void ext4_fc_del(struct inode *inode)
 292 {
 293         struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297                 return;
 298
 299 restart:
 300         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301         if (list_empty(&ei->i_fc_list)) {
 302                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303                 return;
 304         }
 305
 306         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307                 ext4_fc_wait_committing_inode(inode);
 308                 goto restart;
 309         }
 310         list_del_init(&ei->i_fc_list);
 311         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312 }
 313
 314 /*
 315  * Mark file system as fast commit ineligible. This means that next commit
 316  * operation would result in a full jbd2 commit.
 317  */
 318 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319 {
 320         struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324                 return;
 325
 326         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329 }
 330
 331 /*
 332  * Start a fast commit ineligible update. Any commits that happen while
 333  * such an operation is in progress fall back to full commits.
 334  */
 335 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336 {
 337         struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341                 return;
 342
 343         WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345         atomic_inc(&sbi->s_fc_ineligible_updates);
 346 }
 347
 348 /*
 349  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350  * to ensure that after stopping the ineligible update, at least one full
 351  * commit takes place.
 352  */
 353 void ext4_fc_stop_ineligible(struct super_block *sb)
 354 {
 355         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357                 return;
 358
 359         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361 }
 362
 363 static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364 {
 365         return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367 }
 368
 369 /*
 370  * Generic fast commit tracking function. If this is the first time this we are
 371  * called after a full commit, we initialize fast commit fields and then call
 372  * __fc_track_fn() with update = 0. If we have already been called after a full
 373  * commit, we pass update = 1. Based on that, the track function can determine
 374  * if it needs to track a field for the first time or if it needs to just
 375  * update the previously tracked value.
 376  *
 377  * If enqueue is set, this function enqueues the inode in fast commit list.
 378  */
 379 static int ext4_fc_track_template(
 380         handle_t *handle, struct inode *inode,
 381         int (*__fc_track_fn)(struct inode *, void *, bool),
 382         void *args, int enqueue)
 383 {
 384         bool update = false;
 385         struct ext4_inode_info *ei = EXT4_I(inode);
 386         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387         tid_t tid = 0;
 388         int ret;
 389
 390         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391             (sbi->s_mount_state & EXT4_FC_REPLAY))
 392                 return -EOPNOTSUPP;
 393
 394         if (ext4_fc_is_ineligible(inode->i_sb))
 395                 return -EINVAL;
 396
 397         tid = handle->h_transaction->t_tid;
 398         mutex_lock(&ei->i_fc_lock);
 399         if (tid == ei->i_sync_tid) {
 400                 update = true;
 401         } else {
 402                 ext4_fc_reset_inode(inode);
 403                 ei->i_sync_tid = tid;
 404         }
 405         ret = __fc_track_fn(inode, args, update);
 406         mutex_unlock(&ei->i_fc_lock);
 407
 408         if (!enqueue)
 409                 return ret;
 410
 411         spin_lock(&sbi->s_fc_lock);
 412         if (list_empty(&EXT4_I(inode)->i_fc_list))
 413                 list_add_tail(&EXT4_I(inode)->i_fc_list,
 414                                 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415                                 &sbi->s_fc_q[FC_Q_STAGING] :
 416                                 &sbi->s_fc_q[FC_Q_MAIN]);
 417         spin_unlock(&sbi->s_fc_lock);
 418
 419         return ret;
 420 }
 421
 422 struct __track_dentry_update_args {
 423         struct dentry *dentry;
 424         int op;
 425 };
 426
 427 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429 {
 430         struct ext4_fc_dentry_update *node;
 431         struct ext4_inode_info *ei = EXT4_I(inode);
 432         struct __track_dentry_update_args *dentry_update =
 433                 (struct __track_dentry_update_args *)arg;
 434         struct dentry *dentry = dentry_update->dentry;
 435         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437         mutex_unlock(&ei->i_fc_lock);
 438         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439         if (!node) {
 440                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441                 mutex_lock(&ei->i_fc_lock);
 442                 return -ENOMEM;
 443         }
 444
 445         node->fcd_op = dentry_update->op;
 446         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447         node->fcd_ino = inode->i_ino;
 448         if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450                 if (!node->fcd_name.name) {
 451                         kmem_cache_free(ext4_fc_dentry_cachep, node);
 452                         ext4_fc_mark_ineligible(inode->i_sb,
 453                                 EXT4_FC_REASON_NOMEM);
 454                         mutex_lock(&ei->i_fc_lock);
 455                         return -ENOMEM;
 456                 }
 457                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458                         dentry->d_name.len);
 459         } else {
 460                 memcpy(node->fcd_iname, dentry->d_name.name,
 461                         dentry->d_name.len);
 462                 node->fcd_name.name = node->fcd_iname;
 463         }
 464         node->fcd_name.len = dentry->d_name.len;
 465
 466         spin_lock(&sbi->s_fc_lock);
 467         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468                 list_add_tail(&node->fcd_list,
 469                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470         else
 471                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472         spin_unlock(&sbi->s_fc_lock);
 473         mutex_lock(&ei->i_fc_lock);
 474
 475         return 0;
 476 }
 477
 478 void __ext4_fc_track_unlink(handle_t *handle,
 479                 struct inode *inode, struct dentry *dentry)
 480 {
 481         struct __track_dentry_update_args args;
 482         int ret;
 483
 484         args.dentry = dentry;
 485         args.op = EXT4_FC_TAG_UNLINK;
 486
 487         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488                                         (void *)&args, 0);
 489         trace_ext4_fc_track_unlink(inode, dentry, ret);
 490 }
 491
 492 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493 {
 494         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495 }
 496
 497 void __ext4_fc_track_link(handle_t *handle,
 498         struct inode *inode, struct dentry *dentry)
 499 {
 500         struct __track_dentry_update_args args;
 501         int ret;
 502
 503         args.dentry = dentry;
 504         args.op = EXT4_FC_TAG_LINK;
 505
 506         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507                                         (void *)&args, 0);
 508         trace_ext4_fc_track_link(inode, dentry, ret);
 509 }
 510
 511 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512 {
 513         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514 }
 515
 516 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517                           struct dentry *dentry)
 518 {
 519         struct __track_dentry_update_args args;
 520         int ret;
 521
 522         args.dentry = dentry;
 523         args.op = EXT4_FC_TAG_CREAT;
 524
 525         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526                                         (void *)&args, 0);
 527         trace_ext4_fc_track_create(inode, dentry, ret);
 528 }
 529
 530 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531 {
 532         __ext4_fc_track_create(handle, d_inode(dentry), dentry);
 533 }
 534
 535 /* __track_fn for inode tracking */
 536 static int __track_inode(struct inode *inode, void *arg, bool update)
 537 {
 538         if (update)
 539                 return -EEXIST;
 540
 541         EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543         return 0;
 544 }
 545
 546 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547 {
 548         int ret;
 549
 550         if (S_ISDIR(inode->i_mode))
 551                 return;
 552
 553         if (ext4_should_journal_data(inode)) {
 554                 ext4_fc_mark_ineligible(inode->i_sb,
 555                                         EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556                 return;
 557         }
 558
 559         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560         trace_ext4_fc_track_inode(inode, ret);
 561 }
 562
 563 struct __track_range_args {
 564         ext4_lblk_t start, end;
 565 };
 566
 567 /* __track_fn for tracking data updates */
 568 static int __track_range(struct inode *inode, void *arg, bool update)
 569 {
 570         struct ext4_inode_info *ei = EXT4_I(inode);
 571         ext4_lblk_t oldstart;
 572         struct __track_range_args *__arg =
 573                 (struct __track_range_args *)arg;
 574
 575         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577                 return -ECANCELED;
 578         }
 579
 580         oldstart = ei->i_fc_lblk_start;
 581
 582         if (update && ei->i_fc_lblk_len > 0) {
 583                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584                 ei->i_fc_lblk_len =
 585                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586                                 ei->i_fc_lblk_start + 1;
 587         } else {
 588                 ei->i_fc_lblk_start = __arg->start;
 589                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590         }
 591
 592         return 0;
 593 }
 594
 595 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596                          ext4_lblk_t end)
 597 {
 598         struct __track_range_args args;
 599         int ret;
 600
 601         if (S_ISDIR(inode->i_mode))
 602                 return;
 603
 604         args.start = start;
 605         args.end = end;
 606
 607         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609         trace_ext4_fc_track_range(inode, start, end, ret);
 610 }
 611
 612 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613 {
 614         int write_flags = REQ_SYNC;
 615         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
 617         /* Add REQ_FUA | REQ_PREFLUSH only its tail */
 618         if (test_opt(sb, BARRIER) && is_tail)
 619                 write_flags |= REQ_FUA | REQ_PREFLUSH;
 620         lock_buffer(bh);
 621         set_buffer_dirty(bh);
 622         set_buffer_uptodate(bh);
 623         bh->b_end_io = ext4_end_buffer_io_sync;
 624         submit_bh(REQ_OP_WRITE, write_flags, bh);
 625         EXT4_SB(sb)->s_fc_bh = NULL;
 626 }
 627
 628 /* Ext4 commit path routines */
 629
 630 /* memzero and update CRC */
 631 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632                                 u32 *crc)
 633 {
 634         void *ret;
 635
 636         ret = memset(dst, 0, len);
 637         if (crc)
 638                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639         return ret;
 640 }
 641
 642 /*
 643  * Allocate len bytes on a fast commit buffer.
 644  *
 645  * During the commit time this function is used to manage fast commit
 646  * block space. We don't split a fast commit log onto different
 647  * blocks. So this function makes sure that if there's not enough space
 648  * on the current block, the remaining space in the current block is
 649  * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 650  * new block is from jbd2 and CRC is updated to reflect the padding
 651  * we added.
 652  */
 653 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654 {
 655         struct ext4_fc_tl *tl;
 656         struct ext4_sb_info *sbi = EXT4_SB(sb);
 657         struct buffer_head *bh;
 658         int bsize = sbi->s_journal->j_blocksize;
 659         int ret, off = sbi->s_fc_bytes % bsize;
 660         int pad_len;
 661
 662         /*
 663          * After allocating len, we should have space at least for a 0 byte
 664          * padding.
 665          */
 666         if (len + sizeof(struct ext4_fc_tl) > bsize)
 667                 return NULL;
 668
 669         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670                 /*
 671                  * Only allocate from current buffer if we have enough space for
 672                  * this request AND we have space to add a zero byte padding.
 673                  */
 674                 if (!sbi->s_fc_bh) {
 675                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676                         if (ret)
 677                                 return NULL;
 678                         sbi->s_fc_bh = bh;
 679                 }
 680                 sbi->s_fc_bytes += len;
 681                 return sbi->s_fc_bh->b_data + off;
 682         }
 683         /* Need to add PAD tag */
 684         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687         tl->fc_len = cpu_to_le16(pad_len);
 688         if (crc)
 689                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690         if (pad_len > 0)
 691                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 692         ext4_fc_submit_bh(sb, false);
 693
 694         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695         if (ret)
 696                 return NULL;
 697         sbi->s_fc_bh = bh;
 698         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699         return sbi->s_fc_bh->b_data;
 700 }
 701
 702 /* memcpy to fc reserved space and update CRC */
 703 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704                                 int len, u32 *crc)
 705 {
 706         if (crc)
 707                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708         return memcpy(dst, src, len);
 709 }
 710
 711 /*
 712  * Complete a fast commit by writing tail tag.
 713  *
 714  * Writing tail tag marks the end of a fast commit. In order to guarantee
 715  * atomicity, after writing tail tag, even if there's space remaining
 716  * in the block, next commit shouldn't use it. That's why tail tag
 717  * has the length as that of the remaining space on the block.
 718  */
 719 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 720 {
 721         struct ext4_sb_info *sbi = EXT4_SB(sb);
 722         struct ext4_fc_tl tl;
 723         struct ext4_fc_tail tail;
 724         int off, bsize = sbi->s_journal->j_blocksize;
 725         u8 *dst;
 726
 727         /*
 728          * ext4_fc_reserve_space takes care of allocating an extra block if
 729          * there's no enough space on this block for accommodating this tail.
 730          */
 731         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 732         if (!dst)
 733                 return -ENOSPC;
 734
 735         off = sbi->s_fc_bytes % bsize;
 736
 737         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 738         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 739         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 740
 741         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 742         dst += sizeof(tl);
 743         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 744         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 745         dst += sizeof(tail.fc_tid);
 746         tail.fc_crc = cpu_to_le32(crc);
 747         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 748
 749         ext4_fc_submit_bh(sb, true);
 750
 751         return 0;
 752 }
 753
 754 /*
 755  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756  * Returns false if there's not enough space.
 757  */
 758 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759                            u32 *crc)
 760 {
 761         struct ext4_fc_tl tl;
 762         u8 *dst;
 763
 764         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765         if (!dst)
 766                 return false;
 767
 768         tl.fc_tag = cpu_to_le16(tag);
 769         tl.fc_len = cpu_to_le16(len);
 770
 771         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774         return true;
 775 }
 776
 777 /*
 778  * Emit a dentry TLV for @fc_dentry: header, parent/child inode numbers, then
 779  * the name bytes. Returns false if space could not be reserved, else true.
 780  */
 781 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 782                                    struct ext4_fc_dentry_update *fc_dentry)
 783 {
 784         int name_len = fc_dentry->fcd_name.len;
 785         struct ext4_fc_dentry_info info;
 786         size_t body_len = sizeof(info) + name_len;
 787         struct ext4_fc_tl hdr;
 788         u8 *out = ext4_fc_reserve_space(sb, sizeof(hdr) + body_len, crc);
 789
 790         if (!out)
 791                 return false;
 792
 793         hdr.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
 794         hdr.fc_len = cpu_to_le16(body_len);
 795         info.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
 796         info.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
 797         ext4_fc_memcpy(sb, out, &hdr, sizeof(hdr), crc);
 798         ext4_fc_memcpy(sb, out + sizeof(hdr), &info, sizeof(info), crc);
 799         ext4_fc_memcpy(sb, out + sizeof(hdr) + sizeof(info),
 800                        fc_dentry->fcd_name.name, name_len, crc);
 801         return true;
 802 }
 803
 804 /*
 805  * Writes @inode's raw on-disk image as an EXT4_FC_TAG_INODE TLV. Returns 0,
 806  * the ext4_get_inode_loc() error, or -ECANCELED if reserving/copying fails.
 807  */
 808 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 809 {
 810         struct ext4_inode_info *ei = EXT4_I(inode);
 811         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 812         int ret;
 813         struct ext4_iloc iloc;
 814         struct ext4_fc_inode fc_inode;
 815         struct ext4_fc_tl tl;
 816         u8 *dst;
 817
 818         ret = ext4_get_inode_loc(inode, &iloc);
 819         if (ret)
 820                 return ret;
 821
 822         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 823                 inode_len += ei->i_extra_isize;
 824         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 825         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 826         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 827
 828         ret = -ECANCELED;       /* reported for any failure below */
 829         dst = ext4_fc_reserve_space(inode->i_sb,
 830                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 831         if (!dst)
 832                 goto out;
 833         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 834                 goto out;
 835         dst += sizeof(tl);
 836         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 837                 goto out;
 838         dst += sizeof(fc_inode);
 839         if (ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 840                            inode_len, crc))
 841                 ret = 0;
 842 out:
 843         brelse(iloc.bh);        /* release the inode buffer on every path */
 844         return ret;
 845 }
 846
 847 /*
 848  * Writes ADD_RANGE/DEL_RANGE TLVs for the inode's tracked fast commit block
 849  * range and clears that range. Returns 0 on success, error otherwise.
 850  */
 851 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 852 {
 853         struct ext4_inode_info *ei = EXT4_I(inode);
 854         ext4_lblk_t first_lblk, last_lblk, lblk;
 855         struct ext4_fc_add_range add;
 856         struct ext4_fc_del_range del;
 857         struct ext4_map_blocks map;
 858         struct ext4_extent *ex;
 859         unsigned int limit;
 860         int ret;
 861
 862         mutex_lock(&ei->i_fc_lock);
 863         if (ei->i_fc_lblk_len == 0) {
 864                 mutex_unlock(&ei->i_fc_lock);
 865                 return 0;
 866         }
 867         first_lblk = ei->i_fc_lblk_start;
 868         last_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 869         ei->i_fc_lblk_len = 0;
 870         mutex_unlock(&ei->i_fc_lock);
 871
 872         lblk = first_lblk;
 873         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 874                   __func__, lblk, last_lblk, inode->i_ino);
 875
 876         for (; lblk <= last_lblk; lblk += map.m_len) {
 877                 map.m_lblk = lblk;
 878                 map.m_len = last_lblk - lblk + 1;
 879                 ret = ext4_map_blocks(NULL, inode, &map, 0);
 880                 if (ret < 0)
 881                         return -ECANCELED;
 882
 883                 if (map.m_len == 0) {
 884                         /* Step a single block; the loop then adds m_len == 0. */
 885                         lblk++;
 886                         continue;
 887                 }
 888
 889                 if (ret == 0) {
 890                         /* Nothing is mapped here: log the range as deleted. */
 891                         del.fc_ino = cpu_to_le32(inode->i_ino);
 892                         del.fc_lblk = cpu_to_le32(map.m_lblk);
 893                         del.fc_len = cpu_to_le32(map.m_len);
 894                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 895                                             sizeof(del), (u8 *)&del, crc))
 896                                 return -ENOSPC;
 897                         continue;
 898                 }
 899
 900                 /* Limit the number of blocks in one extent */
 901                 limit = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 902                         EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 903                 map.m_len = min(limit, map.m_len);
 904                 add.fc_ino = cpu_to_le32(inode->i_ino);
 905                 ex = (struct ext4_extent *)&add.fc_ex;
 906                 ex->ee_block = cpu_to_le32(map.m_lblk);
 907                 ex->ee_len = cpu_to_le16(map.m_len);
 908                 ext4_ext_store_pblock(ex, map.m_pblk);
 909                 if (map.m_flags & EXT4_MAP_UNWRITTEN)
 910                         ext4_ext_mark_unwritten(ex);
 911                 else
 912                         ext4_ext_mark_initialized(ex);
 913                 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 914                                     sizeof(add), (u8 *)&add, crc))
 915                         return -ENOSPC;
 916         }
 917
 918         return 0;
 919 }
 920
 921
 922 /* Flag each FC_Q_MAIN inode as committing, drain its updates, submit its data */
 923 static int ext4_fc_submit_inode_data_all(journal_t *journal)
 924 {
 925         struct super_block *sb = (struct super_block *)(journal->j_private);
 926         struct ext4_sb_info *sbi = EXT4_SB(sb);
 927         struct ext4_inode_info *ei;
 928         int ret = 0;
 929
 930         spin_lock(&sbi->s_fc_lock);
 931         ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 932         list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 933                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 934                 while (atomic_read(&ei->i_fc_updates)) {
 935                         DEFINE_WAIT(wait);
 936                         /* Sleep with s_fc_lock dropped while i_fc_updates != 0 */
 937                         prepare_to_wait(&ei->i_fc_wait, &wait,
 938                                                 TASK_UNINTERRUPTIBLE);
 939                         if (atomic_read(&ei->i_fc_updates)) {
 940                                 spin_unlock(&sbi->s_fc_lock);
 941                                 schedule();
 942                                 spin_lock(&sbi->s_fc_lock);
 943                         }
 944                         finish_wait(&ei->i_fc_wait, &wait);
 945                 }
 946                 spin_unlock(&sbi->s_fc_lock);   /* submit without the lock */
 947                 ret = jbd2_submit_inode_data(ei->jinode);
 948                 if (ret)
 949                         return ret;     /* first error; s_fc_lock not held */
 950                 spin_lock(&sbi->s_fc_lock);
 951         }
 952         spin_unlock(&sbi->s_fc_lock);
 953
 954         return ret;
 955 }
 956
 957 /* Wait for data of every FC_Q_MAIN inode that is flagged FC_COMMITTING */
 958 static int ext4_fc_wait_inode_data_all(journal_t *journal)
 959 {
 960         struct super_block *sb = (struct super_block *)(journal->j_private);
 961         struct ext4_sb_info *sbi = EXT4_SB(sb);
 962         struct ext4_inode_info *pos, *n;
 963         int ret = 0;
 964
 965         spin_lock(&sbi->s_fc_lock);
 966         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 967                 if (!ext4_test_inode_state(&pos->vfs_inode,
 968                                            EXT4_STATE_FC_COMMITTING))
 969                         continue;
 970                 spin_unlock(&sbi->s_fc_lock);
 971                 /* s_fc_lock is dropped across the wait and retaken after it. */
 972                 ret = jbd2_wait_inode_data(journal, pos->jinode);
 973                 if (ret)
 974                         return ret;     /* first error; s_fc_lock not held */
 975                 spin_lock(&sbi->s_fc_lock);
 976         }
 977         spin_unlock(&sbi->s_fc_lock);
 978
 979         return 0;
 980 }
 981
 982 /* Write a TLV per queued dentry update; entered and left with s_fc_lock held */
 983 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 984 __acquires(&sbi->s_fc_lock)
 985 __releases(&sbi->s_fc_lock)
 986 {
 987         struct super_block *sb = (struct super_block *)(journal->j_private);
 988         struct ext4_sb_info *sbi = EXT4_SB(sb);
 989         struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 990         struct inode *inode;
 991         struct ext4_inode_info *ei, *ei_n;
 992         int ret;
 993
 994         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 995                 return 0;
 996         list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 997                                  &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 998                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
 999                         spin_unlock(&sbi->s_fc_lock);
1000                         if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1001                                 ret = -ENOSPC;
1002                                 goto lock_and_exit;
1003                         }
1004                         spin_lock(&sbi->s_fc_lock);
1005                         continue;
1006                 }
1007                 /* CREAT: look the new inode up on the main inode queue. */
1008                 inode = NULL;
1009                 list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1010                                          i_fc_list) {
1011                         if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1012                                 inode = &ei->vfs_inode;
1013                                 break;
1014                         }
1015                 }
1016                 /*
1017                  * If we don't find inode in our list, then it was deleted,
1018                  * in which case, we don't need to record its create tag.
1019                  */
1020                 if (!inode)
1021                         continue;
1022                 spin_unlock(&sbi->s_fc_lock);
1023
1024                 /*
1025                  * We first write the inode and then the create dirent. This
1026                  * allows the recovery code to create an unnamed inode first
1027                  * and then link it to a directory entry. This allows us
1028                  * to use namei.c routines almost as is and simplifies
1029                  * the recovery code.
1030                  */
1031                 ret = ext4_fc_write_inode(inode, crc);
1032                 if (ret)
1033                         goto lock_and_exit;
1034
1035                 ret = ext4_fc_write_inode_data(inode, crc);
1036                 if (ret)
1037                         goto lock_and_exit;
1038
1039                 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1040                         ret = -ENOSPC;
1041                         goto lock_and_exit;
1042                 }
1043
1044                 spin_lock(&sbi->s_fc_lock);
1045         }
1046         return 0;
1047 lock_and_exit:
1048         spin_lock(&sbi->s_fc_lock);     /* retake the lock dropped before the write */
1049         return ret;
1050 }
1051
1052 static int ext4_fc_perform_commit(journal_t *journal)
1053 {
1054         struct super_block *sb = (struct super_block *)(journal->j_private);
1055         struct ext4_sb_info *sbi = EXT4_SB(sb);
1056         struct ext4_inode_info *iter;
1057         struct ext4_fc_head head;
1058         struct inode *inode;
1059         struct blk_plug plug;
1060         int ret = 0;
1061         u32 crc = 0;
1062         /* Inode data goes out first; then head, dentry, inode and tail TLVs. */
1063         ret = ext4_fc_submit_inode_data_all(journal);
1064         if (ret)
1065                 return ret;
1066
1067         ret = ext4_fc_wait_inode_data_all(journal);
1068         if (ret)
1069                 return ret;
1070
1071         /*
1072          * If file system device is different from journal device, issue a cache
1073          * flush before we start writing fast commit blocks.
1074          */
1075         if (journal->j_fs_dev != journal->j_dev)
1076                 blkdev_issue_flush(journal->j_fs_dev);
1077
1078         blk_start_plug(&plug);
1079         if (sbi->s_fc_bytes == 0) {
1080                 /*
1081                  * Add a head tag only if this is the first fast commit
1082                  * in this TID.
1083                  */
1084                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1085                 head.fc_tid = cpu_to_le32(
1086                         sbi->s_journal->j_running_transaction->t_tid);
1087                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1088                         (u8 *)&head, &crc)) {
1089                         ret = -ENOSPC;
1090                         goto out;
1091                 }
1092         }
1093
1094         spin_lock(&sbi->s_fc_lock);
1095         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1096         if (ret) {
1097                 spin_unlock(&sbi->s_fc_lock);
1098                 goto out;
1099         }
1100         /* Log data ranges, then the inode, for each inode flagged FC_COMMITTING. */
1101         list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1102                 inode = &iter->vfs_inode;
1103                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1104                         continue;
1105
1106                 spin_unlock(&sbi->s_fc_lock);
1107                 ret = ext4_fc_write_inode_data(inode, &crc);
1108                 if (ret)
1109                         goto out;       /* s_fc_lock already dropped */
1110                 ret = ext4_fc_write_inode(inode, &crc);
1111                 if (ret)
1112                         goto out;       /* s_fc_lock already dropped */
1113                 spin_lock(&sbi->s_fc_lock);
1114         }
1115         spin_unlock(&sbi->s_fc_lock);
1116
1117         ret = ext4_fc_write_tail(sb, crc);
1118
1119 out:
1120         blk_finish_plug(&plug);
1121         return ret;
1122 }
1123
1124 /*
1125  * The main commit entry point. Performs a fast commit for transaction
1126  * commit_tid if needed. If the fast commit fails, cannot be started or is
1127  * ineligible, we fall back via jbd2_fc_end_commit_fallback() or
1128  * jbd2_complete_transaction() and return their result; otherwise return 0.
1129  */
1130 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1131 {
1132         struct super_block *sb = (struct super_block *)(journal->j_private);
1133         struct ext4_sb_info *sbi = EXT4_SB(sb);
1134         int nblks = 0, ret, bsize = journal->j_blocksize;
1135         int subtid = atomic_read(&sbi->s_fc_subtid);
1136         int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1137         ktime_t start_time, commit_time;
1138
1139         trace_ext4_fc_commit_start(sb);
1140
1141         start_time = ktime_get();
1142
1143         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1144                 (ext4_fc_is_ineligible(sb))) {
1145                 reason = EXT4_FC_REASON_INELIGIBLE;
1146                 goto out;
1147         }
1148
1149 restart_fc:
1150         ret = jbd2_fc_begin_commit(journal, commit_tid);
1151         if (ret == -EALREADY) {
1152                 /* There was an ongoing commit, check if we need to restart */
1153                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1154                         commit_tid > journal->j_commit_sequence)
1155                         goto restart_fc;
1156                 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1157                 goto out;
1158         } else if (ret) {
1159                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1160                 reason = EXT4_FC_REASON_FC_START_FAILED;
1161                 goto out;
1162         }
1163         /* Block counts from s_fc_bytes before and after the commit give nblks. */
1164         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1165         ret = ext4_fc_perform_commit(journal);
1166         if (ret < 0) {
1167                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1168                 reason = EXT4_FC_REASON_FC_FAILED;
1169                 goto out;
1170         }
1171         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1172         ret = jbd2_fc_wait_bufs(journal, nblks);
1173         if (ret < 0) {
1174                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1175                 reason = EXT4_FC_REASON_FC_FAILED;
1176                 goto out;
1177         }
1178         atomic_inc(&sbi->s_fc_subtid);
1179         jbd2_fc_end_commit(journal);
1180 out:
1181         /* Has any ineligible update happened since we started? */
1182         if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1183                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1184                 reason = EXT4_FC_REASON_INELIGIBLE;
1185         }
1186         /* The commit counters below are updated under s_fc_lock. */
1187         spin_lock(&sbi->s_fc_lock);
1188         if (reason != EXT4_FC_REASON_OK &&
1189                 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1190                 sbi->s_fc_stats.fc_ineligible_commits++;
1191         } else {
1192                 sbi->s_fc_stats.fc_num_commits++;
1193                 sbi->s_fc_stats.fc_numblks += nblks;
1194         }
1195         spin_unlock(&sbi->s_fc_lock);
1196         nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1197         trace_ext4_fc_commit_stop(sb, nblks, reason);
1198         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1199         /*
1200          * weight the commit time higher than the average time so we don't
1201          * react too strongly to vast changes in the commit time
1202          */
1203         if (likely(sbi->s_fc_avg_commit_time))
1204                 sbi->s_fc_avg_commit_time = (commit_time +
1205                                 sbi->s_fc_avg_commit_time * 3) / 4;
1206         else
1207                 sbi->s_fc_avg_commit_time = commit_time;
1208         jbd_debug(1,
1209                 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1210                 nblks, reason, subtid);
1211         if (reason == EXT4_FC_REASON_FC_FAILED)
1212                 return jbd2_fc_end_commit_fallback(journal);
1213         if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1214                 reason == EXT4_FC_REASON_INELIGIBLE)
1215                 return jbd2_complete_transaction(journal, commit_tid);
1216         return 0;
1217 }
1218
1219 /*
1220  * Fast commit cleanup routine. This is called after every fast commit and
1221  * full commit. full is true if we are called after a full commit.
1222  */
1223 static void ext4_fc_cleanup(journal_t *journal, int full)
1224 {
1225         struct super_block *sb = journal->j_private;
1226         struct ext4_sb_info *sbi = EXT4_SB(sb);
1227         struct ext4_inode_info *iter, *iter_n;
1228         struct ext4_fc_dentry_update *fc_dentry;
1229
1230         if (full && sbi->s_fc_bh)
1231                 sbi->s_fc_bh = NULL;
1232
1233         jbd2_fc_release_bufs(journal);
1234         /* Unqueue every main-queue inode and wake FC_COMMITTING bit waiters. */
1235         spin_lock(&sbi->s_fc_lock);
1236         list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1237                                  i_fc_list) {
1238                 list_del_init(&iter->i_fc_list);
1239                 ext4_clear_inode_state(&iter->vfs_inode,
1240                                        EXT4_STATE_FC_COMMITTING);
1241                 ext4_fc_reset_inode(&iter->vfs_inode);
1242                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1243                 smp_mb();
1244 #if (BITS_PER_LONG < 64)
1245                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1246 #else
1247                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1248 #endif
1249         }
1250         /* Free queued dentry updates; the frees run with s_fc_lock dropped. */
1251         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1252                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1253                                              struct ext4_fc_dentry_update,
1254                                              fcd_list);
1255                 list_del_init(&fc_dentry->fcd_list);
1256                 spin_unlock(&sbi->s_fc_lock);
1257
1258                 if (fc_dentry->fcd_name.name &&
1259                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1260                         kfree(fc_dentry->fcd_name.name);
1261                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1262                 spin_lock(&sbi->s_fc_lock);
1263         }
1264         /* Move everything on the staging queues over to the main queues. */
1265         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1266                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1267         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1268                                 &sbi->s_fc_q[FC_Q_MAIN]);
1269
1270         ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1271         ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1272         /* Only a full commit resets the fast commit byte count. */
1273         if (full)
1274                 sbi->s_fc_bytes = 0;
1275         spin_unlock(&sbi->s_fc_lock);
1276         trace_ext4_fc_stats(sb);
1277 }
1278
1279 /* Ext4 Replay Path Routines */
1280
1281 /* Helper struct for dentry replay routines: a decoded dentry TLV */
1282 struct dentry_info_args {
1283         int parent_ino, dname_len, ino, inode_len;      /* tl_to_darg() fills all but inode_len */
1284         char *dname;    /* name bytes inside the TLV value, dname_len long */
1285 };
1286
1287 /* Decode a dentry TLV (@tl header, @val payload) into @darg. */
1288 static inline void tl_to_darg(struct dentry_info_args *darg,
1289                               struct ext4_fc_tl *tl, u8 *val)
1290 {
1291         struct ext4_fc_dentry_info hdr;
1292
1293         /* Copy the fixed part out of @val before reading its fields. */
1294         memcpy(&hdr, val, sizeof(hdr));
1295         darg->ino = le32_to_cpu(hdr.fc_ino);
1296         darg->parent_ino = le32_to_cpu(hdr.fc_parent_ino);
1297         darg->dname_len = le16_to_cpu(tl->fc_len) - sizeof(hdr);
1298         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1299 }
1300
1301 /*
1302  * Replay an EXT4_FC_TAG_UNLINK TLV. A missing inode or parent directory is
1303  * skipped with 0; -ENOENT from the unlink itself is also treated as success.
1304  */
1305 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1306                                  u8 *val)
1307 {
1308         struct dentry_info_args darg;
1309         struct inode *victim, *parent;
1310         struct qstr name;
1311         int ret = 0;
1312
1313         tl_to_darg(&darg, tl, val);
1314         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1315                         darg.parent_ino, darg.dname_len);
1316
1317         name.name = darg.dname;
1318         name.len = darg.dname_len;
1319
1320         victim = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1321         if (IS_ERR(victim)) {
1322                 jbd_debug(1, "Inode %d not found", darg.ino);
1323                 return 0;
1324         }
1325
1326         parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1327         if (IS_ERR(parent)) {
1328                 jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1329                 goto put_victim;
1330         }
1331
1332         ret = __ext4_unlink(NULL, parent, &name, victim);
1333         if (ret == -ENOENT)     /* the entry may already be gone */
1334                 ret = 0;
1335         iput(parent);
1336 put_victim:
1337         iput(victim);
1338         return ret;
1339 }
1340
1341 /*
1342  * Link @inode under the name in @darg inside directory darg->parent_ino.
1343  * Returns 0 (also when the parent is unusable or the link already exists)
1344  * or a negative errno. */
1345 static int ext4_fc_replay_link_internal(struct super_block *sb,
1346                                 struct dentry_info_args *darg,
1347                                 struct inode *inode)
1348 {
1349         struct inode *dir;
1350         struct dentry *dentry_dir, *dentry_inode = NULL;
1351         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1352         int ret = 0;
1353
1354         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1355         if (IS_ERR(dir)) {
1356                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1357                 return 0;
1358         }
1359
1360         /*
1361          * d_obtain_alias() consumes our reference on @dir (kept by the dentry
1362          * on success, dropped on failure): never iput() @dir after this call.
1363          */
1364         dentry_dir = d_obtain_alias(dir);
1365         if (IS_ERR(dentry_dir)) {
1366                 jbd_debug(1, "Failed to obtain dentry");
1367                 return 0;
1368         }
1369
1370         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1371         if (!dentry_inode) {
1372                 jbd_debug(1, "Inode dentry not created.");
1373                 ret = -ENOMEM;
1374                 goto out;
1375         }
1376
1377         ret = __ext4_link(dir, inode, dentry_inode);
1378         /*
1379          * It's possible that link already existed since data blocks
1380          * for the dir in question got persisted before we crashed OR
1381          * we replayed this tag and crashed before the entire replay
1382          * could complete.
1383          */
1384         if (ret && ret != -EEXIST) {
1385                 jbd_debug(1, "Failed to link\n");
1386                 goto out;
1387         }
1388         ret = 0;
1389 out:
1390         d_drop(dentry_dir);
1391         dput(dentry_dir);
1392         if (dentry_inode) {
1393                 d_drop(dentry_inode);
1394                 dput(dentry_inode);
1395         }
1396         return ret;
1397 }
1398
1399 /* Replay an EXT4_FC_TAG_LINK TLV; an inode that cannot be read is skipped. */
1400 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1401                                u8 *val)
1402 {
1403         struct dentry_info_args darg;
1404         struct inode *target;
1405         int err;
1406
1407         tl_to_darg(&darg, tl, val);
1408         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1409                         darg.parent_ino, darg.dname_len);
1410
1411         target = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1412         if (!IS_ERR(target)) {
1413                 err = ext4_fc_replay_link_internal(sb, &darg, target);
1414                 iput(target);
1415                 return err;
1416         }
1417
1418         jbd_debug(1, "Inode not found.");
1419         return 0;
1420 }
1421
1422 /*
1423  * Record all the modified inodes during replay. We use this later to setup
1424  * block bitmaps correctly. Returns 0, or -ENOMEM with the old array intact.
1425  */
1426 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1427 {
1428         struct ext4_fc_replay_state *state;
1429         int *inodes, i, new_size;
1430
1431         state = &EXT4_SB(sb)->s_fc_replay_state;
1432         for (i = 0; i < state->fc_modified_inodes_used; i++)
1433                 if (state->fc_modified_inodes[i] == ino)
1434                         return 0;
1435         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1436                 new_size = state->fc_modified_inodes_size +
1437                            EXT4_FC_REPLAY_REALLOC_INCREMENT;
1438                 inodes = krealloc(state->fc_modified_inodes,
1439                                   sizeof(int) * new_size, GFP_KERNEL);
1440                 if (!inodes)
1441                         return -ENOMEM;
1442                 state->fc_modified_inodes = inodes;     /* publish only on success */
1443                 state->fc_modified_inodes_size = new_size;
1444         }
1445         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1446         return 0;
1447 }
1448
1449 /*
1450  * Inode replay function: overwrite the on-disk inode with the logged copy.
1451  * Returns -EFSCORRUPTED if the inode cannot be read back, 0 otherwise.
1452  */
1453 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1454                                 u8 *val)
1455 {
1456         struct ext4_fc_inode fc_inode;
1457         struct ext4_inode *raw_inode;
1458         struct ext4_inode *raw_fc_inode;
1459         struct inode *inode = NULL;
1460         struct ext4_iloc iloc;
1461         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1462         struct ext4_extent_header *eh;
1463
1464         memcpy(&fc_inode, val, sizeof(fc_inode));
1465
1466         ino = le32_to_cpu(fc_inode.fc_ino);
1467         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1468
1469         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1470         if (!IS_ERR(inode)) {
1471                 ext4_ext_clear_bb(inode);
1472                 iput(inode);
1473         }
1474         inode = NULL;
1475         ext4_fc_record_modified_inode(sb, ino);
1476         raw_fc_inode = (struct ext4_inode *)
1477                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1478         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1479         if (ret)
1480                 goto out;
1481
1482         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1483         raw_inode = ext4_raw_inode(&iloc);
1484
1485         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1486         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1487                 inode_len - offsetof(struct ext4_inode, i_generation));
1488         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1489                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1490                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1491                         memset(eh, 0, sizeof(*eh));
1492                         eh->eh_magic = EXT4_EXT_MAGIC;
1493                         eh->eh_max = cpu_to_le16(
1494                                 (sizeof(raw_inode->i_block) -
1495                                  sizeof(struct ext4_extent_header))
1496                                  / sizeof(struct ext4_extent));
1497                 }
1498         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1499                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1500                         sizeof(raw_inode->i_block));
1501         }
1502
1503         /* Immediately update the inode on disk. */
1504         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1505         if (ret)
1506                 goto out_brelse;
1507         ret = sync_dirty_buffer(iloc.bh);
1508         if (ret)
1509                 goto out_brelse;
1510         ret = ext4_mark_inode_used(sb, ino);
1511         if (ret)
1512                 goto out_brelse;
1513
1514         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1515         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1516         if (IS_ERR(inode)) {
1517                 jbd_debug(1, "Inode not found.");
1518                 brelse(iloc.bh);
1519                 return -EFSCORRUPTED;
1520         }
1521
1522         /*
1523          * Our allocator could have made different decisions than before
1524          * crashing. This should be fixed but until then, we calculate
1525          * the number of blocks of the inode.
1526          */
1527         ext4_ext_replay_set_iblocks(inode);
1528
1529         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1530         ext4_reset_inode_seed(inode);
1531         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1532         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1533         sync_dirty_buffer(iloc.bh);
1534 out_brelse:
1535         brelse(iloc.bh);        /* also reached from the error paths above */
1536 out:
1537         iput(inode);
1538         if (!ret)
1539                 blkdev_issue_flush(sb->s_bdev);
1540
1541         return 0;
1542 }
1543
1544 /*
1545  * Dentry create replay function. Returns 0 or a negative errno.
1546  *
1547  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE. Which means, the
1548  * inode for which we are trying to create a dentry here, should already have
1549  * been replayed before we start here.
1550  */
1551 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1552                                  u8 *val)
1553 {
1554         int ret = 0;
1555         struct inode *inode = NULL;
1556         struct inode *dir = NULL;
1557         struct dentry_info_args darg;
1558
1559         tl_to_darg(&darg, tl, val);
1560
1561         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1562                         darg.parent_ino, darg.dname_len);
1563
1564         /* This takes care of update group descriptor and other metadata */
1565         ret = ext4_mark_inode_used(sb, darg.ino);
1566         if (ret)
1567                 goto out;
1568
1569         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1570         if (IS_ERR(inode)) {
1571                 jbd_debug(1, "inode %d not found.", darg.ino);
1572                 inode = NULL;
1573                 ret = -EINVAL;
1574                 goto out;
1575         }
1576
1577         if (S_ISDIR(inode->i_mode)) {
1578                 /*
1579                  * If we are creating a directory, we need to make sure that the
1580                  * dot and dot dot dirents are setup properly.
1581                  */
1582                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1583                 if (IS_ERR(dir)) {
1584                         jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1585                         goto out;       /* ret is still 0: skipped, not failed */
1586                 }
1587                 ret = ext4_init_new_dir(NULL, dir, inode);
1588                 iput(dir);
1589                 if (ret) {
1590                         ret = 0;        /* this failure is not reported either */
1591                         goto out;
1592                 }
1593         }
1594         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1595         if (ret)
1596                 goto out;
1597         set_nlink(inode, 1);
1598         ext4_mark_inode_dirty(NULL, inode);
1599 out:
1600         if (inode)
1601                 iput(inode);
1602         return ret;
1603 }
1604
1605 /*
1606  * Record physical disk regions which are in use as per fast commit area. Our
1607  * simple replay phase allocator excludes these regions from allocation.
1608  */
1609 static int ext4_fc_record_regions(struct super_block *sb, int ino,
1610                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1611 {
1612         struct ext4_fc_replay_state *state;
1613         struct ext4_fc_alloc_region *region;
1614
1615         state = &EXT4_SB(sb)->s_fc_replay_state;
1616         if (state->fc_regions_used == state->fc_regions_size) {
1617                 state->fc_regions_size +=
1618                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1619                 state->fc_regions = krealloc(
1620                                         state->fc_regions,
1621                                         state->fc_regions_size *
1622                                         sizeof(struct ext4_fc_alloc_region),
1623                                         GFP_KERNEL);
1624                 if (!state->fc_regions)
1625                         return -ENOMEM;
1626         }
1627         region = &state->fc_regions[state->fc_regions_used++];
1628         region->ino = ino;
1629         region->lblk = lblk;
1630         region->pblk = pblk;
1631         region->len = len;
1632
1633         return 0;
1634 }
1635
1636 /* Replay add range tag */
1637 static int ext4_fc_replay_add_range(struct super_block *sb,
1638                                     struct ext4_fc_tl *tl, u8 *val)
1639 {
1640         struct ext4_fc_add_range fc_add_ex;
1641         struct ext4_extent newex, *ex;
1642         struct inode *inode;
1643         ext4_lblk_t start, cur;
1644         int remaining, len;
1645         ext4_fsblk_t start_pblk;
1646         struct ext4_map_blocks map;
1647         struct ext4_ext_path *path = NULL;
1648         int ret;
1649
1650         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1651         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1652
1653         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1654                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1655                 ext4_ext_get_actual_len(ex));
1656
1657         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1658         if (IS_ERR(inode)) {
1659                 jbd_debug(1, "Inode not found.");
1660                 return 0;
1661         }
1662
1663         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1664
1665         start = le32_to_cpu(ex->ee_block);
1666         start_pblk = ext4_ext_pblock(ex);
1667         len = ext4_ext_get_actual_len(ex);
1668
1669         cur = start;
1670         remaining = len;
1671         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1672                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1673                   inode->i_ino);
1674
1675         while (remaining > 0) {
1676                 map.m_lblk = cur;
1677                 map.m_len = remaining;
1678                 map.m_pblk = 0;
1679                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1680
1681                 if (ret < 0) {
1682                         iput(inode);
1683                         return 0;
1684                 }
1685
1686                 if (ret == 0) {
1687                         /* Range is not mapped */
1688                         path = ext4_find_extent(inode, cur, NULL, 0);
1689                         if (IS_ERR(path)) {
1690                                 iput(inode);
1691                                 return 0;
1692                         }
1693                         memset(&newex, 0, sizeof(newex));
1694                         newex.ee_block = cpu_to_le32(cur);
1695                         ext4_ext_store_pblock(
1696                                 &newex, start_pblk + cur - start);
1697                         newex.ee_len = cpu_to_le16(map.m_len);
1698                         if (ext4_ext_is_unwritten(ex))
1699                                 ext4_ext_mark_unwritten(&newex);
1700                         down_write(&EXT4_I(inode)->i_data_sem);
1701                         ret = ext4_ext_insert_extent(
1702                                 NULL, inode, &path, &newex, 0);
1703                         up_write((&EXT4_I(inode)->i_data_sem));
1704                         ext4_ext_drop_refs(path);
1705                         kfree(path);
1706                         if (ret) {
1707                                 iput(inode);
1708                                 return 0;
1709                         }
1710                         goto next;
1711                 }
1712
1713                 if (start_pblk + cur - start != map.m_pblk) {
1714                         /*
1715                          * Logical to physical mapping changed. This can happen
1716                          * if this range was removed and then reallocated to
1717                          * map to new physical blocks during a fast commit.
1718                          */
1719                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1720                                         ext4_ext_is_unwritten(ex),
1721                                         start_pblk + cur - start);
1722                         if (ret) {
1723                                 iput(inode);
1724                                 return 0;
1725                         }
1726                         /*
1727                          * Mark the old blocks as free since they aren't used
1728                          * anymore. We maintain an array of all the modified
1729                          * inodes. In case these blocks are still used at either
1730                          * a different logical range in the same inode or in
1731                          * some different inode, we will mark them as allocated
1732                          * at the end of the FC replay using our array of
1733                          * modified inodes.
1734                          */
1735                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1736                         goto next;
1737                 }
1738
1739                 /* Range is mapped and needs a state change */
1740                 jbd_debug(1, "Converting from %ld to %d %lld",
1741                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1742                         ext4_ext_is_unwritten(ex), map.m_pblk);
1743                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1744                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1745                 if (ret) {
1746                         iput(inode);
1747                         return 0;
1748                 }
1749                 /*
1750                  * We may have split the extent tree while toggling the state.
1751                  * Try to shrink the extent tree now.
1752                  */
1753                 ext4_ext_replay_shrink_inode(inode, start + len);
1754 next:
1755                 cur += map.m_len;
1756                 remaining -= map.m_len;
1757         }
1758         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1759                                         sb->s_blocksize_bits);
1760         iput(inode);
1761         return 0;
1762 }
1763
1764 /* Replay DEL_RANGE tag */
1765 static int
1766 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1767                          u8 *val)
1768 {
1769         struct inode *inode;
1770         struct ext4_fc_del_range lrange;
1771         struct ext4_map_blocks map;
1772         ext4_lblk_t cur, remaining;
1773         int ret;
1774
1775         memcpy(&lrange, val, sizeof(lrange));
1776         cur = le32_to_cpu(lrange.fc_lblk);
1777         remaining = le32_to_cpu(lrange.fc_len);
1778
1779         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1780                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1781
1782         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1783         if (IS_ERR(inode)) {
1784                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1785                 return 0;
1786         }
1787
1788         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1789
1790         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1791                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1792                         le32_to_cpu(lrange.fc_len));
1793         while (remaining > 0) {
1794                 map.m_lblk = cur;
1795                 map.m_len = remaining;
1796
1797                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1798                 if (ret < 0) {
1799                         iput(inode);
1800                         return 0;
1801                 }
1802                 if (ret > 0) {
1803                         remaining -= ret;
1804                         cur += ret;
1805                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1806                 } else {
1807                         remaining -= map.m_len;
1808                         cur += map.m_len;
1809                 }
1810         }
1811
1812         ret = ext4_punch_hole(inode,
1813                 le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1814                 le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1815         if (ret)
1816                 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1817         ext4_ext_replay_shrink_inode(inode,
1818                 i_size_read(inode) >> sb->s_blocksize_bits);
1819         ext4_mark_inode_dirty(NULL, inode);
1820         iput(inode);
1821
1822         return 0;
1823 }
1824
1825 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1826 {
1827         struct ext4_fc_replay_state *state;
1828         struct inode *inode;
1829         struct ext4_ext_path *path = NULL;
1830         struct ext4_map_blocks map;
1831         int i, ret, j;
1832         ext4_lblk_t cur, end;
1833
1834         state = &EXT4_SB(sb)->s_fc_replay_state;
1835         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1836                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1837                         EXT4_IGET_NORMAL);
1838                 if (IS_ERR(inode)) {
1839                         jbd_debug(1, "Inode %d not found.",
1840                                 state->fc_modified_inodes[i]);
1841                         continue;
1842                 }
1843                 cur = 0;
1844                 end = EXT_MAX_BLOCKS;
1845                 while (cur < end) {
1846                         map.m_lblk = cur;
1847                         map.m_len = end - cur;
1848
1849                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1850                         if (ret < 0)
1851                                 break;
1852
1853                         if (ret > 0) {
1854                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1855                                 if (!IS_ERR(path)) {
1856                                         for (j = 0; j < path->p_depth; j++)
1857                                                 ext4_mb_mark_bb(inode->i_sb,
1858                                                         path[j].p_block, 1, 1);
1859                                         ext4_ext_drop_refs(path);
1860                                         kfree(path);
1861                                 }
1862                                 cur += ret;
1863                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1864                                                         map.m_len, 1);
1865                         } else {
1866                                 cur = cur + (map.m_len ? map.m_len : 1);
1867                         }
1868                 }
1869                 iput(inode);
1870         }
1871 }
1872
1873 /*
1874  * Check if block is in excluded regions for block allocation. The simple
1875  * allocator that runs during replay phase is calls this function to see
1876  * if it is okay to use a block.
1877  */
1878 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1879 {
1880         int i;
1881         struct ext4_fc_replay_state *state;
1882
1883         state = &EXT4_SB(sb)->s_fc_replay_state;
1884         for (i = 0; i < state->fc_regions_valid; i++) {
1885                 if (state->fc_regions[i].ino == 0 ||
1886                         state->fc_regions[i].len == 0)
1887                         continue;
1888                 if (blk >= state->fc_regions[i].pblk &&
1889                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1890                         return true;
1891         }
1892         return false;
1893 }
1894
1895 /* Cleanup function called after replay */
1896 void ext4_fc_replay_cleanup(struct super_block *sb)
1897 {
1898         struct ext4_sb_info *sbi = EXT4_SB(sb);
1899
1900         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1901         kfree(sbi->s_fc_replay_state.fc_regions);
1902         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1903 }
1904
1905 /*
1906  * Recovery Scan phase handler
1907  *
1908  * This function is called during the scan phase and is responsible
1909  * for doing following things:
1910  * - Make sure the fast commit area has valid tags for replay
1911  * - Count number of tags that need to be replayed by the replay handler
1912  * - Verify CRC
1913  * - Create a list of excluded blocks for allocation during replay phase
1914  *
1915  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1916  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1917  * to indicate that scan has finished and JBD2 can now start replay phase.
1918  * It returns a negative error to indicate that there was an error. At the end
1919  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1920  * to indicate the number of tags that need to replayed during the replay phase.
1921  */
1922 static int ext4_fc_replay_scan(journal_t *journal,
1923                                 struct buffer_head *bh, int off,
1924                                 tid_t expected_tid)
1925 {
1926         struct super_block *sb = journal->j_private;
1927         struct ext4_sb_info *sbi = EXT4_SB(sb);
1928         struct ext4_fc_replay_state *state;
1929         int ret = JBD2_FC_REPLAY_CONTINUE;
1930         struct ext4_fc_add_range ext;
1931         struct ext4_fc_tl tl;
1932         struct ext4_fc_tail tail;
1933         __u8 *start, *end, *cur, *val;
1934         struct ext4_fc_head head;
1935         struct ext4_extent *ex;
1936
1937         state = &sbi->s_fc_replay_state;
1938
1939         start = (u8 *)bh->b_data;
1940         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1941
1942         if (state->fc_replay_expected_off == 0) {
1943                 state->fc_cur_tag = 0;
1944                 state->fc_replay_num_tags = 0;
1945                 state->fc_crc = 0;
1946                 state->fc_regions = NULL;
1947                 state->fc_regions_valid = state->fc_regions_used =
1948                         state->fc_regions_size = 0;
1949                 /* Check if we can stop early */
1950                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1951                         != EXT4_FC_TAG_HEAD)
1952                         return 0;
1953         }
1954
1955         if (off != state->fc_replay_expected_off) {
1956                 ret = -EFSCORRUPTED;
1957                 goto out_err;
1958         }
1959
1960         state->fc_replay_expected_off++;
1961         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1962                 memcpy(&tl, cur, sizeof(tl));
1963                 val = cur + sizeof(tl);
1964                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1965                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1966                 switch (le16_to_cpu(tl.fc_tag)) {
1967                 case EXT4_FC_TAG_ADD_RANGE:
1968                         memcpy(&ext, val, sizeof(ext));
1969                         ex = (struct ext4_extent *)&ext.fc_ex;
1970                         ret = ext4_fc_record_regions(sb,
1971                                 le32_to_cpu(ext.fc_ino),
1972                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1973                                 ext4_ext_get_actual_len(ex));
1974                         if (ret < 0)
1975                                 break;
1976                         ret = JBD2_FC_REPLAY_CONTINUE;
1977                         fallthrough;
1978                 case EXT4_FC_TAG_DEL_RANGE:
1979                 case EXT4_FC_TAG_LINK:
1980                 case EXT4_FC_TAG_UNLINK:
1981                 case EXT4_FC_TAG_CREAT:
1982                 case EXT4_FC_TAG_INODE:
1983                 case EXT4_FC_TAG_PAD:
1984                         state->fc_cur_tag++;
1985                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1986                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
1987                         break;
1988                 case EXT4_FC_TAG_TAIL:
1989                         state->fc_cur_tag++;
1990                         memcpy(&tail, val, sizeof(tail));
1991                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1992                                                 sizeof(tl) +
1993                                                 offsetof(struct ext4_fc_tail,
1994                                                 fc_crc));
1995                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1996                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1997                                 state->fc_replay_num_tags = state->fc_cur_tag;
1998                                 state->fc_regions_valid =
1999                                         state->fc_regions_used;
2000                         } else {
2001                                 ret = state->fc_replay_num_tags ?
2002                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2003                         }
2004                         state->fc_crc = 0;
2005                         break;
2006                 case EXT4_FC_TAG_HEAD:
2007                         memcpy(&head, val, sizeof(head));
2008                         if (le32_to_cpu(head.fc_features) &
2009                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2010                                 ret = -EOPNOTSUPP;
2011                                 break;
2012                         }
2013                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2014                                 ret = JBD2_FC_REPLAY_STOP;
2015                                 break;
2016                         }
2017                         state->fc_cur_tag++;
2018                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2019                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2020                         break;
2021                 default:
2022                         ret = state->fc_replay_num_tags ?
2023                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2024                 }
2025                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2026                         break;
2027         }
2028
2029 out_err:
2030         trace_ext4_fc_replay_scan(sb, ret, off);
2031         return ret;
2032 }
2033
2034 /*
2035  * Main recovery path entry point.
2036  * The meaning of return codes is similar as above.
2037  */
2038 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2039                                 enum passtype pass, int off, tid_t expected_tid)
2040 {
2041         struct super_block *sb = journal->j_private;
2042         struct ext4_sb_info *sbi = EXT4_SB(sb);
2043         struct ext4_fc_tl tl;
2044         __u8 *start, *end, *cur, *val;
2045         int ret = JBD2_FC_REPLAY_CONTINUE;
2046         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2047         struct ext4_fc_tail tail;
2048
2049         if (pass == PASS_SCAN) {
2050                 state->fc_current_pass = PASS_SCAN;
2051                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2052         }
2053
2054         if (state->fc_current_pass != pass) {
2055                 state->fc_current_pass = pass;
2056                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2057         }
2058         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2059                 jbd_debug(1, "Replay stops\n");
2060                 ext4_fc_set_bitmaps_and_counters(sb);
2061                 return 0;
2062         }
2063
2064 #ifdef CONFIG_EXT4_DEBUG
2065         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2066                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2067                 return JBD2_FC_REPLAY_STOP;
2068         }
2069 #endif
2070
2071         start = (u8 *)bh->b_data;
2072         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2073
2074         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2075                 memcpy(&tl, cur, sizeof(tl));
2076                 val = cur + sizeof(tl);
2077
2078                 if (state->fc_replay_num_tags == 0) {
2079                         ret = JBD2_FC_REPLAY_STOP;
2080                         ext4_fc_set_bitmaps_and_counters(sb);
2081                         break;
2082                 }
2083                 jbd_debug(3, "Replay phase, tag:%s\n",
2084                                 tag2str(le16_to_cpu(tl.fc_tag)));
2085                 state->fc_replay_num_tags--;
2086                 switch (le16_to_cpu(tl.fc_tag)) {
2087                 case EXT4_FC_TAG_LINK:
2088                         ret = ext4_fc_replay_link(sb, &tl, val);
2089                         break;
2090                 case EXT4_FC_TAG_UNLINK:
2091                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2092                         break;
2093                 case EXT4_FC_TAG_ADD_RANGE:
2094                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2095                         break;
2096                 case EXT4_FC_TAG_CREAT:
2097                         ret = ext4_fc_replay_create(sb, &tl, val);
2098                         break;
2099                 case EXT4_FC_TAG_DEL_RANGE:
2100                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2101                         break;
2102                 case EXT4_FC_TAG_INODE:
2103                         ret = ext4_fc_replay_inode(sb, &tl, val);
2104                         break;
2105                 case EXT4_FC_TAG_PAD:
2106                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2107                                              le16_to_cpu(tl.fc_len), 0);
2108                         break;
2109                 case EXT4_FC_TAG_TAIL:
2110                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2111                                              le16_to_cpu(tl.fc_len), 0);
2112                         memcpy(&tail, val, sizeof(tail));
2113                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2114                         break;
2115                 case EXT4_FC_TAG_HEAD:
2116                         break;
2117                 default:
2118                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2119                                              le16_to_cpu(tl.fc_len), 0);
2120                         ret = -ECANCELED;
2121                         break;
2122                 }
2123                 if (ret < 0)
2124                         break;
2125                 ret = JBD2_FC_REPLAY_CONTINUE;
2126         }
2127         return ret;
2128 }
2129
2130 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2131 {
2132         /*
2133          * We set replay callback even if fast commit disabled because we may
2134          * could still have fast commit blocks that need to be replayed even if
2135          * fast commit has now been turned off.
2136          */
2137         journal->j_fc_replay_callback = ext4_fc_replay;
2138         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2139                 return;
2140         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2141 }
2142
/*
 * Human readable descriptions printed by ext4_fc_info_show(), which indexes
 * this table from 0 up to EXT4_FC_REASON_MAX.
 * NOTE(review): presumably the order must match the ineligibility reason
 * values used to index fc_ineligible_reason_count - confirm when adding a
 * reason.
 *
 * Both the strings and the pointer table itself are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2155
2156 int ext4_fc_info_show(struct seq_file *seq, void *v)
2157 {
2158         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2159         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2160         int i;
2161
2162         if (v != SEQ_START_TOKEN)
2163                 return 0;
2164
2165         seq_printf(seq,
2166                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2167                    stats->fc_num_commits, stats->fc_ineligible_commits,
2168                    stats->fc_numblks,
2169                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2170         seq_puts(seq, "Ineligible reasons:\n");
2171         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2172                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2173                         stats->fc_ineligible_reason_count[i]);
2174
2175         return 0;
2176 }
2177
2178 int __init ext4_fc_init_dentry_cache(void)
2179 {
2180         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2181                                            SLAB_RECLAIM_ACCOUNT);
2182
2183         if (ext4_fc_dentry_cachep == NULL)
2184                 return -ENOMEM;
2185
2186         return 0;
2187 }