ext4: use ext4_debug() instead of jbd_debug()
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For scenarios that don't yet have replay
24  * code, fast commit falls back to a full commit.
25  * Fast commits record deltas in one of the following three categories:
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
41  *                                during recovery. Note that the iblocks
42  *                                field is not replayed and is instead
43  *                                derived during replay.
 *
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations, in the
47  * order in which they are issued, in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes that
49  * need to be committed during a fast commit in another in-memory queue. During
50  * the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All inode updates must call ext4_fc_start_update() before starting the
62  * update. If such an ongoing update is present, a fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
69  * Not all operations are supported by fast commits today (e.g. extended
70  * attributes). Fast commit ineligibility is marked by calling
71  * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
72  * back to a full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses the "EXT4_FC_TAG_TAIL" tag, which marks a fast commit as complete. The
78  * tail tag contains the CRC of the contents and the TID of the transaction
79  * after which this fast commit should be applied. Recovery code replays fast
80  * commit logs only if there's at least one valid tail present. For every fast
81  * commit operation, there is one tail. This means we may end up with multiple
82  * tails in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * The fast commit space at the end of above operations would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
99  * Fast commit tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
113  * Now when the recovery code runs, it needs to "enforce" this state on the
114  * file system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider the following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is, then the replay is not idempotent. Let's say
124  * that while in replay, we crash at (z). During the second replay, file A
125  * (which was actually created as a result of the "mv B A" operation) would get
126  * deleted. Thus, the file named A would be absent when we try to read A. So,
127  * this sequence of operations is not idempotent. However, as mentioned above,
128  * instead of storing the procedure, fast commits store the outcome of each
129  * operation. Thus the fast commit log for the above procedure would be:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) are
142  * handled similarly. Thus, by converting a non-idempotent procedure into a
143  * series of idempotent outcomes, fast commits ensure idempotence during replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
150  *    path are atomic. With that, if we crash during fast commit replay, then on
151  *    the next recovery attempt we will find a file system where the fast commit
152  *    area is invalid (because a new full commit would be found). In order to
153  *    deal with that, fast commit replay code should ensure that the "FC_REPLAY"
154  *    superblock state is persisted before starting the replay, so that after
155  *    the crash, fast commit recovery code can look at that flag and perform
156  *    fast commit recovery even if that area is invalidated by later full
157  *    commits.
158  *
159  * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
160  *    eligible update must be protected within ext4_fc_start_update() and
161  *    ext4_fc_stop_update(). These routines are called from much higher-level
162  *    routines. This can be made more fine grained by combining them with
163  *    ext4_journal_start().
164  *
165  * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
166  *
167  * 3) Handle more ineligible cases.
168  */
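/*
 * Illustrative sketch (an addition for exposition, not part of the original
 * file): roughly how a reader walks the TLV log described above, assuming the
 * on-disk layout of struct ext4_fc_tl (a __le16 tag followed by a __le16
 * length). The function name fc_walk_tlvs_example is hypothetical.
 */
static inline void fc_walk_tlvs_example(u8 *start, u8 *end)
{
	struct ext4_fc_tl tl;
	u8 *cur = start;

	while (cur + sizeof(tl) <= end) {
		memcpy(&tl, cur, sizeof(tl));	/* unaligned-safe header read */
		/*
		 * le16_to_cpu(tl.fc_tag) selects the replay handler (e.g.
		 * EXT4_FC_TAG_ADD_RANGE); the value bytes start right after
		 * the header and are le16_to_cpu(tl.fc_len) bytes long.
		 */
		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
	}
}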
169
170 #include <trace/events/ext4.h>
171 static struct kmem_cache *ext4_fc_dentry_cachep;
172
173 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
174 {
175         BUFFER_TRACE(bh, "");
176         if (uptodate) {
177                 ext4_debug("%s: Block %lld up-to-date",
178                            __func__, bh->b_blocknr);
179                 set_buffer_uptodate(bh);
180         } else {
181                 ext4_debug("%s: Block %lld not up-to-date",
182                            __func__, bh->b_blocknr);
183                 clear_buffer_uptodate(bh);
184         }
185
186         unlock_buffer(bh);
187 }
188
189 static inline void ext4_fc_reset_inode(struct inode *inode)
190 {
191         struct ext4_inode_info *ei = EXT4_I(inode);
192
193         ei->i_fc_lblk_start = 0;
194         ei->i_fc_lblk_len = 0;
195 }
196
197 void ext4_fc_init_inode(struct inode *inode)
198 {
199         struct ext4_inode_info *ei = EXT4_I(inode);
200
201         ext4_fc_reset_inode(inode);
202         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
203         INIT_LIST_HEAD(&ei->i_fc_list);
204         init_waitqueue_head(&ei->i_fc_wait);
205         atomic_set(&ei->i_fc_updates, 0);
206 }
207
208 /* This function must be called with sbi->s_fc_lock held. */
209 static void ext4_fc_wait_committing_inode(struct inode *inode)
210 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
211 {
212         wait_queue_head_t *wq;
213         struct ext4_inode_info *ei = EXT4_I(inode);
214
215 #if (BITS_PER_LONG < 64)
216         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
217                         EXT4_STATE_FC_COMMITTING);
218         wq = bit_waitqueue(&ei->i_state_flags,
219                                 EXT4_STATE_FC_COMMITTING);
220 #else
221         DEFINE_WAIT_BIT(wait, &ei->i_flags,
222                         EXT4_STATE_FC_COMMITTING);
223         wq = bit_waitqueue(&ei->i_flags,
224                                 EXT4_STATE_FC_COMMITTING);
225 #endif
226         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
227         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
228         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
229         schedule();
230         finish_wait(wq, &wait.wq_entry);
231 }
232
233 /*
234  * Inform Ext4's fast commit subsystem about the start of an inode update.
235  *
236  * This function is called by high-level VFS callbacks before
237  * performing any inode update. This function blocks if there's an ongoing
238  * fast commit on the inode in question.
239  */
240 void ext4_fc_start_update(struct inode *inode)
241 {
242         struct ext4_inode_info *ei = EXT4_I(inode);
243
244         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
245             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
246                 return;
247
248 restart:
249         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
250         if (list_empty(&ei->i_fc_list))
251                 goto out;
252
253         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
254                 ext4_fc_wait_committing_inode(inode);
255                 goto restart;
256         }
257 out:
258         atomic_inc(&ei->i_fc_updates);
259         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
260 }
261
262 /*
263  * Stop inode update and wake up waiting fast commits if any.
264  */
265 void ext4_fc_stop_update(struct inode *inode)
266 {
267         struct ext4_inode_info *ei = EXT4_I(inode);
268
269         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
270             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
271                 return;
272
273         if (atomic_dec_and_test(&ei->i_fc_updates))
274                 wake_up_all(&ei->i_fc_wait);
275 }
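/*
 * Illustrative usage sketch (hypothetical caller, not an actual ext4 entry
 * point): a fast-commit-eligible update is bracketed like this so that a
 * running fast commit can drain in-flight updates before snapshotting the
 * inode.
 */
static inline void fc_bracketed_update_example(struct inode *inode)
{
	ext4_fc_start_update(inode);	/* blocks while the inode is in
					 * EXT4_STATE_FC_COMMITTING */
	/* ... perform the inode update under a journal handle ... */
	ext4_fc_stop_update(inode);	/* wakes a fast commit waiting on
					 * i_fc_updates to drop to zero */
}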
276
277 /*
278  * Remove the inode from the fast commit list. If the inode is being committed,
279  * we wait until the inode commit is done.
280  */
281 void ext4_fc_del(struct inode *inode)
282 {
283         struct ext4_inode_info *ei = EXT4_I(inode);
284
285         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
286             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
287                 return;
288
289 restart:
290         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
291         if (list_empty(&ei->i_fc_list)) {
292                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
293                 return;
294         }
295
296         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
297                 ext4_fc_wait_committing_inode(inode);
298                 goto restart;
299         }
300         list_del_init(&ei->i_fc_list);
301         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
302 }
303
304 /*
305  * Mark the file system as fast commit ineligible, and record the latest
306  * ineligible transaction tid. This means that, until the recorded
307  * transaction commits, any commit operation will result in a full jbd2 commit.
308  */
309 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
310 {
311         struct ext4_sb_info *sbi = EXT4_SB(sb);
312         tid_t tid;
313
314         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
315             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
316                 return;
317
318         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
319         if (handle && !IS_ERR(handle))
320                 tid = handle->h_transaction->t_tid;
321         else {
322                 read_lock(&sbi->s_journal->j_state_lock);
323                 tid = sbi->s_journal->j_running_transaction ?
324                                 sbi->s_journal->j_running_transaction->t_tid : 0;
325                 read_unlock(&sbi->s_journal->j_state_lock);
326         }
327         spin_lock(&sbi->s_fc_lock);
328         if (sbi->s_fc_ineligible_tid < tid)
329                 sbi->s_fc_ineligible_tid = tid;
330         spin_unlock(&sbi->s_fc_lock);
331         WARN_ON(reason >= EXT4_FC_REASON_MAX);
332         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
333 }
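/*
 * Illustrative call site (sketch only, added for exposition): an operation
 * that has no replay support yet, e.g. an extended attribute update, degrades
 * the next commit like this. EXT4_FC_REASON_XATTR is one of the reason codes
 * counted in s_fc_stats above.
 */
static inline void fc_mark_xattr_ineligible_example(struct super_block *sb,
						    handle_t *handle)
{
	/*
	 * From here until the recorded tid is committed, commit operations
	 * take the full jbd2 path instead of the fast commit path.
	 */
	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_XATTR, handle);
}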
334
335 /*
336  * Generic fast commit tracking function. If this is the first time we are
337  * called after a full commit, we initialize fast commit fields and then call
338  * __fc_track_fn() with update = 0. If we have already been called after a full
339  * commit, we pass update = 1. Based on that, the track function can determine
340  * if it needs to track a field for the first time or if it needs to just
341  * update the previously tracked value.
342  *
343  * If enqueue is set, this function enqueues the inode in the fast commit list.
344  */
345 static int ext4_fc_track_template(
346         handle_t *handle, struct inode *inode,
347         int (*__fc_track_fn)(struct inode *, void *, bool),
348         void *args, int enqueue)
349 {
350         bool update = false;
351         struct ext4_inode_info *ei = EXT4_I(inode);
352         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
353         tid_t tid = 0;
354         int ret;
355
356         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
357             (sbi->s_mount_state & EXT4_FC_REPLAY))
358                 return -EOPNOTSUPP;
359
360         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
361                 return -EINVAL;
362
363         tid = handle->h_transaction->t_tid;
364         mutex_lock(&ei->i_fc_lock);
365         if (tid == ei->i_sync_tid) {
366                 update = true;
367         } else {
368                 ext4_fc_reset_inode(inode);
369                 ei->i_sync_tid = tid;
370         }
371         ret = __fc_track_fn(inode, args, update);
372         mutex_unlock(&ei->i_fc_lock);
373
374         if (!enqueue)
375                 return ret;
376
377         spin_lock(&sbi->s_fc_lock);
378         if (list_empty(&EXT4_I(inode)->i_fc_list))
379                 list_add_tail(&EXT4_I(inode)->i_fc_list,
380                                 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
381                                  sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
382                                 &sbi->s_fc_q[FC_Q_STAGING] :
383                                 &sbi->s_fc_q[FC_Q_MAIN]);
384         spin_unlock(&sbi->s_fc_lock);
385
386         return ret;
387 }
388
389 struct __track_dentry_update_args {
390         struct dentry *dentry;
391         int op;
392 };
393
394 /* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
395 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
396 {
397         struct ext4_fc_dentry_update *node;
398         struct ext4_inode_info *ei = EXT4_I(inode);
399         struct __track_dentry_update_args *dentry_update =
400                 (struct __track_dentry_update_args *)arg;
401         struct dentry *dentry = dentry_update->dentry;
402         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
403
404         mutex_unlock(&ei->i_fc_lock);
405         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
406         if (!node) {
407                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
408                 mutex_lock(&ei->i_fc_lock);
409                 return -ENOMEM;
410         }
411
412         node->fcd_op = dentry_update->op;
413         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
414         node->fcd_ino = inode->i_ino;
415         if (dentry->d_name.len > DNAME_INLINE_LEN) {
416                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
417                 if (!node->fcd_name.name) {
418                         kmem_cache_free(ext4_fc_dentry_cachep, node);
419                         ext4_fc_mark_ineligible(inode->i_sb,
420                                 EXT4_FC_REASON_NOMEM, NULL);
421                         mutex_lock(&ei->i_fc_lock);
422                         return -ENOMEM;
423                 }
424                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
425                         dentry->d_name.len);
426         } else {
427                 memcpy(node->fcd_iname, dentry->d_name.name,
428                         dentry->d_name.len);
429                 node->fcd_name.name = node->fcd_iname;
430         }
431         node->fcd_name.len = dentry->d_name.len;
432
433         spin_lock(&sbi->s_fc_lock);
434         if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
435                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
436                 list_add_tail(&node->fcd_list,
437                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
438         else
439                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
440         spin_unlock(&sbi->s_fc_lock);
441         mutex_lock(&ei->i_fc_lock);
442
443         return 0;
444 }
445
446 void __ext4_fc_track_unlink(handle_t *handle,
447                 struct inode *inode, struct dentry *dentry)
448 {
449         struct __track_dentry_update_args args;
450         int ret;
451
452         args.dentry = dentry;
453         args.op = EXT4_FC_TAG_UNLINK;
454
455         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
456                                         (void *)&args, 0);
457         trace_ext4_fc_track_unlink(inode, dentry, ret);
458 }
459
460 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
461 {
462         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
463 }
464
465 void __ext4_fc_track_link(handle_t *handle,
466         struct inode *inode, struct dentry *dentry)
467 {
468         struct __track_dentry_update_args args;
469         int ret;
470
471         args.dentry = dentry;
472         args.op = EXT4_FC_TAG_LINK;
473
474         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
475                                         (void *)&args, 0);
476         trace_ext4_fc_track_link(inode, dentry, ret);
477 }
478
479 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
480 {
481         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
482 }
483
484 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
485                           struct dentry *dentry)
486 {
487         struct __track_dentry_update_args args;
488         int ret;
489
490         args.dentry = dentry;
491         args.op = EXT4_FC_TAG_CREAT;
492
493         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
494                                         (void *)&args, 0);
495         trace_ext4_fc_track_create(inode, dentry, ret);
496 }
497
498 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
499 {
500         __ext4_fc_track_create(handle, d_inode(dentry), dentry);
501 }
502
503 /* __track_fn for inode tracking */
504 static int __track_inode(struct inode *inode, void *arg, bool update)
505 {
506         if (update)
507                 return -EEXIST;
508
509         EXT4_I(inode)->i_fc_lblk_len = 0;
510
511         return 0;
512 }
513
514 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
515 {
516         int ret;
517
518         if (S_ISDIR(inode->i_mode))
519                 return;
520
521         if (ext4_should_journal_data(inode)) {
522                 ext4_fc_mark_ineligible(inode->i_sb,
523                                         EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
524                 return;
525         }
526
527         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
528         trace_ext4_fc_track_inode(inode, ret);
529 }
530
531 struct __track_range_args {
532         ext4_lblk_t start, end;
533 };
534
535 /* __track_fn for tracking data updates */
536 static int __track_range(struct inode *inode, void *arg, bool update)
537 {
538         struct ext4_inode_info *ei = EXT4_I(inode);
539         ext4_lblk_t oldstart;
540         struct __track_range_args *__arg =
541                 (struct __track_range_args *)arg;
542
543         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
544                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
545                 return -ECANCELED;
546         }
547
548         oldstart = ei->i_fc_lblk_start;
549
550         if (update && ei->i_fc_lblk_len > 0) {
551                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
552                 ei->i_fc_lblk_len =
553                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
554                                 ei->i_fc_lblk_start + 1;
555         } else {
556                 ei->i_fc_lblk_start = __arg->start;
557                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
558         }
559
560         return 0;
561 }
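/*
 * Worked example for the merge above (added for clarity): if blocks
 * [10, 14] are already tracked (i_fc_lblk_start = 10, i_fc_lblk_len = 5)
 * and a new update touches [8, 12], the merged range becomes
 * start = min(10, 8) = 8 and len = max(14, 12) - 8 + 1 = 7, i.e. the
 * single range [8, 14].
 */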
562
563 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
564                          ext4_lblk_t end)
565 {
566         struct __track_range_args args;
567         int ret;
568
569         if (S_ISDIR(inode->i_mode))
570                 return;
571
572         args.start = start;
573         args.end = end;
574
575         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
576
577         trace_ext4_fc_track_range(inode, start, end, ret);
578 }
579
580 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
581 {
582         int write_flags = REQ_SYNC;
583         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
584
585         /* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
586         if (test_opt(sb, BARRIER) && is_tail)
587                 write_flags |= REQ_FUA | REQ_PREFLUSH;
588         lock_buffer(bh);
589         set_buffer_dirty(bh);
590         set_buffer_uptodate(bh);
591         bh->b_end_io = ext4_end_buffer_io_sync;
592         submit_bh(REQ_OP_WRITE, write_flags, bh);
593         EXT4_SB(sb)->s_fc_bh = NULL;
594 }
595
596 /* Ext4 commit path routines */
597
598 /* memzero and update CRC */
599 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
600                                 u32 *crc)
601 {
602         void *ret;
603
604         ret = memset(dst, 0, len);
605         if (crc)
606                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
607         return ret;
608 }
609
610 /*
611  * Allocate len bytes on a fast commit buffer.
612  *
613  * At commit time this function is used to manage fast commit
614  * block space. We don't split a fast commit log across different
615  * blocks. So this function makes sure that if there's not enough space
616  * on the current block, the remaining space in the current block is
617  * marked as unused by adding the EXT4_FC_TAG_PAD tag. In that case, a
618  * new block is requested from jbd2 and the CRC is updated to reflect
619  * the padding we added.
620  */
621 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
622 {
623         struct ext4_fc_tl *tl;
624         struct ext4_sb_info *sbi = EXT4_SB(sb);
625         struct buffer_head *bh;
626         int bsize = sbi->s_journal->j_blocksize;
627         int ret, off = sbi->s_fc_bytes % bsize;
628         int pad_len;
629
630         /*
631          * After allocating len, we should have space at least for a 0 byte
632          * padding.
633          */
634         if (len + sizeof(struct ext4_fc_tl) > bsize)
635                 return NULL;
636
637         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
638                 /*
639                  * Only allocate from current buffer if we have enough space for
640                  * this request AND we have space to add a zero byte padding.
641                  */
642                 if (!sbi->s_fc_bh) {
643                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
644                         if (ret)
645                                 return NULL;
646                         sbi->s_fc_bh = bh;
647                 }
648                 sbi->s_fc_bytes += len;
649                 return sbi->s_fc_bh->b_data + off;
650         }
651         /* Need to add PAD tag */
652         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
653         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
654         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
655         tl->fc_len = cpu_to_le16(pad_len);
656         if (crc)
657                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
658         if (pad_len > 0)
659                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
660         /* Don't leak uninitialized memory in the unused last byte. */
661         *((u8 *)(tl + 1) + pad_len) = 0;
662
663         ext4_fc_submit_bh(sb, false);
664
665         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
666         if (ret)
667                 return NULL;
668         sbi->s_fc_bh = bh;
669         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
670         return sbi->s_fc_bh->b_data;
671 }
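/*
 * Worked example for the padding math above (added for clarity), assuming a
 * 4096-byte journal block and sizeof(struct ext4_fc_tl) == 4: with
 * off = 4080 and a request of len = 24, bsize - off - 1 = 15 is less than
 * len + sizeof(tl) = 28, so a PAD TLV is written at offset 4080 with
 * pad_len = 4096 - 4080 - 1 - 4 = 11, the final byte (4095) is zeroed, and
 * the request is served from the start of a fresh jbd2 block.
 */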
672
673 /* memcpy to fc reserved space and update CRC */
674 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
675                                 int len, u32 *crc)
676 {
677         if (crc)
678                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
679         return memcpy(dst, src, len);
680 }
681
682 /*
683  * Complete a fast commit by writing the tail tag.
684  *
685  * Writing the tail tag marks the end of a fast commit. In order to guarantee
686  * atomicity, after writing the tail tag, even if there's space remaining
687  * in the block, the next commit shouldn't use it. That's why the tail tag
688  * reports its length as covering the remaining space on the block.
689  */
690 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
691 {
692         struct ext4_sb_info *sbi = EXT4_SB(sb);
693         struct ext4_fc_tl tl;
694         struct ext4_fc_tail tail;
695         int off, bsize = sbi->s_journal->j_blocksize;
696         u8 *dst;
697
698         /*
699          * there's not enough space on this block to accommodate this tail.
700          * there's no enough space on this block for accommodating this tail.
701          */
702         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
703         if (!dst)
704                 return -ENOSPC;
705
706         off = sbi->s_fc_bytes % bsize;
707
708         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
709         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
710         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
711
712         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
713         dst += sizeof(tl);
714         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
715         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
716         dst += sizeof(tail.fc_tid);
717         tail.fc_crc = cpu_to_le32(crc);
718         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
719         dst += sizeof(tail.fc_crc);
720         memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
721
722         ext4_fc_submit_bh(sb, true);
723
724         return 0;
725 }
726
727 /*
728  * Adds tag, length and value, and updates the CRC. Returns true if the TLV
729  * was added. Returns false if there's not enough space.
730  */
731 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
732                            u32 *crc)
733 {
734         struct ext4_fc_tl tl;
735         u8 *dst;
736
737         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
738         if (!dst)
739                 return false;
740
741         tl.fc_tag = cpu_to_le16(tag);
742         tl.fc_len = cpu_to_le16(len);
743
744         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
745         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
746
747         return true;
748 }
749
750 /* Same as above, but adds a dentry TLV. */
751 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
752                                    struct ext4_fc_dentry_update *fc_dentry)
753 {
754         struct ext4_fc_dentry_info fcd;
755         struct ext4_fc_tl tl;
756         int dlen = fc_dentry->fcd_name.len;
757         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
758                                         crc);
759
760         if (!dst)
761                 return false;
762
763         fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
764         fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
765         tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
766         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
767         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
768         dst += sizeof(tl);
769         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
770         dst += sizeof(fcd);
771         ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
772         dst += dlen;
773
774         return true;
775 }
776
777 /*
778  * Writes the inode in the fast commit space under an EXT4_FC_TAG_INODE TLV.
779  * Returns 0 on success, error on failure.
780  */
781 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
782 {
783         struct ext4_inode_info *ei = EXT4_I(inode);
784         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
785         int ret;
786         struct ext4_iloc iloc;
787         struct ext4_fc_inode fc_inode;
788         struct ext4_fc_tl tl;
789         u8 *dst;
790
791         ret = ext4_get_inode_loc(inode, &iloc);
792         if (ret)
793                 return ret;
794
795         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
796                 inode_len += ei->i_extra_isize;
797
798         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
799         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
800         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
801
802         ret = -ECANCELED;
803         dst = ext4_fc_reserve_space(inode->i_sb,
804                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
805         if (!dst)
806                 goto err;
807
808         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
809                 goto err;
810         dst += sizeof(tl);
811         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
812                 goto err;
813         dst += sizeof(fc_inode);
814         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
815                                         inode_len, crc))
816                 goto err;
817         ret = 0;
818 err:
819         brelse(iloc.bh);
820         return ret;
821 }
822
823 /*
824  * Writes updated data ranges for the inode in question. Updates CRC.
825  * Returns 0 on success, error otherwise.
826  */
827 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
828 {
829         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
830         struct ext4_inode_info *ei = EXT4_I(inode);
831         struct ext4_map_blocks map;
832         struct ext4_fc_add_range fc_ext;
833         struct ext4_fc_del_range lrange;
834         struct ext4_extent *ex;
835         int ret;
836
837         mutex_lock(&ei->i_fc_lock);
838         if (ei->i_fc_lblk_len == 0) {
839                 mutex_unlock(&ei->i_fc_lock);
840                 return 0;
841         }
842         old_blk_size = ei->i_fc_lblk_start;
843         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
844         ei->i_fc_lblk_len = 0;
845         mutex_unlock(&ei->i_fc_lock);
846
847         cur_lblk_off = old_blk_size;
848         ext4_debug("will try writing %d to %d for inode %ld\n",
849                    cur_lblk_off, new_blk_size, inode->i_ino);
850
851         while (cur_lblk_off <= new_blk_size) {
852                 map.m_lblk = cur_lblk_off;
853                 map.m_len = new_blk_size - cur_lblk_off + 1;
854                 ret = ext4_map_blocks(NULL, inode, &map, 0);
855                 if (ret < 0)
856                         return -ECANCELED;
857
858                 if (map.m_len == 0) {
859                         cur_lblk_off++;
860                         continue;
861                 }
862
863                 if (ret == 0) {
864                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
865                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
866                         lrange.fc_len = cpu_to_le32(map.m_len);
867                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
868                                             sizeof(lrange), (u8 *)&lrange, crc))
869                                 return -ENOSPC;
870                 } else {
871                         unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
872                                 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
873
874                         /* Limit the number of blocks in one extent */
875                         map.m_len = min(max, map.m_len);
876
877                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
878                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
879                         ex->ee_block = cpu_to_le32(map.m_lblk);
880                         ex->ee_len = cpu_to_le16(map.m_len);
881                         ext4_ext_store_pblock(ex, map.m_pblk);
882                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
883                                 ext4_ext_mark_unwritten(ex);
884                         else
885                                 ext4_ext_mark_initialized(ex);
886                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
887                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
888                                 return -ENOSPC;
889                 }
890
891                 cur_lblk_off += map.m_len;
892         }
893
894         return 0;
895 }
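/*
 * Worked example for the loop above (added for clarity): if the tracked
 * range is blocks [0, 9], blocks [0, 4] are mapped and [5, 9] are a hole,
 * the first ext4_map_blocks() call returns a 5-block extent that is logged
 * as EXT4_FC_TAG_ADD_RANGE, and the second returns 0 with m_len describing
 * the hole, which is logged as EXT4_FC_TAG_DEL_RANGE; cur_lblk_off then
 * passes new_blk_size and the loop terminates.
 */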
896
897
898 /* Submit data for all the fast commit inodes */
899 static int ext4_fc_submit_inode_data_all(journal_t *journal)
900 {
901         struct super_block *sb = (struct super_block *)(journal->j_private);
902         struct ext4_sb_info *sbi = EXT4_SB(sb);
903         struct ext4_inode_info *ei;
904         int ret = 0;
905
906         spin_lock(&sbi->s_fc_lock);
907         list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
908                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
909                 while (atomic_read(&ei->i_fc_updates)) {
910                         DEFINE_WAIT(wait);
911
912                         prepare_to_wait(&ei->i_fc_wait, &wait,
913                                                 TASK_UNINTERRUPTIBLE);
914                         if (atomic_read(&ei->i_fc_updates)) {
915                                 spin_unlock(&sbi->s_fc_lock);
916                                 schedule();
917                                 spin_lock(&sbi->s_fc_lock);
918                         }
919                         finish_wait(&ei->i_fc_wait, &wait);
920                 }
921                 spin_unlock(&sbi->s_fc_lock);
922                 ret = jbd2_submit_inode_data(ei->jinode);
923                 if (ret)
924                         return ret;
925                 spin_lock(&sbi->s_fc_lock);
926         }
927         spin_unlock(&sbi->s_fc_lock);
928
929         return ret;
930 }
931
932 /* Wait for completion of data for all the fast commit inodes */
933 static int ext4_fc_wait_inode_data_all(journal_t *journal)
934 {
935         struct super_block *sb = (struct super_block *)(journal->j_private);
936         struct ext4_sb_info *sbi = EXT4_SB(sb);
937         struct ext4_inode_info *pos, *n;
938         int ret = 0;
939
940         spin_lock(&sbi->s_fc_lock);
941         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
942                 if (!ext4_test_inode_state(&pos->vfs_inode,
943                                            EXT4_STATE_FC_COMMITTING))
944                         continue;
945                 spin_unlock(&sbi->s_fc_lock);
946
947                 ret = jbd2_wait_inode_data(journal, pos->jinode);
948                 if (ret)
949                         return ret;
950                 spin_lock(&sbi->s_fc_lock);
951         }
952         spin_unlock(&sbi->s_fc_lock);
953
954         return 0;
955 }
956
957 /* Commit all the directory entry updates */
958 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
959 __acquires(&sbi->s_fc_lock)
960 __releases(&sbi->s_fc_lock)
961 {
962         struct super_block *sb = (struct super_block *)(journal->j_private);
963         struct ext4_sb_info *sbi = EXT4_SB(sb);
964         struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
965         struct inode *inode;
966         struct ext4_inode_info *ei, *ei_n;
967         int ret;
968
969         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
970                 return 0;
971         list_for_each_entry_safe(fc_dentry, fc_dentry_n,
972                                  &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
973                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
974                         spin_unlock(&sbi->s_fc_lock);
975                         if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
976                                 ret = -ENOSPC;
977                                 goto lock_and_exit;
978                         }
979                         spin_lock(&sbi->s_fc_lock);
980                         continue;
981                 }
982
983                 inode = NULL;
984                 list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
985                                          i_fc_list) {
986                         if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
987                                 inode = &ei->vfs_inode;
988                                 break;
989                         }
990                 }
991                 /*
992                  * If we don't find the inode in our list, then it was deleted,
993                  * in which case we don't need to record its create tag.
994                  */
995                 if (!inode)
996                         continue;
997                 spin_unlock(&sbi->s_fc_lock);
998
999                 /*
1000                  * We first write the inode and then the create dirent. This
1001                  * allows the recovery code to create an unnamed inode first
1002                  * and then link it to a directory entry. This allows us
1003                  * to use namei.c routines almost as is and simplifies
1004                  * the recovery code.
1005                  */
1006                 ret = ext4_fc_write_inode(inode, crc);
1007                 if (ret)
1008                         goto lock_and_exit;
1009
1010                 ret = ext4_fc_write_inode_data(inode, crc);
1011                 if (ret)
1012                         goto lock_and_exit;
1013
1014                 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1015                         ret = -ENOSPC;
1016                         goto lock_and_exit;
1017                 }
1018
1019                 spin_lock(&sbi->s_fc_lock);
1020         }
1021         return 0;
1022 lock_and_exit:
1023         spin_lock(&sbi->s_fc_lock);
1024         return ret;
1025 }
1026
1027 static int ext4_fc_perform_commit(journal_t *journal)
1028 {
1029         struct super_block *sb = (struct super_block *)(journal->j_private);
1030         struct ext4_sb_info *sbi = EXT4_SB(sb);
1031         struct ext4_inode_info *iter;
1032         struct ext4_fc_head head;
1033         struct inode *inode;
1034         struct blk_plug plug;
1035         int ret = 0;
1036         u32 crc = 0;
1037
1038         ret = ext4_fc_submit_inode_data_all(journal);
1039         if (ret)
1040                 return ret;
1041
1042         ret = ext4_fc_wait_inode_data_all(journal);
1043         if (ret)
1044                 return ret;
1045
1046         /*
1047          * If file system device is different from journal device, issue a cache
1048          * flush before we start writing fast commit blocks.
1049          */
1050         if (journal->j_fs_dev != journal->j_dev)
1051                 blkdev_issue_flush(journal->j_fs_dev);
1052
1053         blk_start_plug(&plug);
1054         if (sbi->s_fc_bytes == 0) {
1055                 /*
1056                  * Add a head tag only if this is the first fast commit
1057                  * in this TID.
1058                  */
1059                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1060                 head.fc_tid = cpu_to_le32(
1061                         sbi->s_journal->j_running_transaction->t_tid);
1062                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1063                         (u8 *)&head, &crc)) {
1064                         ret = -ENOSPC;
1065                         goto out;
1066                 }
1067         }
1068
1069         spin_lock(&sbi->s_fc_lock);
1070         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1071         if (ret) {
1072                 spin_unlock(&sbi->s_fc_lock);
1073                 goto out;
1074         }
1075
1076         list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1077                 inode = &iter->vfs_inode;
1078                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1079                         continue;
1080
1081                 spin_unlock(&sbi->s_fc_lock);
1082                 ret = ext4_fc_write_inode_data(inode, &crc);
1083                 if (ret)
1084                         goto out;
1085                 ret = ext4_fc_write_inode(inode, &crc);
1086                 if (ret)
1087                         goto out;
1088                 spin_lock(&sbi->s_fc_lock);
1089         }
1090         spin_unlock(&sbi->s_fc_lock);
1091
1092         ret = ext4_fc_write_tail(sb, crc);
1093
1094 out:
1095         blk_finish_plug(&plug);
1096         return ret;
1097 }
1098
1099 static void ext4_fc_update_stats(struct super_block *sb, int status,
1100                                  u64 commit_time, int nblks)
1101 {
1102         struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1103
1104         ext4_debug("Fast commit ended with status = %d", status);
1105         if (status == EXT4_FC_STATUS_OK) {
1106                 stats->fc_num_commits++;
1107                 stats->fc_numblks += nblks;
1108                 if (likely(stats->s_fc_avg_commit_time))
1109                         stats->s_fc_avg_commit_time =
1110                                 (commit_time +
1111                                  stats->s_fc_avg_commit_time * 3) / 4;
1112                 else
1113                         stats->s_fc_avg_commit_time = commit_time;
1114         } else if (status == EXT4_FC_STATUS_FAILED ||
1115                    status == EXT4_FC_STATUS_INELIGIBLE) {
1116                 if (status == EXT4_FC_STATUS_FAILED)
1117                         stats->fc_failed_commits++;
1118                 stats->fc_ineligible_commits++;
1119         } else {
1120                 stats->fc_skipped_commits++;
1121         }
1122         trace_ext4_fc_commit_stop(sb, nblks, status);
1123 }
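/*
 * Worked example for the averaging above (added for clarity): with a
 * running average of 100000ns and a new commit taking 200000ns, the new
 * average is (200000 + 3 * 100000) / 4 = 125000ns, i.e. a 1/4-weight
 * exponential moving average that dampens one-off spikes.
 */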
1124
1125 /*
1126  * The main commit entry point. Performs a fast commit for transaction
1127  * commit_tid if needed. If it's not possible to perform a fast commit
1128  * due to various reasons, we fall back to full commit. Returns 0
1129  * on success, error otherwise.
1130  */
1131 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1132 {
1133         struct super_block *sb = (struct super_block *)(journal->j_private);
1134         struct ext4_sb_info *sbi = EXT4_SB(sb);
1135         int nblks = 0, ret, bsize = journal->j_blocksize;
1136         int subtid = atomic_read(&sbi->s_fc_subtid);
1137         int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1138         ktime_t start_time, commit_time;
1139
1140         trace_ext4_fc_commit_start(sb);
1141
1142         start_time = ktime_get();
1143
1144         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1145                 return jbd2_complete_transaction(journal, commit_tid);
1146
1147 restart_fc:
1148         ret = jbd2_fc_begin_commit(journal, commit_tid);
1149         if (ret == -EALREADY) {
1150                 /* There was an ongoing commit, check if we need to restart */
1151                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1152                         commit_tid > journal->j_commit_sequence)
1153                         goto restart_fc;
1154                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
1155                 return 0;
1156         } else if (ret) {
1157                 /*
1158                  * Commit couldn't start. Just update stats and perform a
1159                  * full commit.
1160                  */
1161                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
1162                 return jbd2_complete_transaction(journal, commit_tid);
1163         }
1164
1165         /*
1166          * After establishing journal barrier via jbd2_fc_begin_commit(), check
1167          * if we are fast commit ineligible.
1168          */
1169         if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1170                 status = EXT4_FC_STATUS_INELIGIBLE;
1171                 goto fallback;
1172         }
1173
1174         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1175         ret = ext4_fc_perform_commit(journal);
1176         if (ret < 0) {
1177                 status = EXT4_FC_STATUS_FAILED;
1178                 goto fallback;
1179         }
1180         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1181         ret = jbd2_fc_wait_bufs(journal, nblks);
1182         if (ret < 0) {
1183                 status = EXT4_FC_STATUS_FAILED;
1184                 goto fallback;
1185         }
1186         atomic_inc(&sbi->s_fc_subtid);
1187         ret = jbd2_fc_end_commit(journal);
1188         /*
1189          * weight the commit time higher than the average time so we
1190          * don't react too strongly to vast changes in the commit time
1191          */
1192         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1193         ext4_fc_update_stats(sb, status, commit_time, nblks);
1194         return ret;
1195
1196 fallback:
1197         ret = jbd2_fc_end_commit_fallback(journal);
1198         ext4_fc_update_stats(sb, status, 0, 0);
1199         return ret;
1200 }
1201
1202 /*
1203  * Fast commit cleanup routine. This is called after every fast commit and
1204  * full commit. full is true if we are called after a full commit.
1205  */
1206 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1207 {
1208         struct super_block *sb = journal->j_private;
1209         struct ext4_sb_info *sbi = EXT4_SB(sb);
1210         struct ext4_inode_info *iter, *iter_n;
1211         struct ext4_fc_dentry_update *fc_dentry;
1212
1213         if (full && sbi->s_fc_bh)
1214                 sbi->s_fc_bh = NULL;
1215
1216         jbd2_fc_release_bufs(journal);
1217
1218         spin_lock(&sbi->s_fc_lock);
1219         list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1220                                  i_fc_list) {
1221                 list_del_init(&iter->i_fc_list);
1222                 ext4_clear_inode_state(&iter->vfs_inode,
1223                                        EXT4_STATE_FC_COMMITTING);
1224                 if (iter->i_sync_tid <= tid)
1225                         ext4_fc_reset_inode(&iter->vfs_inode);
1226                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1227                 smp_mb();
1228 #if (BITS_PER_LONG < 64)
1229                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1230 #else
1231                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1232 #endif
1233         }
1234
1235         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1236                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1237                                              struct ext4_fc_dentry_update,
1238                                              fcd_list);
1239                 list_del_init(&fc_dentry->fcd_list);
1240                 spin_unlock(&sbi->s_fc_lock);
1241
1242                 if (fc_dentry->fcd_name.name &&
1243                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1244                         kfree(fc_dentry->fcd_name.name);
1245                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1246                 spin_lock(&sbi->s_fc_lock);
1247         }
1248
1249         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1250                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1251         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1252                                 &sbi->s_fc_q[FC_Q_MAIN]);
1253
1254         if (tid >= sbi->s_fc_ineligible_tid) {
1255                 sbi->s_fc_ineligible_tid = 0;
1256                 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1257         }
1258
1259         if (full)
1260                 sbi->s_fc_bytes = 0;
1261         spin_unlock(&sbi->s_fc_lock);
1262         trace_ext4_fc_stats(sb);
1263 }
1264
1265 /* Ext4 Replay Path Routines */
1266
1267 /* Helper struct for dentry replay routines */
1268 struct dentry_info_args {
1269         int parent_ino, dname_len, ino, inode_len;
1270         char *dname;
1271 };
1272
1273 static inline void tl_to_darg(struct dentry_info_args *darg,
1274                               struct  ext4_fc_tl *tl, u8 *val)
1275 {
1276         struct ext4_fc_dentry_info fcd;
1277
1278         memcpy(&fcd, val, sizeof(fcd));
1279
1280         darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1281         darg->ino = le32_to_cpu(fcd.fc_ino);
1282         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1283         darg->dname_len = le16_to_cpu(tl->fc_len) -
1284                 sizeof(struct ext4_fc_dentry_info);
1285 }
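/*
 * Worked example for the conversion above (added for clarity): for a dentry
 * TLV whose fc_len is sizeof(struct ext4_fc_dentry_info) + 4, the last 4
 * bytes of the value are the dentry name (not NUL-terminated), so
 * dname_len = 4 and dname points just past the fixed-size header.
 */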
1286
1287 /* Unlink replay function */
1288 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1289                                  u8 *val)
1290 {
1291         struct inode *inode, *old_parent;
1292         struct qstr entry;
1293         struct dentry_info_args darg;
1294         int ret = 0;
1295
1296         tl_to_darg(&darg, tl, val);
1297
1298         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1299                         darg.parent_ino, darg.dname_len);
1300
1301         entry.name = darg.dname;
1302         entry.len = darg.dname_len;
1303         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1304
1305         if (IS_ERR(inode)) {
1306                 ext4_debug("Inode %d not found", darg.ino);
1307                 return 0;
1308         }
1309
1310         old_parent = ext4_iget(sb, darg.parent_ino,
1311                                 EXT4_IGET_NORMAL);
1312         if (IS_ERR(old_parent)) {
1313                 ext4_debug("Dir with inode %d not found", darg.parent_ino);
1314                 iput(inode);
1315                 return 0;
1316         }
1317
1318         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1319         /* -ENOENT is OK because the entry might not exist anymore. */
1320         if (ret == -ENOENT)
1321                 ret = 0;
1322         iput(old_parent);
1323         iput(inode);
1324         return ret;
1325 }
1326
1327 static int ext4_fc_replay_link_internal(struct super_block *sb,
1328                                 struct dentry_info_args *darg,
1329                                 struct inode *inode)
1330 {
1331         struct inode *dir = NULL;
1332         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1333         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1334         int ret = 0;
1335
1336         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1337         if (IS_ERR(dir)) {
1338                 ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1339                 dir = NULL;
1340                 goto out;
1341         }
1342
1343         dentry_dir = d_obtain_alias(dir);
1344         if (IS_ERR(dentry_dir)) {
1345                 ext4_debug("Failed to obtain dentry");
1346                 dentry_dir = NULL;
1347                 goto out;
1348         }
1349
1350         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1351         if (!dentry_inode) {
1352                 ext4_debug("Inode dentry not created.");
1353                 ret = -ENOMEM;
1354                 goto out;
1355         }
1356
1357         ret = __ext4_link(dir, inode, dentry_inode);
1358         /*
1359          * It's possible that the link already existed since the data blocks
1360          * for the dir in question got persisted before we crashed, OR
1361          * we replayed this tag and crashed before the entire replay
1362          * could complete.
1363          */
1364         if (ret && ret != -EEXIST) {
1365                 ext4_debug("Failed to link\n");
1366                 goto out;
1367         }
1368
1369         ret = 0;
1370 out:
1371         if (dentry_dir) {
1372                 d_drop(dentry_dir);
1373                 dput(dentry_dir);
1374         } else if (dir) {
1375                 iput(dir);
1376         }
1377         if (dentry_inode) {
1378                 d_drop(dentry_inode);
1379                 dput(dentry_inode);
1380         }
1381
1382         return ret;
1383 }
1384
1385 /* Link replay function */
1386 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1387                                u8 *val)
1388 {
1389         struct inode *inode;
1390         struct dentry_info_args darg;
1391         int ret = 0;
1392
1393         tl_to_darg(&darg, tl, val);
1394         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1395                         darg.parent_ino, darg.dname_len);
1396
1397         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1398         if (IS_ERR(inode)) {
1399                 ext4_debug("Inode not found.");
1400                 return 0;
1401         }
1402
1403         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1404         iput(inode);
1405         return ret;
1406 }
1407
1408 /*
1409  * Record all the modified inodes during replay. We use this later to set up
1410  * block bitmaps correctly.
1411  */
1412 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1413 {
1414         struct ext4_fc_replay_state *state;
1415         int i;
1416
1417         state = &EXT4_SB(sb)->s_fc_replay_state;
1418         for (i = 0; i < state->fc_modified_inodes_used; i++)
1419                 if (state->fc_modified_inodes[i] == ino)
1420                         return 0;
1421         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1422                 int *fc_modified_inodes;
1423
1424                 fc_modified_inodes = krealloc(state->fc_modified_inodes,
1425                                 sizeof(int) * (state->fc_modified_inodes_size +
1426                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1427                                 GFP_KERNEL);
1428                 if (!fc_modified_inodes)
1429                         return -ENOMEM;
1430                 state->fc_modified_inodes = fc_modified_inodes;
1431                 state->fc_modified_inodes_size +=
1432                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1433         }
1434         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1435         return 0;
1436 }
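/*
 * The open-coded krealloc pattern above (grow the array by
 * EXT4_FC_REPLAY_REALLOC_INCREMENT entries whenever it fills up) is used
 * again by ext4_fc_record_regions() below. A minimal sketch of the same
 * pattern factored into a helper; fc_grow_int_array() is illustrative
 * only and is not part of ext4.
 */
static int __maybe_unused fc_grow_int_array(int **arr, int *size, int used)
{
        int *tmp;

        if (used < *size)
                return 0;

        tmp = krealloc(*arr, sizeof(int) *
                       (*size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL);
        if (!tmp)
                return -ENOMEM;

        /* On success, adopt the (possibly moved) buffer and its new size. */
        *arr = tmp;
        *size += EXT4_FC_REPLAY_REALLOC_INCREMENT;
        return 0;
}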
1437
1438 /*
1439  * Inode replay function
1440  */
1441 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1442                                 u8 *val)
1443 {
1444         struct ext4_fc_inode fc_inode;
1445         struct ext4_inode *raw_inode;
1446         struct ext4_inode *raw_fc_inode;
1447         struct inode *inode = NULL;
1448         struct ext4_iloc iloc;
1449         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1450         struct ext4_extent_header *eh;
1451
1452         memcpy(&fc_inode, val, sizeof(fc_inode));
1453
1454         ino = le32_to_cpu(fc_inode.fc_ino);
1455         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1456
1457         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1458         if (!IS_ERR(inode)) {
1459                 ext4_ext_clear_bb(inode);
1460                 iput(inode);
1461         }
1462         inode = NULL;
1463
1464         ret = ext4_fc_record_modified_inode(sb, ino);
1465         if (ret)
1466                 goto out;
1467
1468         raw_fc_inode = (struct ext4_inode *)
1469                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1470         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1471         if (ret)
1472                 goto out;
1473
1474         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1475         raw_inode = ext4_raw_inode(&iloc);
1476
1477         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1478         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1479                 inode_len - offsetof(struct ext4_inode, i_generation));
1480         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1481                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1482                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1483                         memset(eh, 0, sizeof(*eh));
1484                         eh->eh_magic = EXT4_EXT_MAGIC;
1485                         eh->eh_max = cpu_to_le16(
1486                                 (sizeof(raw_inode->i_block) -
1487                                  sizeof(struct ext4_extent_header))
1488                                  / sizeof(struct ext4_extent));
1489                 }
1490         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1491                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1492                         sizeof(raw_inode->i_block));
1493         }
1494
1495         /* Immediately update the inode on disk. */
1496         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1497         if (ret)
1498                 goto out;
1499         ret = sync_dirty_buffer(iloc.bh);
1500         if (ret)
1501                 goto out;
1502         ret = ext4_mark_inode_used(sb, ino);
1503         if (ret)
1504                 goto out;
1505
1506         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1507         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1508         if (IS_ERR(inode)) {
1509                 ext4_debug("Inode not found.");
1510                 return -EFSCORRUPTED;
1511         }
1512
1513         /*
1514          * Our allocator could have made different decisions than before
1515          * crashing. This should be fixed but until then, we recalculate
1516          * the number of blocks used by the inode.
1517          */
1518         ext4_ext_replay_set_iblocks(inode);
1519
1520         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1521         ext4_reset_inode_seed(inode);
1522
1523         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1524         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1525         sync_dirty_buffer(iloc.bh);
1526         brelse(iloc.bh);
1527 out:
1528         iput(inode);
1529         if (!ret)
1530                 blkdev_issue_flush(sb->s_bdev);
1531
1532         return 0;
1533 }
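/*
 * For reference, the INODE tag's value parsed above is laid out as a
 * struct ext4_fc_inode: a little-endian inode number directly followed by
 * the raw on-disk inode, i.e. fc_len - sizeof(struct ext4_fc_inode) bytes
 * of struct ext4_inode. A hedged sketch of that parse in isolation;
 * fc_inode_payload() is illustrative only and is not part of ext4.
 */
static void __maybe_unused fc_inode_payload(u8 *val, u16 fc_len, int *ino,
                                            struct ext4_inode **raw_inode,
                                            int *raw_len)
{
        struct ext4_fc_inode fc_inode;

        /* The value may be unaligned, so copy the header out first. */
        memcpy(&fc_inode, val, sizeof(fc_inode));
        *ino = le32_to_cpu(fc_inode.fc_ino);
        *raw_inode = (struct ext4_inode *)
                (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
        *raw_len = fc_len - sizeof(struct ext4_fc_inode);
}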
1534
1535 /*
1536  * Dentry create replay function.
1537  *
1538  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
1539  * inode for which we are trying to create a dentry here should already have
1540  * been replayed before we get here.
1541  */
1542 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1543                                  u8 *val)
1544 {
1545         int ret = 0;
1546         struct inode *inode = NULL;
1547         struct inode *dir = NULL;
1548         struct dentry_info_args darg;
1549
1550         tl_to_darg(&darg, tl, val);
1551
1552         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1553                         darg.parent_ino, darg.dname_len);
1554
1555         /* This takes care of updating the group descriptor and other metadata */
1556         ret = ext4_mark_inode_used(sb, darg.ino);
1557         if (ret)
1558                 goto out;
1559
1560         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1561         if (IS_ERR(inode)) {
1562                 ext4_debug("inode %d not found.", darg.ino);
1563                 inode = NULL;
1564                 ret = -EINVAL;
1565                 goto out;
1566         }
1567
1568         if (S_ISDIR(inode->i_mode)) {
1569                 /*
1570                  * If we are creating a directory, we need to make sure that the
1571                  * dot and dot dot dirents are set up properly.
1572                  */
1573                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1574                 if (IS_ERR(dir)) {
1575                         ext4_debug("Dir %d not found.", darg.parent_ino);
1576                         goto out;
1577                 }
1578                 ret = ext4_init_new_dir(NULL, dir, inode);
1579                 iput(dir);
1580                 if (ret) {
1581                         ret = 0;
1582                         goto out;
1583                 }
1584         }
1585         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1586         if (ret)
1587                 goto out;
1588         set_nlink(inode, 1);
1589         ext4_mark_inode_dirty(NULL, inode);
1590 out:
1591         if (inode)
1592                 iput(inode);
1593         return ret;
1594 }
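/*
 * Illustrative example (the exact stream depends on what was committed):
 * after "mkdir d" in a fast-committed directory, the fast commit area
 * contains roughly
 *
 *      HEAD ... INODE (d's inode) ... CREAT (d in parent) ... TAIL
 *
 * so by the time this CREAT handler runs, ext4_fc_replay_inode() has
 * already written d's inode to disk, the ext4_iget() above finds it, and
 * only the "." and ".." dirents still need ext4_init_new_dir().
 */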
1595
1596 /*
1597  * Record physical disk regions that, according to the fast commit area,
1598  * are in use by inodes during the replay phase. Our simple replay-phase
1599  * allocator excludes these regions from allocation.
1600  */
1601 int ext4_fc_record_regions(struct super_block *sb, int ino,
1602                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1603 {
1604         struct ext4_fc_replay_state *state;
1605         struct ext4_fc_alloc_region *region;
1606
1607         state = &EXT4_SB(sb)->s_fc_replay_state;
1608         /*
1609          * During the replay phase, fc_regions_valid may not be the same as
1610          * fc_regions_used; bring it up to date before making new additions.
1611          */
1612         if (replay && state->fc_regions_used != state->fc_regions_valid)
1613                 state->fc_regions_used = state->fc_regions_valid;
1614         if (state->fc_regions_used == state->fc_regions_size) {
1615                 struct ext4_fc_alloc_region *fc_regions;
1616
1617                 fc_regions = krealloc(state->fc_regions,
1618                                       sizeof(struct ext4_fc_alloc_region) *
1619                                       (state->fc_regions_size +
1620                                        EXT4_FC_REPLAY_REALLOC_INCREMENT),
1621                                       GFP_KERNEL);
1622                 if (!fc_regions)
1623                         return -ENOMEM;
1624                 state->fc_regions_size +=
1625                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1626                 state->fc_regions = fc_regions;
1627         }
1628         region = &state->fc_regions[state->fc_regions_used++];
1629         region->ino = ino;
1630         region->lblk = lblk;
1631         region->pblk = pblk;
1632         region->len = len;
1633
1634         if (replay)
1635                 state->fc_regions_valid++;
1636
1637         return 0;
1638 }
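/*
 * Usage sketch with made-up numbers: when the scan phase sees an
 * ADD_RANGE tag mapping logical block 0 of inode 12 to 8 physical blocks
 * starting at 34816, it records
 *
 *      ext4_fc_record_regions(sb, 12, 0, 34816, 8, 0);
 *
 * and once the enclosing fast commit's TAIL checksum verifies, blocks
 * 34816..34823 become off-limits to the replay-phase allocator via
 * ext4_fc_replay_check_excluded() below.
 */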
1639
1640 /* Replay add range tag */
1641 static int ext4_fc_replay_add_range(struct super_block *sb,
1642                                     struct ext4_fc_tl *tl, u8 *val)
1643 {
1644         struct ext4_fc_add_range fc_add_ex;
1645         struct ext4_extent newex, *ex;
1646         struct inode *inode;
1647         ext4_lblk_t start, cur;
1648         int remaining, len;
1649         ext4_fsblk_t start_pblk;
1650         struct ext4_map_blocks map;
1651         struct ext4_ext_path *path = NULL;
1652         int ret;
1653
1654         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1655         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1656
1657         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1658                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1659                 ext4_ext_get_actual_len(ex));
1660
1661         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1662         if (IS_ERR(inode)) {
1663                 ext4_debug("Inode not found.");
1664                 return 0;
1665         }
1666
1667         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1668         if (ret)
1669                 goto out;
1670
1671         start = le32_to_cpu(ex->ee_block);
1672         start_pblk = ext4_ext_pblock(ex);
1673         len = ext4_ext_get_actual_len(ex);
1674
1675         cur = start;
1676         remaining = len;
1677         ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1678                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1679                   inode->i_ino);
1680
1681         while (remaining > 0) {
1682                 map.m_lblk = cur;
1683                 map.m_len = remaining;
1684                 map.m_pblk = 0;
1685                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1686
1687                 if (ret < 0)
1688                         goto out;
1689
1690                 if (ret == 0) {
1691                         /* Range is not mapped */
1692                         path = ext4_find_extent(inode, cur, NULL, 0);
1693                         if (IS_ERR(path))
1694                                 goto out;
1695                         memset(&newex, 0, sizeof(newex));
1696                         newex.ee_block = cpu_to_le32(cur);
1697                         ext4_ext_store_pblock(
1698                                 &newex, start_pblk + cur - start);
1699                         newex.ee_len = cpu_to_le16(map.m_len);
1700                         if (ext4_ext_is_unwritten(ex))
1701                                 ext4_ext_mark_unwritten(&newex);
1702                         down_write(&EXT4_I(inode)->i_data_sem);
1703                         ret = ext4_ext_insert_extent(
1704                                 NULL, inode, &path, &newex, 0);
1705                         up_write(&EXT4_I(inode)->i_data_sem);
1706                         ext4_ext_drop_refs(path);
1707                         kfree(path);
1708                         if (ret)
1709                                 goto out;
1710                         goto next;
1711                 }
1712
1713                 if (start_pblk + cur - start != map.m_pblk) {
1714                         /*
1715                          * Logical to physical mapping changed. This can happen
1716                          * if this range was removed and then reallocated to
1717                          * map to new physical blocks during a fast commit.
1718                          */
1719                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1720                                         ext4_ext_is_unwritten(ex),
1721                                         start_pblk + cur - start);
1722                         if (ret)
1723                                 goto out;
1724                         /*
1725                          * Mark the old blocks as free since they aren't used
1726                          * anymore. We maintain an array of all the modified
1727                          * inodes. In case these blocks are still used at either
1728                          * a different logical range in the same inode or in
1729                          * some different inode, we will mark them as allocated
1730                          * at the end of the FC replay using our array of
1731                          * modified inodes.
1732                          */
1733                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1734                         goto next;
1735                 }
1736
1737                 /* Range is mapped and needs a state change */
1738                 ext4_debug("Converting from %d to %d %lld",
1739                                 !!(map.m_flags & EXT4_MAP_UNWRITTEN),
1740                         ext4_ext_is_unwritten(ex), map.m_pblk);
1741                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1742                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1743                 if (ret)
1744                         goto out;
1745                 /*
1746                  * We may have split the extent tree while toggling the state.
1747                  * Try to shrink the extent tree now.
1748                  */
1749                 ext4_ext_replay_shrink_inode(inode, start + len);
1750 next:
1751                 cur += map.m_len;
1752                 remaining -= map.m_len;
1753         }
1754         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1755                                         sb->s_blocksize_bits);
1756 out:
1757         iput(inode);
1758         return 0;
1759 }
1760
1761 /* Replay DEL_RANGE tag */
1762 static int
1763 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1764                          u8 *val)
1765 {
1766         struct inode *inode;
1767         struct ext4_fc_del_range lrange;
1768         struct ext4_map_blocks map;
1769         ext4_lblk_t cur, remaining;
1770         int ret;
1771
1772         memcpy(&lrange, val, sizeof(lrange));
1773         cur = le32_to_cpu(lrange.fc_lblk);
1774         remaining = le32_to_cpu(lrange.fc_len);
1775
1776         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1777                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1778
1779         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1780         if (IS_ERR(inode)) {
1781                 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1782                 return 0;
1783         }
1784
1785         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1786         if (ret)
1787                 goto out;
1788
1789         ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1790                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1791                         le32_to_cpu(lrange.fc_len));
1792         while (remaining > 0) {
1793                 map.m_lblk = cur;
1794                 map.m_len = remaining;
1795
1796                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1797                 if (ret < 0)
1798                         goto out;
1799                 if (ret > 0) {
1800                         remaining -= ret;
1801                         cur += ret;
1802                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1803                 } else {
1804                         remaining -= map.m_len;
1805                         cur += map.m_len;
1806                 }
1807         }
1808
1809         down_write(&EXT4_I(inode)->i_data_sem);
1810         ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1811                                 le32_to_cpu(lrange.fc_lblk) +
1812                                 le32_to_cpu(lrange.fc_len) - 1);
1813         up_write(&EXT4_I(inode)->i_data_sem);
1814         if (ret)
1815                 goto out;
1816         ext4_ext_replay_shrink_inode(inode,
1817                 i_size_read(inode) >> sb->s_blocksize_bits);
1818         ext4_mark_inode_dirty(NULL, inode);
1819 out:
1820         iput(inode);
1821         return 0;
1822 }
1823
1824 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1825 {
1826         struct ext4_fc_replay_state *state;
1827         struct inode *inode;
1828         struct ext4_ext_path *path = NULL;
1829         struct ext4_map_blocks map;
1830         int i, ret, j;
1831         ext4_lblk_t cur, end;
1832
1833         state = &EXT4_SB(sb)->s_fc_replay_state;
1834         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1835                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1836                         EXT4_IGET_NORMAL);
1837                 if (IS_ERR(inode)) {
1838                         ext4_debug("Inode %d not found.",
1839                                 state->fc_modified_inodes[i]);
1840                         continue;
1841                 }
1842                 cur = 0;
1843                 end = EXT_MAX_BLOCKS;
1844                 while (cur < end) {
1845                         map.m_lblk = cur;
1846                         map.m_len = end - cur;
1847
1848                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1849                         if (ret < 0)
1850                                 break;
1851
1852                         if (ret > 0) {
1853                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1854                                 if (!IS_ERR(path)) {
1855                                         for (j = 0; j < path->p_depth; j++)
1856                                                 ext4_mb_mark_bb(inode->i_sb,
1857                                                         path[j].p_block, 1, 1);
1858                                         ext4_ext_drop_refs(path);
1859                                         kfree(path);
1860                                 }
1861                                 cur += ret;
1862                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1863                                                         map.m_len, 1);
1864                         } else {
1865                                 cur = cur + (map.m_len ? map.m_len : 1);
1866                         }
1867                 }
1868                 iput(inode);
1869         }
1870 }
1871
1872 /*
1873  * Check if a block is in the excluded regions for block allocation. The
1874  * simple allocator that runs during the replay phase calls this function
1875  * to see if it is okay to use a block.
1876  */
1877 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1878 {
1879         int i;
1880         struct ext4_fc_replay_state *state;
1881
1882         state = &EXT4_SB(sb)->s_fc_replay_state;
1883         for (i = 0; i < state->fc_regions_valid; i++) {
1884                 if (state->fc_regions[i].ino == 0 ||
1885                         state->fc_regions[i].len == 0)
1886                         continue;
1887                 if (blk >= state->fc_regions[i].pblk &&
1888                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1889                         return true;
1890         }
1891         return false;
1892 }
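/*
 * A minimal sketch (hypothetical helper, not part of ext4) of how a
 * replay-phase allocator can use the check above: scan a physical block
 * range and return the first block that is not excluded, or 0 if every
 * block in the range is excluded.
 */
static ext4_fsblk_t __maybe_unused
fc_replay_first_usable_block(struct super_block *sb, ext4_fsblk_t start,
                             ext4_fsblk_t end)
{
        ext4_fsblk_t blk;

        for (blk = start; blk < end; blk++)
                if (!ext4_fc_replay_check_excluded(sb, blk))
                        return blk;
        return 0;
}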
1893
1894 /* Cleanup function called after replay */
1895 void ext4_fc_replay_cleanup(struct super_block *sb)
1896 {
1897         struct ext4_sb_info *sbi = EXT4_SB(sb);
1898
1899         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1900         kfree(sbi->s_fc_replay_state.fc_regions);
1901         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1902 }
1903
1904 /*
1905  * Recovery Scan phase handler
1906  *
1907  * This function is called during the scan phase and is responsible
1908  * for doing the following things:
1909  * - Make sure the fast commit area has valid tags for replay
1910  * - Count the number of tags that need to be replayed by the replay handler
1911  * - Verify the CRC
1912  * - Create a list of excluded blocks for allocation during the replay phase
1913  *
1914  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
1915  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1916  * to indicate that the scan has finished and JBD2 can now start the replay
1917  * phase. It returns a negative error to indicate that there was an error. At
1918  * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
1919  * is set to the number of tags that need to be replayed during the replay phase.
1920  */
1921 static int ext4_fc_replay_scan(journal_t *journal,
1922                                 struct buffer_head *bh, int off,
1923                                 tid_t expected_tid)
1924 {
1925         struct super_block *sb = journal->j_private;
1926         struct ext4_sb_info *sbi = EXT4_SB(sb);
1927         struct ext4_fc_replay_state *state;
1928         int ret = JBD2_FC_REPLAY_CONTINUE;
1929         struct ext4_fc_add_range ext;
1930         struct ext4_fc_tl tl;
1931         struct ext4_fc_tail tail;
1932         __u8 *start, *end, *cur, *val;
1933         struct ext4_fc_head head;
1934         struct ext4_extent *ex;
1935
1936         state = &sbi->s_fc_replay_state;
1937
1938         start = (u8 *)bh->b_data;
1939         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1940
1941         if (state->fc_replay_expected_off == 0) {
1942                 state->fc_cur_tag = 0;
1943                 state->fc_replay_num_tags = 0;
1944                 state->fc_crc = 0;
1945                 state->fc_regions = NULL;
1946                 state->fc_regions_valid = state->fc_regions_used =
1947                         state->fc_regions_size = 0;
1948                 /* Check if we can stop early */
1949                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1950                         != EXT4_FC_TAG_HEAD)
1951                         return 0;
1952         }
1953
1954         if (off != state->fc_replay_expected_off) {
1955                 ret = -EFSCORRUPTED;
1956                 goto out_err;
1957         }
1958
1959         state->fc_replay_expected_off++;
1960         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1961                 memcpy(&tl, cur, sizeof(tl));
1962                 val = cur + sizeof(tl);
1963                 ext4_debug("Scan phase, tag:%s, blk %lld\n",
1964                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1965                 switch (le16_to_cpu(tl.fc_tag)) {
1966                 case EXT4_FC_TAG_ADD_RANGE:
1967                         memcpy(&ext, val, sizeof(ext));
1968                         ex = (struct ext4_extent *)&ext.fc_ex;
1969                         ret = ext4_fc_record_regions(sb,
1970                                 le32_to_cpu(ext.fc_ino),
1971                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1972                                 ext4_ext_get_actual_len(ex), 0);
1973                         if (ret < 0)
1974                                 break;
1975                         ret = JBD2_FC_REPLAY_CONTINUE;
1976                         fallthrough;
1977                 case EXT4_FC_TAG_DEL_RANGE:
1978                 case EXT4_FC_TAG_LINK:
1979                 case EXT4_FC_TAG_UNLINK:
1980                 case EXT4_FC_TAG_CREAT:
1981                 case EXT4_FC_TAG_INODE:
1982                 case EXT4_FC_TAG_PAD:
1983                         state->fc_cur_tag++;
1984                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1985                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
1986                         break;
1987                 case EXT4_FC_TAG_TAIL:
1988                         state->fc_cur_tag++;
1989                         memcpy(&tail, val, sizeof(tail));
1990                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1991                                                 sizeof(tl) +
1992                                                 offsetof(struct ext4_fc_tail,
1993                                                 fc_crc));
1994                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1995                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1996                                 state->fc_replay_num_tags = state->fc_cur_tag;
1997                                 state->fc_regions_valid =
1998                                         state->fc_regions_used;
1999                         } else {
2000                                 ret = state->fc_replay_num_tags ?
2001                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2002                         }
2003                         state->fc_crc = 0;
2004                         break;
2005                 case EXT4_FC_TAG_HEAD:
2006                         memcpy(&head, val, sizeof(head));
2007                         if (le32_to_cpu(head.fc_features) &
2008                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2009                                 ret = -EOPNOTSUPP;
2010                                 break;
2011                         }
2012                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2013                                 ret = JBD2_FC_REPLAY_STOP;
2014                                 break;
2015                         }
2016                         state->fc_cur_tag++;
2017                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2018                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2019                         break;
2020                 default:
2021                         ret = state->fc_replay_num_tags ?
2022                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2023                 }
2024                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2025                         break;
2026         }
2027
2028 out_err:
2029         trace_ext4_fc_replay_scan(sb, ret, off);
2030         return ret;
2031 }
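/*
 * Both the scan loop above and the replay loop below walk the same
 * tag-length-value stream. A minimal sketch of that walk in isolation;
 * fc_for_each_tl() is illustrative only and is not part of ext4. The
 * header is copied out because the buffer offsets are not guaranteed to
 * be aligned; the cursor then advances past header plus value.
 */
static void __maybe_unused fc_for_each_tl(u8 *start, u8 *end,
                        void (*cb)(struct ext4_fc_tl *tl, u8 *val))
{
        struct ext4_fc_tl tl;
        u8 *cur;

        for (cur = start; cur < end;
             cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
                memcpy(&tl, cur, sizeof(tl));
                cb(&tl, cur + sizeof(tl));
        }
}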
2032
2033 /*
2034  * Main recovery path entry point.
2035  * The return codes have the same meaning as for ext4_fc_replay_scan() above.
2036  */
2037 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2038                                 enum passtype pass, int off, tid_t expected_tid)
2039 {
2040         struct super_block *sb = journal->j_private;
2041         struct ext4_sb_info *sbi = EXT4_SB(sb);
2042         struct ext4_fc_tl tl;
2043         __u8 *start, *end, *cur, *val;
2044         int ret = JBD2_FC_REPLAY_CONTINUE;
2045         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2046         struct ext4_fc_tail tail;
2047
2048         if (pass == PASS_SCAN) {
2049                 state->fc_current_pass = PASS_SCAN;
2050                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2051         }
2052
2053         if (state->fc_current_pass != pass) {
2054                 state->fc_current_pass = pass;
2055                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2056         }
2057         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2058                 ext4_debug("Replay stops\n");
2059                 ext4_fc_set_bitmaps_and_counters(sb);
2060                 return 0;
2061         }
2062
2063 #ifdef CONFIG_EXT4_DEBUG
2064         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2065                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2066                 return JBD2_FC_REPLAY_STOP;
2067         }
2068 #endif
2069
2070         start = (u8 *)bh->b_data;
2071         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2072
2073         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2074                 memcpy(&tl, cur, sizeof(tl));
2075                 val = cur + sizeof(tl);
2076
2077                 if (state->fc_replay_num_tags == 0) {
2078                         ret = JBD2_FC_REPLAY_STOP;
2079                         ext4_fc_set_bitmaps_and_counters(sb);
2080                         break;
2081                 }
2082                 ext4_debug("Replay phase, tag:%s\n",
2083                                 tag2str(le16_to_cpu(tl.fc_tag)));
2084                 state->fc_replay_num_tags--;
2085                 switch (le16_to_cpu(tl.fc_tag)) {
2086                 case EXT4_FC_TAG_LINK:
2087                         ret = ext4_fc_replay_link(sb, &tl, val);
2088                         break;
2089                 case EXT4_FC_TAG_UNLINK:
2090                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2091                         break;
2092                 case EXT4_FC_TAG_ADD_RANGE:
2093                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2094                         break;
2095                 case EXT4_FC_TAG_CREAT:
2096                         ret = ext4_fc_replay_create(sb, &tl, val);
2097                         break;
2098                 case EXT4_FC_TAG_DEL_RANGE:
2099                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2100                         break;
2101                 case EXT4_FC_TAG_INODE:
2102                         ret = ext4_fc_replay_inode(sb, &tl, val);
2103                         break;
2104                 case EXT4_FC_TAG_PAD:
2105                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2106                                              le16_to_cpu(tl.fc_len), 0);
2107                         break;
2108                 case EXT4_FC_TAG_TAIL:
2109                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2110                                              le16_to_cpu(tl.fc_len), 0);
2111                         memcpy(&tail, val, sizeof(tail));
2112                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2113                         break;
2114                 case EXT4_FC_TAG_HEAD:
2115                         break;
2116                 default:
2117                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2118                                              le16_to_cpu(tl.fc_len), 0);
2119                         ret = -ECANCELED;
2120                         break;
2121                 }
2122                 if (ret < 0)
2123                         break;
2124                 ret = JBD2_FC_REPLAY_CONTINUE;
2125         }
2126         return ret;
2127 }
2128
2129 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2130 {
2131         /*
2132          * We set the replay callback even if fast commit is disabled, because
2133          * we could still have fast commit blocks that need to be replayed even
2134          * if fast commit has now been turned off.
2135          */
2136         journal->j_fc_replay_callback = ext4_fc_replay;
2137         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2138                 return;
2139         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2140 }
2141
2142 static const char *fc_ineligible_reasons[] = {
2143         "Extended attributes changed",
2144         "Cross rename",
2145         "Journal flag changed",
2146         "Insufficient memory",
2147         "Swap boot",
2148         "Resize",
2149         "Dir renamed",
2150         "Falloc range op",
2151         "Data journalling",
2152         "FC Commit Failed"
2153 };
2154
2155 int ext4_fc_info_show(struct seq_file *seq, void *v)
2156 {
2157         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2158         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2159         int i;
2160
2161         if (v != SEQ_START_TOKEN)
2162                 return 0;
2163
2164         seq_printf(seq,
2165                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2166                    stats->fc_num_commits, stats->fc_ineligible_commits,
2167                    stats->fc_numblks,
2168                    div_u64(stats->s_fc_avg_commit_time, 1000));
2169         seq_puts(seq, "Ineligible reasons:\n");
2170         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2171                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2172                         stats->fc_ineligible_reason_count[i]);
2173
2174         return 0;
2175 }
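/*
 * Sample output with illustrative numbers (this handler is wired up to
 * procfs; in mainline the file is /proc/fs/ext4/<dev>/fc_info):
 *
 *      fc stats:
 *      150 commits
 *      3 ineligible
 *      410 numblks
 *      823us avg_commit_time
 *      Ineligible reasons:
 *      "Extended attributes changed":  1
 *      "Cross rename": 0
 *      ...
 */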
2176
2177 int __init ext4_fc_init_dentry_cache(void)
2178 {
2179         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2180                                            SLAB_RECLAIM_ACCOUNT);
2181
2182         if (ext4_fc_dentry_cachep == NULL)
2183                 return -ENOMEM;
2184
2185         return 0;
2186 }
2187
2188 void ext4_fc_destroy_dentry_cache(void)
2189 {
2190         kmem_cache_destroy(ext4_fc_dentry_cachep);
2191 }