pstore: Avoid kcore oops by vmap()ing with VM_IOREMAP
[platform/kernel/linux-starfive.git] / fs / ext4 / fast_commit.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commits routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22  * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23  * TLV during the recovery phase. For the scenarios for which we currently
24  * don't have replay code, fast commit falls back to full commits.
25  * Fast commits record delta in one of the following three categories.
26  *
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - record the inode that should be replayed
41  *                                during recovery. Note that iblocks field is
42  *                                not replayed and instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations in the
47  * order in which they are issued in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes
49  * that need to be committed during a fast commit in another in memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * All the inode updates must call ext4_fc_start_update() before starting an
62  * update. If such an ongoing update is present, fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  *
69  * Not all operations are supported by fast commits today (e.g extended
70  * attributes). Fast commit ineligibility is marked by calling
71  * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
72  * to full commit.
73  *
74  * Atomicity of commits
75  * --------------------
76  * In order to guarantee atomicity during the commit operation, fast commit
77  * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
78  * tag contains CRC of the contents and TID of the transaction after which
79  * this fast commit should be applied. Recovery code replays fast commit
80  * logs only if there's at least 1 valid tail present. For every fast commit
81  * operation, there is 1 tail. This means, we may end up with multiple tails
82  * in the fast commit space. Here's an example:
83  *
84  * - Create a new file A and remove existing file B
85  * - fsync()
86  * - Append contents to file A
87  * - Truncate file A
88  * - fsync()
89  *
90  * The fast commit space at the end of above operations would look like this:
91  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
93  *
94  * Replay code should thus check for all the valid tails in the FC area.
95  *
96  * Fast Commit Replay Idempotence
97  * ------------------------------
98  *
99  * Fast commits tags are idempotent in nature provided the recovery code follows
100  * certain rules. The guiding principle that the commit path follows while
101  * committing is that it stores the result of a particular operation instead of
102  * storing the procedure.
103  *
104  * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105  * was associated with inode 10. During fast commit, instead of storing this
106  * operation as a procedure "rename a to b", we store the resulting file system
107  * state as a "series" of outcomes:
108  *
109  * - Link dirent b to inode 10
110  * - Unlink dirent a
111  * - Inode <10> with valid refcount
112  *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
115  *
116  * Let's take an example of a procedure that is not idempotent and see how fast
117  * commits make it idempotent. Consider following sequence of operations:
118  *
119  *     rm A;    mv B A;    read A
120  *  (x)     (y)        (z)
121  *
122  * (x), (y) and (z) are the points at which we can crash. If we store this
123  * sequence of operations as is then the replay is not idempotent. Let's say
124  * while in replay, we crash at (z). During the second replay, file A (which was
125  * actually created as a result of "mv B A" operation) would get deleted. Thus,
126  * file named A would be absent when we try to read A. So, this sequence of
127  * operations is not idempotent. However, as mentioned above, instead of storing
128  * the procedure fast commits store the outcome of each procedure. Thus the fast
129  * commit log for above procedure would be as follows:
130  *
131  * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132  * inode 11 before the replay)
133  *
134  *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
135  * (w)          (x)                    (y)          (z)
136  *
137  * If we crash at (z), we will have file A linked to inode 11. During the second
138  * replay, we will remove file A (inode 11). But we will create it back and make
139  * it point to inode 11. We won't find B, so we'll just skip that step. At this
140  * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141  * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
142  * similarly. Thus, by converting a non-idempotent procedure into a series of
143  * idempotent outcomes, fast commits ensured idempotence during the replay.
144  *
145  * TODOs
146  * -----
147  *
148  * 0) Fast commit replay path hardening: Fast commit replay code should use
149  *    journal handles to make sure all the updates it does during the replay
150  *    path are atomic. With that if we crash during fast commit replay, after
151  *    trying to do recovery again, we will find a file system where fast commit
152  *    area is invalid (because new full commit would be found). In order to deal
153  *    with that, fast commit replay code should ensure that the "FC_REPLAY"
154  *    superblock state is persisted before starting the replay, so that after
155  *    the crash, fast commit recovery code can look at that flag and perform
156  *    fast commit recovery even if that area is invalidated by later full
157  *    commits.
158  *
159  * 1) Fast commit's commit path locks the entire file system during fast
160  *    commit. This has significant performance penalty. Instead of that, we
161  *    should use ext4_fc_start/stop_update functions to start inode level
162  *    updates from ext4_journal_start/stop. Once we do that we can drop file
163  *    system locking during commit path.
164  *
165  * 2) Handle more ineligible cases.
166  */
167
168 #include <trace/events/ext4.h>
169 static struct kmem_cache *ext4_fc_dentry_cachep;
170
171 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172 {
173         BUFFER_TRACE(bh, "");
174         if (uptodate) {
175                 ext4_debug("%s: Block %lld up-to-date",
176                            __func__, bh->b_blocknr);
177                 set_buffer_uptodate(bh);
178         } else {
179                 ext4_debug("%s: Block %lld not up-to-date",
180                            __func__, bh->b_blocknr);
181                 clear_buffer_uptodate(bh);
182         }
183
184         unlock_buffer(bh);
185 }
186
187 static inline void ext4_fc_reset_inode(struct inode *inode)
188 {
189         struct ext4_inode_info *ei = EXT4_I(inode);
190
191         ei->i_fc_lblk_start = 0;
192         ei->i_fc_lblk_len = 0;
193 }
194
195 void ext4_fc_init_inode(struct inode *inode)
196 {
197         struct ext4_inode_info *ei = EXT4_I(inode);
198
199         ext4_fc_reset_inode(inode);
200         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201         INIT_LIST_HEAD(&ei->i_fc_list);
202         INIT_LIST_HEAD(&ei->i_fc_dilist);
203         init_waitqueue_head(&ei->i_fc_wait);
204         atomic_set(&ei->i_fc_updates, 0);
205 }
206
/*
 * Sleep on the bit waitqueue associated with the EXT4_STATE_FC_COMMITTING
 * state bit of @inode.
 *
 * This function must be called with sbi->s_fc_lock held. The lock is dropped
 * before sleeping and is NOT re-acquired on return. The function calls
 * schedule() exactly once and does not re-check the state bit itself, so
 * callers are expected to retake the lock and re-evaluate the inode state.
 */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
        wait_queue_head_t *wq;
        struct ext4_inode_info *ei = EXT4_I(inode);

        /*
         * The word that is waited on differs by word size: i_state_flags when
         * BITS_PER_LONG < 64, i_flags otherwise.
         */
#if (BITS_PER_LONG < 64)
        DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_state_flags,
                                EXT4_STATE_FC_COMMITTING);
#else
        DEFINE_WAIT_BIT(wait, &ei->i_flags,
                        EXT4_STATE_FC_COMMITTING);
        wq = bit_waitqueue(&ei->i_flags,
                                EXT4_STATE_FC_COMMITTING);
#endif
        lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
        /* Queue ourselves before dropping the lock, then sleep. */
        prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
        schedule();
        finish_wait(wq, &wait.wq_entry);
}
231
232 static bool ext4_fc_disabled(struct super_block *sb)
233 {
234         return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
235                 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
236 }
237
238 /*
239  * Inform Ext4's fast about start of an inode update
240  *
241  * This function is called by the high level call VFS callbacks before
242  * performing any inode update. This function blocks if there's an ongoing
243  * fast commit on the inode in question.
244  */
245 void ext4_fc_start_update(struct inode *inode)
246 {
247         struct ext4_inode_info *ei = EXT4_I(inode);
248
249         if (ext4_fc_disabled(inode->i_sb))
250                 return;
251
252 restart:
253         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
254         if (list_empty(&ei->i_fc_list))
255                 goto out;
256
257         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
258                 ext4_fc_wait_committing_inode(inode);
259                 goto restart;
260         }
261 out:
262         atomic_inc(&ei->i_fc_updates);
263         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
264 }
265
266 /*
267  * Stop inode update and wake up waiting fast commits if any.
268  */
269 void ext4_fc_stop_update(struct inode *inode)
270 {
271         struct ext4_inode_info *ei = EXT4_I(inode);
272
273         if (ext4_fc_disabled(inode->i_sb))
274                 return;
275
276         if (atomic_dec_and_test(&ei->i_fc_updates))
277                 wake_up_all(&ei->i_fc_wait);
278 }
279
280 /*
281  * Remove inode from fast commit list. If the inode is being committed
282  * we wait until inode commit is done.
283  */
284 void ext4_fc_del(struct inode *inode)
285 {
286         struct ext4_inode_info *ei = EXT4_I(inode);
287         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
288         struct ext4_fc_dentry_update *fc_dentry;
289
290         if (ext4_fc_disabled(inode->i_sb))
291                 return;
292
293 restart:
294         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
295         if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
296                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
297                 return;
298         }
299
300         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
301                 ext4_fc_wait_committing_inode(inode);
302                 goto restart;
303         }
304
305         if (!list_empty(&ei->i_fc_list))
306                 list_del_init(&ei->i_fc_list);
307
308         /*
309          * Since this inode is getting removed, let's also remove all FC
310          * dentry create references, since it is not needed to log it anyways.
311          */
312         if (list_empty(&ei->i_fc_dilist)) {
313                 spin_unlock(&sbi->s_fc_lock);
314                 return;
315         }
316
317         fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
318         WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
319         list_del_init(&fc_dentry->fcd_list);
320         list_del_init(&fc_dentry->fcd_dilist);
321
322         WARN_ON(!list_empty(&ei->i_fc_dilist));
323         spin_unlock(&sbi->s_fc_lock);
324
325         if (fc_dentry->fcd_name.name &&
326                 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
327                 kfree(fc_dentry->fcd_name.name);
328         kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
329
330         return;
331 }
332
333 /*
334  * Mark file system as fast commit ineligible, and record latest
335  * ineligible transaction tid. This means until the recorded
336  * transaction, commit operation would result in a full jbd2 commit.
337  */
338 void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
339 {
340         struct ext4_sb_info *sbi = EXT4_SB(sb);
341         tid_t tid;
342
343         if (ext4_fc_disabled(sb))
344                 return;
345
346         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
347         if (handle && !IS_ERR(handle))
348                 tid = handle->h_transaction->t_tid;
349         else {
350                 read_lock(&sbi->s_journal->j_state_lock);
351                 tid = sbi->s_journal->j_running_transaction ?
352                                 sbi->s_journal->j_running_transaction->t_tid : 0;
353                 read_unlock(&sbi->s_journal->j_state_lock);
354         }
355         spin_lock(&sbi->s_fc_lock);
356         if (sbi->s_fc_ineligible_tid < tid)
357                 sbi->s_fc_ineligible_tid = tid;
358         spin_unlock(&sbi->s_fc_lock);
359         WARN_ON(reason >= EXT4_FC_REASON_MAX);
360         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
361 }
362
363 /*
364  * Generic fast commit tracking function. If this is the first time this we are
365  * called after a full commit, we initialize fast commit fields and then call
366  * __fc_track_fn() with update = 0. If we have already been called after a full
367  * commit, we pass update = 1. Based on that, the track function can determine
368  * if it needs to track a field for the first time or if it needs to just
369  * update the previously tracked value.
370  *
371  * If enqueue is set, this function enqueues the inode in fast commit list.
372  */
373 static int ext4_fc_track_template(
374         handle_t *handle, struct inode *inode,
375         int (*__fc_track_fn)(struct inode *, void *, bool),
376         void *args, int enqueue)
377 {
378         bool update = false;
379         struct ext4_inode_info *ei = EXT4_I(inode);
380         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
381         tid_t tid = 0;
382         int ret;
383
384         tid = handle->h_transaction->t_tid;
385         mutex_lock(&ei->i_fc_lock);
386         if (tid == ei->i_sync_tid) {
387                 update = true;
388         } else {
389                 ext4_fc_reset_inode(inode);
390                 ei->i_sync_tid = tid;
391         }
392         ret = __fc_track_fn(inode, args, update);
393         mutex_unlock(&ei->i_fc_lock);
394
395         if (!enqueue)
396                 return ret;
397
398         spin_lock(&sbi->s_fc_lock);
399         if (list_empty(&EXT4_I(inode)->i_fc_list))
400                 list_add_tail(&EXT4_I(inode)->i_fc_list,
401                                 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
402                                  sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
403                                 &sbi->s_fc_q[FC_Q_STAGING] :
404                                 &sbi->s_fc_q[FC_Q_MAIN]);
405         spin_unlock(&sbi->s_fc_lock);
406
407         return ret;
408 }
409
/* Argument bundle handed to __track_dentry_update() through a void pointer. */
struct __track_dentry_update_args {
        /* Directory entry the update refers to. */
        struct dentry *dentry;
        /* Fast commit tag of the operation (stored into fcd_op). */
        int op;
};
414
415 /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
416 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
417 {
418         struct ext4_fc_dentry_update *node;
419         struct ext4_inode_info *ei = EXT4_I(inode);
420         struct __track_dentry_update_args *dentry_update =
421                 (struct __track_dentry_update_args *)arg;
422         struct dentry *dentry = dentry_update->dentry;
423         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
424
425         mutex_unlock(&ei->i_fc_lock);
426         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
427         if (!node) {
428                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
429                 mutex_lock(&ei->i_fc_lock);
430                 return -ENOMEM;
431         }
432
433         node->fcd_op = dentry_update->op;
434         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
435         node->fcd_ino = inode->i_ino;
436         if (dentry->d_name.len > DNAME_INLINE_LEN) {
437                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
438                 if (!node->fcd_name.name) {
439                         kmem_cache_free(ext4_fc_dentry_cachep, node);
440                         ext4_fc_mark_ineligible(inode->i_sb,
441                                 EXT4_FC_REASON_NOMEM, NULL);
442                         mutex_lock(&ei->i_fc_lock);
443                         return -ENOMEM;
444                 }
445                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
446                         dentry->d_name.len);
447         } else {
448                 memcpy(node->fcd_iname, dentry->d_name.name,
449                         dentry->d_name.len);
450                 node->fcd_name.name = node->fcd_iname;
451         }
452         node->fcd_name.len = dentry->d_name.len;
453         INIT_LIST_HEAD(&node->fcd_dilist);
454         spin_lock(&sbi->s_fc_lock);
455         if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
456                 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
457                 list_add_tail(&node->fcd_list,
458                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
459         else
460                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
461
462         /*
463          * This helps us keep a track of all fc_dentry updates which is part of
464          * this ext4 inode. So in case the inode is getting unlinked, before
465          * even we get a chance to fsync, we could remove all fc_dentry
466          * references while evicting the inode in ext4_fc_del().
467          * Also with this, we don't need to loop over all the inodes in
468          * sbi->s_fc_q to get the corresponding inode in
469          * ext4_fc_commit_dentry_updates().
470          */
471         if (dentry_update->op == EXT4_FC_TAG_CREAT) {
472                 WARN_ON(!list_empty(&ei->i_fc_dilist));
473                 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
474         }
475         spin_unlock(&sbi->s_fc_lock);
476         mutex_lock(&ei->i_fc_lock);
477
478         return 0;
479 }
480
481 void __ext4_fc_track_unlink(handle_t *handle,
482                 struct inode *inode, struct dentry *dentry)
483 {
484         struct __track_dentry_update_args args;
485         int ret;
486
487         args.dentry = dentry;
488         args.op = EXT4_FC_TAG_UNLINK;
489
490         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
491                                         (void *)&args, 0);
492         trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
493 }
494
495 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
496 {
497         struct inode *inode = d_inode(dentry);
498
499         if (ext4_fc_disabled(inode->i_sb))
500                 return;
501
502         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
503                 return;
504
505         __ext4_fc_track_unlink(handle, inode, dentry);
506 }
507
508 void __ext4_fc_track_link(handle_t *handle,
509         struct inode *inode, struct dentry *dentry)
510 {
511         struct __track_dentry_update_args args;
512         int ret;
513
514         args.dentry = dentry;
515         args.op = EXT4_FC_TAG_LINK;
516
517         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
518                                         (void *)&args, 0);
519         trace_ext4_fc_track_link(handle, inode, dentry, ret);
520 }
521
522 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
523 {
524         struct inode *inode = d_inode(dentry);
525
526         if (ext4_fc_disabled(inode->i_sb))
527                 return;
528
529         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
530                 return;
531
532         __ext4_fc_track_link(handle, inode, dentry);
533 }
534
535 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
536                           struct dentry *dentry)
537 {
538         struct __track_dentry_update_args args;
539         int ret;
540
541         args.dentry = dentry;
542         args.op = EXT4_FC_TAG_CREAT;
543
544         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
545                                         (void *)&args, 0);
546         trace_ext4_fc_track_create(handle, inode, dentry, ret);
547 }
548
549 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
550 {
551         struct inode *inode = d_inode(dentry);
552
553         if (ext4_fc_disabled(inode->i_sb))
554                 return;
555
556         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
557                 return;
558
559         __ext4_fc_track_create(handle, inode, dentry);
560 }
561
562 /* __track_fn for inode tracking */
563 static int __track_inode(struct inode *inode, void *arg, bool update)
564 {
565         if (update)
566                 return -EEXIST;
567
568         EXT4_I(inode)->i_fc_lblk_len = 0;
569
570         return 0;
571 }
572
573 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
574 {
575         int ret;
576
577         if (S_ISDIR(inode->i_mode))
578                 return;
579
580         if (ext4_fc_disabled(inode->i_sb))
581                 return;
582
583         if (ext4_should_journal_data(inode)) {
584                 ext4_fc_mark_ineligible(inode->i_sb,
585                                         EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
586                 return;
587         }
588
589         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
590                 return;
591
592         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
593         trace_ext4_fc_track_inode(handle, inode, ret);
594 }
595
/*
 * Argument bundle handed to __track_range(): the first and last logical
 * block of the range (the length is computed as end - start + 1).
 */
struct __track_range_args {
        ext4_lblk_t start, end;
};
599
600 /* __track_fn for tracking data updates */
601 static int __track_range(struct inode *inode, void *arg, bool update)
602 {
603         struct ext4_inode_info *ei = EXT4_I(inode);
604         ext4_lblk_t oldstart;
605         struct __track_range_args *__arg =
606                 (struct __track_range_args *)arg;
607
608         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
609                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
610                 return -ECANCELED;
611         }
612
613         oldstart = ei->i_fc_lblk_start;
614
615         if (update && ei->i_fc_lblk_len > 0) {
616                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
617                 ei->i_fc_lblk_len =
618                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
619                                 ei->i_fc_lblk_start + 1;
620         } else {
621                 ei->i_fc_lblk_start = __arg->start;
622                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
623         }
624
625         return 0;
626 }
627
628 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
629                          ext4_lblk_t end)
630 {
631         struct __track_range_args args;
632         int ret;
633
634         if (S_ISDIR(inode->i_mode))
635                 return;
636
637         if (ext4_fc_disabled(inode->i_sb))
638                 return;
639
640         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
641                 return;
642
643         args.start = start;
644         args.end = end;
645
646         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
647
648         trace_ext4_fc_track_range(handle, inode, start, end, ret);
649 }
650
651 static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
652 {
653         blk_opf_t write_flags = REQ_SYNC;
654         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
655
656         /* Add REQ_FUA | REQ_PREFLUSH only its tail */
657         if (test_opt(sb, BARRIER) && is_tail)
658                 write_flags |= REQ_FUA | REQ_PREFLUSH;
659         lock_buffer(bh);
660         set_buffer_dirty(bh);
661         set_buffer_uptodate(bh);
662         bh->b_end_io = ext4_end_buffer_io_sync;
663         submit_bh(REQ_OP_WRITE | write_flags, bh);
664         EXT4_SB(sb)->s_fc_bh = NULL;
665 }
666
667 /* Ext4 commit path routines */
668
669 /* memzero and update CRC */
670 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
671                                 u32 *crc)
672 {
673         void *ret;
674
675         ret = memset(dst, 0, len);
676         if (crc)
677                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
678         return ret;
679 }
680
/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, the current
 * block is submitted, a new block is obtained from jbd2 and the CRC is
 * updated to reflect the padding we added.
 *
 * @sb:  super block
 * @len: number of bytes to reserve
 * @crc: running checksum to update with any padding written; may be NULL
 *
 * Returns a pointer to the reserved bytes inside the current fast commit
 * buffer, or NULL if len cannot fit in a block together with a tag header or
 * if jbd2_fc_get_buf() fails. sbi->s_fc_bytes is advanced past the
 * reservation.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
        struct ext4_fc_tl *tl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct buffer_head *bh;
        int bsize = sbi->s_journal->j_blocksize;
        /* off: byte offset of the next free byte within the current block */
        int ret, off = sbi->s_fc_bytes % bsize;
        int pad_len;

        /*
         * After allocating len, we should have space at least for a 0 byte
         * padding.
         */
        if (len + EXT4_FC_TAG_BASE_LEN > bsize)
                return NULL;

        if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
                /*
                 * Only allocate from current buffer if we have enough space for
                 * this request AND we have space to add a zero byte padding.
                 */
                if (!sbi->s_fc_bh) {
                        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
                        if (ret)
                                return NULL;
                        sbi->s_fc_bh = bh;
                }
                sbi->s_fc_bytes += len;
                return sbi->s_fc_bh->b_data + off;
        }
        /*
         * Need to add PAD tag.
         *
         * NOTE(review): unlike the branch above, sbi->s_fc_bh is dereferenced
         * here without a NULL check -- confirm that a buffer is always
         * present when this path is reached.
         */
        tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
        tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
        pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
        tl->fc_len = cpu_to_le16(pad_len);
        if (crc)
                *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
        if (pad_len > 0)
                ext4_fc_memzero(sb, tl + 1, pad_len, crc);
        ext4_fc_submit_bh(sb, false);

        ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
        if (ret)
                return NULL;
        sbi->s_fc_bh = bh;
        /* Skip to the start of the next block, then account for len. */
        sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
        return sbi->s_fc_bh->b_data;
}
740
741 /* memcpy to fc reserved space and update CRC */
742 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
743                                 int len, u32 *crc)
744 {
745         if (crc)
746                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
747         return memcpy(dst, src, len);
748 }
749
/*
 * Complete a fast commit by writing tail tag.
 *
 * Writing tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing tail tag, even if there's space remaining
 * in the block, next commit shouldn't use it. That's why tail tag
 * has the length as that of the remaining space on the block.
 *
 * @sb:  super block
 * @crc: checksum accumulated over the fast commit contents written so far
 *
 * Returns 0 on success, -ENOSPC if space for the tail could not be reserved.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_fc_tl tl;
        struct ext4_fc_tail tail;
        int off, bsize = sbi->s_journal->j_blocksize;
        u8 *dst;

        /*
         * ext4_fc_reserve_space takes care of allocating an extra block if
         * there's no enough space on this block for accommodating this tail.
         */
        dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
        if (!dst)
                return -ENOSPC;

        /* Offset within the block just past the reserved tail bytes. */
        off = sbi->s_fc_bytes % bsize;

        tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
        tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
        /* The rest of this block is consumed by the tail. */
        sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

        ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
        dst += EXT4_FC_TAG_BASE_LEN;
        tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
        ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
        dst += sizeof(tail.fc_tid);
        /* The stored CRC covers everything up to, but not including, itself. */
        tail.fc_crc = cpu_to_le32(crc);
        ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);

        ext4_fc_submit_bh(sb, true);

        return 0;
}
792
793 /*
794  * Adds tag, length, value and updates CRC. Returns true if tlv was added.
795  * Returns false if there's not enough space.
796  */
797 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
798                            u32 *crc)
799 {
800         struct ext4_fc_tl tl;
801         u8 *dst;
802
803         dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
804         if (!dst)
805                 return false;
806
807         tl.fc_tag = cpu_to_le16(tag);
808         tl.fc_len = cpu_to_le16(len);
809
810         ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
811         ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);
812
813         return true;
814 }
815
816 /* Same as above, but adds dentry tlv. */
817 static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
818                                    struct ext4_fc_dentry_update *fc_dentry)
819 {
820         struct ext4_fc_dentry_info fcd;
821         struct ext4_fc_tl tl;
822         int dlen = fc_dentry->fcd_name.len;
823         u8 *dst = ext4_fc_reserve_space(sb,
824                         EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
825
826         if (!dst)
827                 return false;
828
829         fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
830         fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
831         tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
832         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
833         ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
834         dst += EXT4_FC_TAG_BASE_LEN;
835         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
836         dst += sizeof(fcd);
837         ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
838
839         return true;
840 }
841
842 /*
843  * Writes inode in the fast commit space under TLV with tag @tag.
844  * Returns 0 on success, error on failure.
845  */
846 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
847 {
848         struct ext4_inode_info *ei = EXT4_I(inode);
849         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
850         int ret;
851         struct ext4_iloc iloc;
852         struct ext4_fc_inode fc_inode;
853         struct ext4_fc_tl tl;
854         u8 *dst;
855
856         ret = ext4_get_inode_loc(inode, &iloc);
857         if (ret)
858                 return ret;
859
860         if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
861                 inode_len = EXT4_INODE_SIZE(inode->i_sb);
862         else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
863                 inode_len += ei->i_extra_isize;
864
865         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
866         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
867         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
868
869         ret = -ECANCELED;
870         dst = ext4_fc_reserve_space(inode->i_sb,
871                 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
872         if (!dst)
873                 goto err;
874
875         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
876                 goto err;
877         dst += EXT4_FC_TAG_BASE_LEN;
878         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
879                 goto err;
880         dst += sizeof(fc_inode);
881         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
882                                         inode_len, crc))
883                 goto err;
884         ret = 0;
885 err:
886         brelse(iloc.bh);
887         return ret;
888 }
889
890 /*
891  * Writes updated data ranges for the inode in question. Updates CRC.
892  * Returns 0 on success, error otherwise.
893  */
894 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
895 {
896         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
897         struct ext4_inode_info *ei = EXT4_I(inode);
898         struct ext4_map_blocks map;
899         struct ext4_fc_add_range fc_ext;
900         struct ext4_fc_del_range lrange;
901         struct ext4_extent *ex;
902         int ret;
903
904         mutex_lock(&ei->i_fc_lock);
905         if (ei->i_fc_lblk_len == 0) {
906                 mutex_unlock(&ei->i_fc_lock);
907                 return 0;
908         }
909         old_blk_size = ei->i_fc_lblk_start;
910         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
911         ei->i_fc_lblk_len = 0;
912         mutex_unlock(&ei->i_fc_lock);
913
914         cur_lblk_off = old_blk_size;
915         ext4_debug("will try writing %d to %d for inode %ld\n",
916                    cur_lblk_off, new_blk_size, inode->i_ino);
917
918         while (cur_lblk_off <= new_blk_size) {
919                 map.m_lblk = cur_lblk_off;
920                 map.m_len = new_blk_size - cur_lblk_off + 1;
921                 ret = ext4_map_blocks(NULL, inode, &map, 0);
922                 if (ret < 0)
923                         return -ECANCELED;
924
925                 if (map.m_len == 0) {
926                         cur_lblk_off++;
927                         continue;
928                 }
929
930                 if (ret == 0) {
931                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
932                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
933                         lrange.fc_len = cpu_to_le32(map.m_len);
934                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
935                                             sizeof(lrange), (u8 *)&lrange, crc))
936                                 return -ENOSPC;
937                 } else {
938                         unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
939                                 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
940
941                         /* Limit the number of blocks in one extent */
942                         map.m_len = min(max, map.m_len);
943
944                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
945                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
946                         ex->ee_block = cpu_to_le32(map.m_lblk);
947                         ex->ee_len = cpu_to_le16(map.m_len);
948                         ext4_ext_store_pblock(ex, map.m_pblk);
949                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
950                                 ext4_ext_mark_unwritten(ex);
951                         else
952                                 ext4_ext_mark_initialized(ex);
953                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
954                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
955                                 return -ENOSPC;
956                 }
957
958                 cur_lblk_off += map.m_len;
959         }
960
961         return 0;
962 }
963
964
965 /* Submit data for all the fast commit inodes */
966 static int ext4_fc_submit_inode_data_all(journal_t *journal)
967 {
968         struct super_block *sb = journal->j_private;
969         struct ext4_sb_info *sbi = EXT4_SB(sb);
970         struct ext4_inode_info *ei;
971         int ret = 0;
972
973         spin_lock(&sbi->s_fc_lock);
974         list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
975                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
976                 while (atomic_read(&ei->i_fc_updates)) {
977                         DEFINE_WAIT(wait);
978
979                         prepare_to_wait(&ei->i_fc_wait, &wait,
980                                                 TASK_UNINTERRUPTIBLE);
981                         if (atomic_read(&ei->i_fc_updates)) {
982                                 spin_unlock(&sbi->s_fc_lock);
983                                 schedule();
984                                 spin_lock(&sbi->s_fc_lock);
985                         }
986                         finish_wait(&ei->i_fc_wait, &wait);
987                 }
988                 spin_unlock(&sbi->s_fc_lock);
989                 ret = jbd2_submit_inode_data(ei->jinode);
990                 if (ret)
991                         return ret;
992                 spin_lock(&sbi->s_fc_lock);
993         }
994         spin_unlock(&sbi->s_fc_lock);
995
996         return ret;
997 }
998
999 /* Wait for completion of data for all the fast commit inodes */
1000 static int ext4_fc_wait_inode_data_all(journal_t *journal)
1001 {
1002         struct super_block *sb = journal->j_private;
1003         struct ext4_sb_info *sbi = EXT4_SB(sb);
1004         struct ext4_inode_info *pos, *n;
1005         int ret = 0;
1006
1007         spin_lock(&sbi->s_fc_lock);
1008         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1009                 if (!ext4_test_inode_state(&pos->vfs_inode,
1010                                            EXT4_STATE_FC_COMMITTING))
1011                         continue;
1012                 spin_unlock(&sbi->s_fc_lock);
1013
1014                 ret = jbd2_wait_inode_data(journal, pos->jinode);
1015                 if (ret)
1016                         return ret;
1017                 spin_lock(&sbi->s_fc_lock);
1018         }
1019         spin_unlock(&sbi->s_fc_lock);
1020
1021         return 0;
1022 }
1023
1024 /* Commit all the directory entry updates */
1025 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1026 __acquires(&sbi->s_fc_lock)
1027 __releases(&sbi->s_fc_lock)
1028 {
1029         struct super_block *sb = journal->j_private;
1030         struct ext4_sb_info *sbi = EXT4_SB(sb);
1031         struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1032         struct inode *inode;
1033         struct ext4_inode_info *ei;
1034         int ret;
1035
1036         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1037                 return 0;
1038         list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1039                                  &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1040                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1041                         spin_unlock(&sbi->s_fc_lock);
1042                         if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1043                                 ret = -ENOSPC;
1044                                 goto lock_and_exit;
1045                         }
1046                         spin_lock(&sbi->s_fc_lock);
1047                         continue;
1048                 }
1049                 /*
1050                  * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1051                  * corresponding inode pointer
1052                  */
1053                 WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1054                 ei = list_first_entry(&fc_dentry->fcd_dilist,
1055                                 struct ext4_inode_info, i_fc_dilist);
1056                 inode = &ei->vfs_inode;
1057                 WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1058
1059                 spin_unlock(&sbi->s_fc_lock);
1060
1061                 /*
1062                  * We first write the inode and then the create dirent. This
1063                  * allows the recovery code to create an unnamed inode first
1064                  * and then link it to a directory entry. This allows us
1065                  * to use namei.c routines almost as is and simplifies
1066                  * the recovery code.
1067                  */
1068                 ret = ext4_fc_write_inode(inode, crc);
1069                 if (ret)
1070                         goto lock_and_exit;
1071
1072                 ret = ext4_fc_write_inode_data(inode, crc);
1073                 if (ret)
1074                         goto lock_and_exit;
1075
1076                 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1077                         ret = -ENOSPC;
1078                         goto lock_and_exit;
1079                 }
1080
1081                 spin_lock(&sbi->s_fc_lock);
1082         }
1083         return 0;
1084 lock_and_exit:
1085         spin_lock(&sbi->s_fc_lock);
1086         return ret;
1087 }
1088
1089 static int ext4_fc_perform_commit(journal_t *journal)
1090 {
1091         struct super_block *sb = journal->j_private;
1092         struct ext4_sb_info *sbi = EXT4_SB(sb);
1093         struct ext4_inode_info *iter;
1094         struct ext4_fc_head head;
1095         struct inode *inode;
1096         struct blk_plug plug;
1097         int ret = 0;
1098         u32 crc = 0;
1099
1100         ret = ext4_fc_submit_inode_data_all(journal);
1101         if (ret)
1102                 return ret;
1103
1104         ret = ext4_fc_wait_inode_data_all(journal);
1105         if (ret)
1106                 return ret;
1107
1108         /*
1109          * If file system device is different from journal device, issue a cache
1110          * flush before we start writing fast commit blocks.
1111          */
1112         if (journal->j_fs_dev != journal->j_dev)
1113                 blkdev_issue_flush(journal->j_fs_dev);
1114
1115         blk_start_plug(&plug);
1116         if (sbi->s_fc_bytes == 0) {
1117                 /*
1118                  * Add a head tag only if this is the first fast commit
1119                  * in this TID.
1120                  */
1121                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1122                 head.fc_tid = cpu_to_le32(
1123                         sbi->s_journal->j_running_transaction->t_tid);
1124                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1125                         (u8 *)&head, &crc)) {
1126                         ret = -ENOSPC;
1127                         goto out;
1128                 }
1129         }
1130
1131         spin_lock(&sbi->s_fc_lock);
1132         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1133         if (ret) {
1134                 spin_unlock(&sbi->s_fc_lock);
1135                 goto out;
1136         }
1137
1138         list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1139                 inode = &iter->vfs_inode;
1140                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1141                         continue;
1142
1143                 spin_unlock(&sbi->s_fc_lock);
1144                 ret = ext4_fc_write_inode_data(inode, &crc);
1145                 if (ret)
1146                         goto out;
1147                 ret = ext4_fc_write_inode(inode, &crc);
1148                 if (ret)
1149                         goto out;
1150                 spin_lock(&sbi->s_fc_lock);
1151         }
1152         spin_unlock(&sbi->s_fc_lock);
1153
1154         ret = ext4_fc_write_tail(sb, crc);
1155
1156 out:
1157         blk_finish_plug(&plug);
1158         return ret;
1159 }
1160
1161 static void ext4_fc_update_stats(struct super_block *sb, int status,
1162                                  u64 commit_time, int nblks, tid_t commit_tid)
1163 {
1164         struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1165
1166         ext4_debug("Fast commit ended with status = %d for tid %u",
1167                         status, commit_tid);
1168         if (status == EXT4_FC_STATUS_OK) {
1169                 stats->fc_num_commits++;
1170                 stats->fc_numblks += nblks;
1171                 if (likely(stats->s_fc_avg_commit_time))
1172                         stats->s_fc_avg_commit_time =
1173                                 (commit_time +
1174                                  stats->s_fc_avg_commit_time * 3) / 4;
1175                 else
1176                         stats->s_fc_avg_commit_time = commit_time;
1177         } else if (status == EXT4_FC_STATUS_FAILED ||
1178                    status == EXT4_FC_STATUS_INELIGIBLE) {
1179                 if (status == EXT4_FC_STATUS_FAILED)
1180                         stats->fc_failed_commits++;
1181                 stats->fc_ineligible_commits++;
1182         } else {
1183                 stats->fc_skipped_commits++;
1184         }
1185         trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1186 }
1187
1188 /*
1189  * The main commit entry point. Performs a fast commit for transaction
1190  * commit_tid if needed. If it's not possible to perform a fast commit
1191  * due to various reasons, we fall back to full commit. Returns 0
1192  * on success, error otherwise.
1193  */
1194 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1195 {
1196         struct super_block *sb = journal->j_private;
1197         struct ext4_sb_info *sbi = EXT4_SB(sb);
1198         int nblks = 0, ret, bsize = journal->j_blocksize;
1199         int subtid = atomic_read(&sbi->s_fc_subtid);
1200         int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1201         ktime_t start_time, commit_time;
1202
1203         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1204                 return jbd2_complete_transaction(journal, commit_tid);
1205
1206         trace_ext4_fc_commit_start(sb, commit_tid);
1207
1208         start_time = ktime_get();
1209
1210 restart_fc:
1211         ret = jbd2_fc_begin_commit(journal, commit_tid);
1212         if (ret == -EALREADY) {
1213                 /* There was an ongoing commit, check if we need to restart */
1214                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1215                         commit_tid > journal->j_commit_sequence)
1216                         goto restart_fc;
1217                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1218                                 commit_tid);
1219                 return 0;
1220         } else if (ret) {
1221                 /*
1222                  * Commit couldn't start. Just update stats and perform a
1223                  * full commit.
1224                  */
1225                 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1226                                 commit_tid);
1227                 return jbd2_complete_transaction(journal, commit_tid);
1228         }
1229
1230         /*
1231          * After establishing journal barrier via jbd2_fc_begin_commit(), check
1232          * if we are fast commit ineligible.
1233          */
1234         if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1235                 status = EXT4_FC_STATUS_INELIGIBLE;
1236                 goto fallback;
1237         }
1238
1239         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1240         ret = ext4_fc_perform_commit(journal);
1241         if (ret < 0) {
1242                 status = EXT4_FC_STATUS_FAILED;
1243                 goto fallback;
1244         }
1245         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1246         ret = jbd2_fc_wait_bufs(journal, nblks);
1247         if (ret < 0) {
1248                 status = EXT4_FC_STATUS_FAILED;
1249                 goto fallback;
1250         }
1251         atomic_inc(&sbi->s_fc_subtid);
1252         ret = jbd2_fc_end_commit(journal);
1253         /*
1254          * weight the commit time higher than the average time so we
1255          * don't react too strongly to vast changes in the commit time
1256          */
1257         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1258         ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1259         return ret;
1260
1261 fallback:
1262         ret = jbd2_fc_end_commit_fallback(journal);
1263         ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1264         return ret;
1265 }
1266
1267 /*
1268  * Fast commit cleanup routine. This is called after every fast commit and
1269  * full commit. full is true if we are called after a full commit.
1270  */
1271 static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1272 {
1273         struct super_block *sb = journal->j_private;
1274         struct ext4_sb_info *sbi = EXT4_SB(sb);
1275         struct ext4_inode_info *iter, *iter_n;
1276         struct ext4_fc_dentry_update *fc_dentry;
1277
1278         if (full && sbi->s_fc_bh)
1279                 sbi->s_fc_bh = NULL;
1280
1281         trace_ext4_fc_cleanup(journal, full, tid);
1282         jbd2_fc_release_bufs(journal);
1283
1284         spin_lock(&sbi->s_fc_lock);
1285         list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1286                                  i_fc_list) {
1287                 list_del_init(&iter->i_fc_list);
1288                 ext4_clear_inode_state(&iter->vfs_inode,
1289                                        EXT4_STATE_FC_COMMITTING);
1290                 if (iter->i_sync_tid <= tid)
1291                         ext4_fc_reset_inode(&iter->vfs_inode);
1292                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1293                 smp_mb();
1294 #if (BITS_PER_LONG < 64)
1295                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1296 #else
1297                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1298 #endif
1299         }
1300
1301         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1302                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1303                                              struct ext4_fc_dentry_update,
1304                                              fcd_list);
1305                 list_del_init(&fc_dentry->fcd_list);
1306                 list_del_init(&fc_dentry->fcd_dilist);
1307                 spin_unlock(&sbi->s_fc_lock);
1308
1309                 if (fc_dentry->fcd_name.name &&
1310                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1311                         kfree(fc_dentry->fcd_name.name);
1312                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1313                 spin_lock(&sbi->s_fc_lock);
1314         }
1315
1316         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1317                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1318         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1319                                 &sbi->s_fc_q[FC_Q_MAIN]);
1320
1321         if (tid >= sbi->s_fc_ineligible_tid) {
1322                 sbi->s_fc_ineligible_tid = 0;
1323                 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1324         }
1325
1326         if (full)
1327                 sbi->s_fc_bytes = 0;
1328         spin_unlock(&sbi->s_fc_lock);
1329         trace_ext4_fc_stats(sb);
1330 }
1331
1332 /* Ext4 Replay Path Routines */
1333
/* Decoded form of a dentry TLV, shared by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name at @dname, in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;		/* NOTE(review): not set by tl_to_darg() */
	char *dname;		/* entry name */
};
1339
1340 static inline void tl_to_darg(struct dentry_info_args *darg,
1341                               struct ext4_fc_tl *tl, u8 *val)
1342 {
1343         struct ext4_fc_dentry_info fcd;
1344
1345         memcpy(&fcd, val, sizeof(fcd));
1346
1347         darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1348         darg->ino = le32_to_cpu(fcd.fc_ino);
1349         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1350         darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1351 }
1352
1353 static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
1354 {
1355         memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
1356         tl->fc_len = le16_to_cpu(tl->fc_len);
1357         tl->fc_tag = le16_to_cpu(tl->fc_tag);
1358 }
1359
1360 /* Unlink replay function */
1361 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1362                                  u8 *val)
1363 {
1364         struct inode *inode, *old_parent;
1365         struct qstr entry;
1366         struct dentry_info_args darg;
1367         int ret = 0;
1368
1369         tl_to_darg(&darg, tl, val);
1370
1371         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1372                         darg.parent_ino, darg.dname_len);
1373
1374         entry.name = darg.dname;
1375         entry.len = darg.dname_len;
1376         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1377
1378         if (IS_ERR(inode)) {
1379                 ext4_debug("Inode %d not found", darg.ino);
1380                 return 0;
1381         }
1382
1383         old_parent = ext4_iget(sb, darg.parent_ino,
1384                                 EXT4_IGET_NORMAL);
1385         if (IS_ERR(old_parent)) {
1386                 ext4_debug("Dir with inode %d not found", darg.parent_ino);
1387                 iput(inode);
1388                 return 0;
1389         }
1390
1391         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1392         /* -ENOENT ok coz it might not exist anymore. */
1393         if (ret == -ENOENT)
1394                 ret = 0;
1395         iput(old_parent);
1396         iput(inode);
1397         return ret;
1398 }
1399
1400 static int ext4_fc_replay_link_internal(struct super_block *sb,
1401                                 struct dentry_info_args *darg,
1402                                 struct inode *inode)
1403 {
1404         struct inode *dir = NULL;
1405         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1406         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1407         int ret = 0;
1408
1409         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1410         if (IS_ERR(dir)) {
1411                 ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1412                 dir = NULL;
1413                 goto out;
1414         }
1415
1416         dentry_dir = d_obtain_alias(dir);
1417         if (IS_ERR(dentry_dir)) {
1418                 ext4_debug("Failed to obtain dentry");
1419                 dentry_dir = NULL;
1420                 goto out;
1421         }
1422
1423         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1424         if (!dentry_inode) {
1425                 ext4_debug("Inode dentry not created.");
1426                 ret = -ENOMEM;
1427                 goto out;
1428         }
1429
1430         ret = __ext4_link(dir, inode, dentry_inode);
1431         /*
1432          * It's possible that link already existed since data blocks
1433          * for the dir in question got persisted before we crashed OR
1434          * we replayed this tag and crashed before the entire replay
1435          * could complete.
1436          */
1437         if (ret && ret != -EEXIST) {
1438                 ext4_debug("Failed to link\n");
1439                 goto out;
1440         }
1441
1442         ret = 0;
1443 out:
1444         if (dentry_dir) {
1445                 d_drop(dentry_dir);
1446                 dput(dentry_dir);
1447         } else if (dir) {
1448                 iput(dir);
1449         }
1450         if (dentry_inode) {
1451                 d_drop(dentry_inode);
1452                 dput(dentry_inode);
1453         }
1454
1455         return ret;
1456 }
1457
1458 /* Link replay function */
1459 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1460                                u8 *val)
1461 {
1462         struct inode *inode;
1463         struct dentry_info_args darg;
1464         int ret = 0;
1465
1466         tl_to_darg(&darg, tl, val);
1467         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1468                         darg.parent_ino, darg.dname_len);
1469
1470         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1471         if (IS_ERR(inode)) {
1472                 ext4_debug("Inode not found.");
1473                 return 0;
1474         }
1475
1476         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1477         iput(inode);
1478         return ret;
1479 }
1480
1481 /*
1482  * Record all the modified inodes during replay. We use this later to setup
1483  * block bitmaps correctly.
1484  */
1485 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1486 {
1487         struct ext4_fc_replay_state *state;
1488         int i;
1489
1490         state = &EXT4_SB(sb)->s_fc_replay_state;
1491         for (i = 0; i < state->fc_modified_inodes_used; i++)
1492                 if (state->fc_modified_inodes[i] == ino)
1493                         return 0;
1494         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1495                 int *fc_modified_inodes;
1496
1497                 fc_modified_inodes = krealloc(state->fc_modified_inodes,
1498                                 sizeof(int) * (state->fc_modified_inodes_size +
1499                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1500                                 GFP_KERNEL);
1501                 if (!fc_modified_inodes)
1502                         return -ENOMEM;
1503                 state->fc_modified_inodes = fc_modified_inodes;
1504                 state->fc_modified_inodes_size +=
1505                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1506         }
1507         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1508         return 0;
1509 }
1510
1511 /*
1512  * Inode replay function
1513  */
1514 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1515                                 u8 *val)
1516 {
1517         struct ext4_fc_inode fc_inode;
1518         struct ext4_inode *raw_inode;
1519         struct ext4_inode *raw_fc_inode;
1520         struct inode *inode = NULL;
1521         struct ext4_iloc iloc;
1522         int inode_len, ino, ret, tag = tl->fc_tag;
1523         struct ext4_extent_header *eh;
1524         size_t off_gen = offsetof(struct ext4_inode, i_generation);
1525
1526         memcpy(&fc_inode, val, sizeof(fc_inode));
1527
1528         ino = le32_to_cpu(fc_inode.fc_ino);
1529         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1530
1531         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1532         if (!IS_ERR(inode)) {
1533                 ext4_ext_clear_bb(inode);
1534                 iput(inode);
1535         }
1536         inode = NULL;
1537
1538         ret = ext4_fc_record_modified_inode(sb, ino);
1539         if (ret)
1540                 goto out;
1541
1542         raw_fc_inode = (struct ext4_inode *)
1543                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1544         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1545         if (ret)
1546                 goto out;
1547
1548         inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1549         raw_inode = ext4_raw_inode(&iloc);
1550
1551         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1552         memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1553                inode_len - off_gen);
1554         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1555                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1556                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1557                         memset(eh, 0, sizeof(*eh));
1558                         eh->eh_magic = EXT4_EXT_MAGIC;
1559                         eh->eh_max = cpu_to_le16(
1560                                 (sizeof(raw_inode->i_block) -
1561                                  sizeof(struct ext4_extent_header))
1562                                  / sizeof(struct ext4_extent));
1563                 }
1564         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1565                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1566                         sizeof(raw_inode->i_block));
1567         }
1568
1569         /* Immediately update the inode on disk. */
1570         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1571         if (ret)
1572                 goto out;
1573         ret = sync_dirty_buffer(iloc.bh);
1574         if (ret)
1575                 goto out;
1576         ret = ext4_mark_inode_used(sb, ino);
1577         if (ret)
1578                 goto out;
1579
1580         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1581         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1582         if (IS_ERR(inode)) {
1583                 ext4_debug("Inode not found.");
1584                 return -EFSCORRUPTED;
1585         }
1586
1587         /*
1588          * Our allocator could have made different decisions than before
1589          * crashing. This should be fixed but until then, we calculate
1590          * the number of blocks the inode.
1591          */
1592         if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1593                 ext4_ext_replay_set_iblocks(inode);
1594
1595         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1596         ext4_reset_inode_seed(inode);
1597
1598         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1599         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1600         sync_dirty_buffer(iloc.bh);
1601         brelse(iloc.bh);
1602 out:
1603         iput(inode);
1604         if (!ret)
1605                 blkdev_issue_flush(sb->s_bdev);
1606
1607         return 0;
1608 }
1609
1610 /*
1611  * Dentry create replay function.
1612  *
1613  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1614  * inode for which we are trying to create a dentry here, should already have
1615  * been replayed before we start here.
1616  */
1617 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1618                                  u8 *val)
1619 {
1620         int ret = 0;
1621         struct inode *inode = NULL;
1622         struct inode *dir = NULL;
1623         struct dentry_info_args darg;
1624
1625         tl_to_darg(&darg, tl, val);
1626
1627         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1628                         darg.parent_ino, darg.dname_len);
1629
1630         /* This takes care of update group descriptor and other metadata */
1631         ret = ext4_mark_inode_used(sb, darg.ino);
1632         if (ret)
1633                 goto out;
1634
1635         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1636         if (IS_ERR(inode)) {
1637                 ext4_debug("inode %d not found.", darg.ino);
1638                 inode = NULL;
1639                 ret = -EINVAL;
1640                 goto out;
1641         }
1642
1643         if (S_ISDIR(inode->i_mode)) {
1644                 /*
1645                  * If we are creating a directory, we need to make sure that the
1646                  * dot and dot dot dirents are setup properly.
1647                  */
1648                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1649                 if (IS_ERR(dir)) {
1650                         ext4_debug("Dir %d not found.", darg.ino);
1651                         goto out;
1652                 }
1653                 ret = ext4_init_new_dir(NULL, dir, inode);
1654                 iput(dir);
1655                 if (ret) {
1656                         ret = 0;
1657                         goto out;
1658                 }
1659         }
1660         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1661         if (ret)
1662                 goto out;
1663         set_nlink(inode, 1);
1664         ext4_mark_inode_dirty(NULL, inode);
1665 out:
1666         iput(inode);
1667         return ret;
1668 }
1669
1670 /*
1671  * Record physical disk regions which are in use as per fast commit area,
1672  * and used by inodes during replay phase. Our simple replay phase
1673  * allocator excludes these regions from allocation.
1674  */
1675 int ext4_fc_record_regions(struct super_block *sb, int ino,
1676                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1677 {
1678         struct ext4_fc_replay_state *state;
1679         struct ext4_fc_alloc_region *region;
1680
1681         state = &EXT4_SB(sb)->s_fc_replay_state;
1682         /*
1683          * during replay phase, the fc_regions_valid may not same as
1684          * fc_regions_used, update it when do new additions.
1685          */
1686         if (replay && state->fc_regions_used != state->fc_regions_valid)
1687                 state->fc_regions_used = state->fc_regions_valid;
1688         if (state->fc_regions_used == state->fc_regions_size) {
1689                 struct ext4_fc_alloc_region *fc_regions;
1690
1691                 fc_regions = krealloc(state->fc_regions,
1692                                       sizeof(struct ext4_fc_alloc_region) *
1693                                       (state->fc_regions_size +
1694                                        EXT4_FC_REPLAY_REALLOC_INCREMENT),
1695                                       GFP_KERNEL);
1696                 if (!fc_regions)
1697                         return -ENOMEM;
1698                 state->fc_regions_size +=
1699                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1700                 state->fc_regions = fc_regions;
1701         }
1702         region = &state->fc_regions[state->fc_regions_used++];
1703         region->ino = ino;
1704         region->lblk = lblk;
1705         region->pblk = pblk;
1706         region->len = len;
1707
1708         if (replay)
1709                 state->fc_regions_valid++;
1710
1711         return 0;
1712 }
1713
1714 /* Replay add range tag */
1715 static int ext4_fc_replay_add_range(struct super_block *sb,
1716                                     struct ext4_fc_tl *tl, u8 *val)
1717 {
1718         struct ext4_fc_add_range fc_add_ex;
1719         struct ext4_extent newex, *ex;
1720         struct inode *inode;
1721         ext4_lblk_t start, cur;
1722         int remaining, len;
1723         ext4_fsblk_t start_pblk;
1724         struct ext4_map_blocks map;
1725         struct ext4_ext_path *path = NULL;
1726         int ret;
1727
1728         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1729         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1730
1731         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1732                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1733                 ext4_ext_get_actual_len(ex));
1734
1735         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1736         if (IS_ERR(inode)) {
1737                 ext4_debug("Inode not found.");
1738                 return 0;
1739         }
1740
1741         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1742         if (ret)
1743                 goto out;
1744
1745         start = le32_to_cpu(ex->ee_block);
1746         start_pblk = ext4_ext_pblock(ex);
1747         len = ext4_ext_get_actual_len(ex);
1748
1749         cur = start;
1750         remaining = len;
1751         ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1752                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1753                   inode->i_ino);
1754
1755         while (remaining > 0) {
1756                 map.m_lblk = cur;
1757                 map.m_len = remaining;
1758                 map.m_pblk = 0;
1759                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1760
1761                 if (ret < 0)
1762                         goto out;
1763
1764                 if (ret == 0) {
1765                         /* Range is not mapped */
1766                         path = ext4_find_extent(inode, cur, NULL, 0);
1767                         if (IS_ERR(path))
1768                                 goto out;
1769                         memset(&newex, 0, sizeof(newex));
1770                         newex.ee_block = cpu_to_le32(cur);
1771                         ext4_ext_store_pblock(
1772                                 &newex, start_pblk + cur - start);
1773                         newex.ee_len = cpu_to_le16(map.m_len);
1774                         if (ext4_ext_is_unwritten(ex))
1775                                 ext4_ext_mark_unwritten(&newex);
1776                         down_write(&EXT4_I(inode)->i_data_sem);
1777                         ret = ext4_ext_insert_extent(
1778                                 NULL, inode, &path, &newex, 0);
1779                         up_write((&EXT4_I(inode)->i_data_sem));
1780                         ext4_free_ext_path(path);
1781                         if (ret)
1782                                 goto out;
1783                         goto next;
1784                 }
1785
1786                 if (start_pblk + cur - start != map.m_pblk) {
1787                         /*
1788                          * Logical to physical mapping changed. This can happen
1789                          * if this range was removed and then reallocated to
1790                          * map to new physical blocks during a fast commit.
1791                          */
1792                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1793                                         ext4_ext_is_unwritten(ex),
1794                                         start_pblk + cur - start);
1795                         if (ret)
1796                                 goto out;
1797                         /*
1798                          * Mark the old blocks as free since they aren't used
1799                          * anymore. We maintain an array of all the modified
1800                          * inodes. In case these blocks are still used at either
1801                          * a different logical range in the same inode or in
1802                          * some different inode, we will mark them as allocated
1803                          * at the end of the FC replay using our array of
1804                          * modified inodes.
1805                          */
1806                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1807                         goto next;
1808                 }
1809
1810                 /* Range is mapped and needs a state change */
1811                 ext4_debug("Converting from %ld to %d %lld",
1812                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1813                         ext4_ext_is_unwritten(ex), map.m_pblk);
1814                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1815                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1816                 if (ret)
1817                         goto out;
1818                 /*
1819                  * We may have split the extent tree while toggling the state.
1820                  * Try to shrink the extent tree now.
1821                  */
1822                 ext4_ext_replay_shrink_inode(inode, start + len);
1823 next:
1824                 cur += map.m_len;
1825                 remaining -= map.m_len;
1826         }
1827         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1828                                         sb->s_blocksize_bits);
1829 out:
1830         iput(inode);
1831         return 0;
1832 }
1833
1834 /* Replay DEL_RANGE tag */
1835 static int
1836 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1837                          u8 *val)
1838 {
1839         struct inode *inode;
1840         struct ext4_fc_del_range lrange;
1841         struct ext4_map_blocks map;
1842         ext4_lblk_t cur, remaining;
1843         int ret;
1844
1845         memcpy(&lrange, val, sizeof(lrange));
1846         cur = le32_to_cpu(lrange.fc_lblk);
1847         remaining = le32_to_cpu(lrange.fc_len);
1848
1849         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1850                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1851
1852         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1853         if (IS_ERR(inode)) {
1854                 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1855                 return 0;
1856         }
1857
1858         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1859         if (ret)
1860                 goto out;
1861
1862         ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1863                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1864                         le32_to_cpu(lrange.fc_len));
1865         while (remaining > 0) {
1866                 map.m_lblk = cur;
1867                 map.m_len = remaining;
1868
1869                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1870                 if (ret < 0)
1871                         goto out;
1872                 if (ret > 0) {
1873                         remaining -= ret;
1874                         cur += ret;
1875                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1876                 } else {
1877                         remaining -= map.m_len;
1878                         cur += map.m_len;
1879                 }
1880         }
1881
1882         down_write(&EXT4_I(inode)->i_data_sem);
1883         ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1884                                 le32_to_cpu(lrange.fc_lblk) +
1885                                 le32_to_cpu(lrange.fc_len) - 1);
1886         up_write(&EXT4_I(inode)->i_data_sem);
1887         if (ret)
1888                 goto out;
1889         ext4_ext_replay_shrink_inode(inode,
1890                 i_size_read(inode) >> sb->s_blocksize_bits);
1891         ext4_mark_inode_dirty(NULL, inode);
1892 out:
1893         iput(inode);
1894         return 0;
1895 }
1896
1897 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1898 {
1899         struct ext4_fc_replay_state *state;
1900         struct inode *inode;
1901         struct ext4_ext_path *path = NULL;
1902         struct ext4_map_blocks map;
1903         int i, ret, j;
1904         ext4_lblk_t cur, end;
1905
1906         state = &EXT4_SB(sb)->s_fc_replay_state;
1907         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1908                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1909                         EXT4_IGET_NORMAL);
1910                 if (IS_ERR(inode)) {
1911                         ext4_debug("Inode %d not found.",
1912                                 state->fc_modified_inodes[i]);
1913                         continue;
1914                 }
1915                 cur = 0;
1916                 end = EXT_MAX_BLOCKS;
1917                 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1918                         iput(inode);
1919                         continue;
1920                 }
1921                 while (cur < end) {
1922                         map.m_lblk = cur;
1923                         map.m_len = end - cur;
1924
1925                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1926                         if (ret < 0)
1927                                 break;
1928
1929                         if (ret > 0) {
1930                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1931                                 if (!IS_ERR(path)) {
1932                                         for (j = 0; j < path->p_depth; j++)
1933                                                 ext4_mb_mark_bb(inode->i_sb,
1934                                                         path[j].p_block, 1, 1);
1935                                         ext4_free_ext_path(path);
1936                                 }
1937                                 cur += ret;
1938                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1939                                                         map.m_len, 1);
1940                         } else {
1941                                 cur = cur + (map.m_len ? map.m_len : 1);
1942                         }
1943                 }
1944                 iput(inode);
1945         }
1946 }
1947
1948 /*
1949  * Check if block is in excluded regions for block allocation. The simple
1950  * allocator that runs during replay phase is calls this function to see
1951  * if it is okay to use a block.
1952  */
1953 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1954 {
1955         int i;
1956         struct ext4_fc_replay_state *state;
1957
1958         state = &EXT4_SB(sb)->s_fc_replay_state;
1959         for (i = 0; i < state->fc_regions_valid; i++) {
1960                 if (state->fc_regions[i].ino == 0 ||
1961                         state->fc_regions[i].len == 0)
1962                         continue;
1963                 if (in_range(blk, state->fc_regions[i].pblk,
1964                                         state->fc_regions[i].len))
1965                         return true;
1966         }
1967         return false;
1968 }
1969
1970 /* Cleanup function called after replay */
1971 void ext4_fc_replay_cleanup(struct super_block *sb)
1972 {
1973         struct ext4_sb_info *sbi = EXT4_SB(sb);
1974
1975         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1976         kfree(sbi->s_fc_replay_state.fc_regions);
1977         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1978 }
1979
1980 static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
1981                                            u8 *val, u8 *end)
1982 {
1983         if (val + tl->fc_len > end)
1984                 return false;
1985
1986         /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do
1987          * journal rescan before do CRC check. Other tags length check will
1988          * rely on CRC check.
1989          */
1990         switch (tl->fc_tag) {
1991         case EXT4_FC_TAG_ADD_RANGE:
1992                 return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
1993         case EXT4_FC_TAG_TAIL:
1994                 return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
1995         case EXT4_FC_TAG_HEAD:
1996                 return (sizeof(struct ext4_fc_head) == tl->fc_len);
1997         case EXT4_FC_TAG_DEL_RANGE:
1998         case EXT4_FC_TAG_LINK:
1999         case EXT4_FC_TAG_UNLINK:
2000         case EXT4_FC_TAG_CREAT:
2001         case EXT4_FC_TAG_INODE:
2002         case EXT4_FC_TAG_PAD:
2003         default:
2004                 return true;
2005         }
2006 }
2007
2008 /*
2009  * Recovery Scan phase handler
2010  *
2011  * This function is called during the scan phase and is responsible
2012  * for doing following things:
2013  * - Make sure the fast commit area has valid tags for replay
2014  * - Count number of tags that need to be replayed by the replay handler
2015  * - Verify CRC
2016  * - Create a list of excluded blocks for allocation during replay phase
2017  *
2018  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
2019  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2020  * to indicate that scan has finished and JBD2 can now start replay phase.
2021  * It returns a negative error to indicate that there was an error. At the end
2022  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
2023  * to indicate the number of tags that need to replayed during the replay phase.
2024  */
2025 static int ext4_fc_replay_scan(journal_t *journal,
2026                                 struct buffer_head *bh, int off,
2027                                 tid_t expected_tid)
2028 {
2029         struct super_block *sb = journal->j_private;
2030         struct ext4_sb_info *sbi = EXT4_SB(sb);
2031         struct ext4_fc_replay_state *state;
2032         int ret = JBD2_FC_REPLAY_CONTINUE;
2033         struct ext4_fc_add_range ext;
2034         struct ext4_fc_tl tl;
2035         struct ext4_fc_tail tail;
2036         __u8 *start, *end, *cur, *val;
2037         struct ext4_fc_head head;
2038         struct ext4_extent *ex;
2039
2040         state = &sbi->s_fc_replay_state;
2041
2042         start = (u8 *)bh->b_data;
2043         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2044
2045         if (state->fc_replay_expected_off == 0) {
2046                 state->fc_cur_tag = 0;
2047                 state->fc_replay_num_tags = 0;
2048                 state->fc_crc = 0;
2049                 state->fc_regions = NULL;
2050                 state->fc_regions_valid = state->fc_regions_used =
2051                         state->fc_regions_size = 0;
2052                 /* Check if we can stop early */
2053                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2054                         != EXT4_FC_TAG_HEAD)
2055                         return 0;
2056         }
2057
2058         if (off != state->fc_replay_expected_off) {
2059                 ret = -EFSCORRUPTED;
2060                 goto out_err;
2061         }
2062
2063         state->fc_replay_expected_off++;
2064         for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
2065              cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2066                 ext4_fc_get_tl(&tl, cur);
2067                 val = cur + EXT4_FC_TAG_BASE_LEN;
2068                 if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
2069                         ret = state->fc_replay_num_tags ?
2070                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2071                         goto out_err;
2072                 }
2073                 ext4_debug("Scan phase, tag:%s, blk %lld\n",
2074                            tag2str(tl.fc_tag), bh->b_blocknr);
2075                 switch (tl.fc_tag) {
2076                 case EXT4_FC_TAG_ADD_RANGE:
2077                         memcpy(&ext, val, sizeof(ext));
2078                         ex = (struct ext4_extent *)&ext.fc_ex;
2079                         ret = ext4_fc_record_regions(sb,
2080                                 le32_to_cpu(ext.fc_ino),
2081                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2082                                 ext4_ext_get_actual_len(ex), 0);
2083                         if (ret < 0)
2084                                 break;
2085                         ret = JBD2_FC_REPLAY_CONTINUE;
2086                         fallthrough;
2087                 case EXT4_FC_TAG_DEL_RANGE:
2088                 case EXT4_FC_TAG_LINK:
2089                 case EXT4_FC_TAG_UNLINK:
2090                 case EXT4_FC_TAG_CREAT:
2091                 case EXT4_FC_TAG_INODE:
2092                 case EXT4_FC_TAG_PAD:
2093                         state->fc_cur_tag++;
2094                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2095                                 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2096                         break;
2097                 case EXT4_FC_TAG_TAIL:
2098                         state->fc_cur_tag++;
2099                         memcpy(&tail, val, sizeof(tail));
2100                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2101                                                 EXT4_FC_TAG_BASE_LEN +
2102                                                 offsetof(struct ext4_fc_tail,
2103                                                 fc_crc));
2104                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2105                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2106                                 state->fc_replay_num_tags = state->fc_cur_tag;
2107                                 state->fc_regions_valid =
2108                                         state->fc_regions_used;
2109                         } else {
2110                                 ret = state->fc_replay_num_tags ?
2111                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2112                         }
2113                         state->fc_crc = 0;
2114                         break;
2115                 case EXT4_FC_TAG_HEAD:
2116                         memcpy(&head, val, sizeof(head));
2117                         if (le32_to_cpu(head.fc_features) &
2118                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2119                                 ret = -EOPNOTSUPP;
2120                                 break;
2121                         }
2122                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2123                                 ret = JBD2_FC_REPLAY_STOP;
2124                                 break;
2125                         }
2126                         state->fc_cur_tag++;
2127                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2128                                 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2129                         break;
2130                 default:
2131                         ret = state->fc_replay_num_tags ?
2132                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2133                 }
2134                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2135                         break;
2136         }
2137
2138 out_err:
2139         trace_ext4_fc_replay_scan(sb, ret, off);
2140         return ret;
2141 }
2142
2143 /*
2144  * Main recovery path entry point.
2145  * The meaning of return codes is similar as above.
2146  */
2147 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2148                                 enum passtype pass, int off, tid_t expected_tid)
2149 {
2150         struct super_block *sb = journal->j_private;
2151         struct ext4_sb_info *sbi = EXT4_SB(sb);
2152         struct ext4_fc_tl tl;
2153         __u8 *start, *end, *cur, *val;
2154         int ret = JBD2_FC_REPLAY_CONTINUE;
2155         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2156         struct ext4_fc_tail tail;
2157
2158         if (pass == PASS_SCAN) {
2159                 state->fc_current_pass = PASS_SCAN;
2160                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2161         }
2162
2163         if (state->fc_current_pass != pass) {
2164                 state->fc_current_pass = pass;
2165                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2166         }
2167         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2168                 ext4_debug("Replay stops\n");
2169                 ext4_fc_set_bitmaps_and_counters(sb);
2170                 return 0;
2171         }
2172
2173 #ifdef CONFIG_EXT4_DEBUG
2174         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2175                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2176                 return JBD2_FC_REPLAY_STOP;
2177         }
2178 #endif
2179
2180         start = (u8 *)bh->b_data;
2181         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2182
2183         for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
2184              cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2185                 ext4_fc_get_tl(&tl, cur);
2186                 val = cur + EXT4_FC_TAG_BASE_LEN;
2187
2188                 if (state->fc_replay_num_tags == 0) {
2189                         ret = JBD2_FC_REPLAY_STOP;
2190                         ext4_fc_set_bitmaps_and_counters(sb);
2191                         break;
2192                 }
2193
2194                 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2195                 state->fc_replay_num_tags--;
2196                 switch (tl.fc_tag) {
2197                 case EXT4_FC_TAG_LINK:
2198                         ret = ext4_fc_replay_link(sb, &tl, val);
2199                         break;
2200                 case EXT4_FC_TAG_UNLINK:
2201                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2202                         break;
2203                 case EXT4_FC_TAG_ADD_RANGE:
2204                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2205                         break;
2206                 case EXT4_FC_TAG_CREAT:
2207                         ret = ext4_fc_replay_create(sb, &tl, val);
2208                         break;
2209                 case EXT4_FC_TAG_DEL_RANGE:
2210                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2211                         break;
2212                 case EXT4_FC_TAG_INODE:
2213                         ret = ext4_fc_replay_inode(sb, &tl, val);
2214                         break;
2215                 case EXT4_FC_TAG_PAD:
2216                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2217                                              tl.fc_len, 0);
2218                         break;
2219                 case EXT4_FC_TAG_TAIL:
2220                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2221                                              0, tl.fc_len, 0);
2222                         memcpy(&tail, val, sizeof(tail));
2223                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2224                         break;
2225                 case EXT4_FC_TAG_HEAD:
2226                         break;
2227                 default:
2228                         trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2229                         ret = -ECANCELED;
2230                         break;
2231                 }
2232                 if (ret < 0)
2233                         break;
2234                 ret = JBD2_FC_REPLAY_CONTINUE;
2235         }
2236         return ret;
2237 }
2238
2239 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2240 {
2241         /*
2242          * We set replay callback even if fast commit disabled because we may
2243          * could still have fast commit blocks that need to be replayed even if
2244          * fast commit has now been turned off.
2245          */
2246         journal->j_fc_replay_callback = ext4_fc_replay;
2247         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2248                 return;
2249         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2250 }
2251
/*
 * Human-readable names for the fast commit ineligibility reasons, printed by
 * ext4_fc_info_show() next to the per-reason counters. Entries are indexed
 * by reason number, so the order must match the reason enumeration.
 *
 * Both the strings and the pointer table are const: nothing writes to the
 * table at run time.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2264
2265 int ext4_fc_info_show(struct seq_file *seq, void *v)
2266 {
2267         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2268         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2269         int i;
2270
2271         if (v != SEQ_START_TOKEN)
2272                 return 0;
2273
2274         seq_printf(seq,
2275                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2276                    stats->fc_num_commits, stats->fc_ineligible_commits,
2277                    stats->fc_numblks,
2278                    div_u64(stats->s_fc_avg_commit_time, 1000));
2279         seq_puts(seq, "Ineligible reasons:\n");
2280         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2281                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2282                         stats->fc_ineligible_reason_count[i]);
2283
2284         return 0;
2285 }
2286
2287 int __init ext4_fc_init_dentry_cache(void)
2288 {
2289         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2290                                            SLAB_RECLAIM_ACCOUNT);
2291
2292         if (ext4_fc_dentry_cachep == NULL)
2293                 return -ENOMEM;
2294
2295         return 0;
2296 }
2297
2298 void ext4_fc_destroy_dentry_cache(void)
2299 {
2300         kmem_cache_destroy(ext4_fc_dentry_cachep);
2301 }