Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
authorLinus Torvalds <torvalds@linux-foundation.org>
Sat, 30 Nov 2019 18:53:02 +0000 (10:53 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sat, 30 Nov 2019 18:53:02 +0000 (10:53 -0800)
Pull ext4 updates from Ted Ts'o:
 "This merge window saw the following new features added to ext4:

   - Direct I/O via iomap (required the iomap-for-next branch from
     Darrick as a prereq).

   - Support for using dioread-nolock where the block size < page size.

   - Support for encryption for file systems where the block size < page
     size.

   - Rework of journal credits handling so a revoke-heavy workload will
     not cause the journal to run out of space.

   - Replace bit-spinlocks with spinlocks in jbd2

  Also included were some bug fixes and cleanups, mostly to clean up
  corner cases from fuzzed file systems and error path handling"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (59 commits)
  ext4: work around deleting a file with i_nlink == 0 safely
  ext4: add more paranoia checking in ext4_expand_extra_isize handling
  jbd2: make jbd2_handle_buffer_credits() handle reserved handles
  ext4: fix a bug in ext4_wait_for_tail_page_commit
  ext4: bio_alloc with __GFP_DIRECT_RECLAIM never fails
  ext4: code cleanup for get_next_id
  ext4: fix leak of quota reservations
  ext4: remove unused variable warning in parse_options()
  ext4: Enable encryption for subpage-sized blocks
  fs/buffer.c: support fscrypt in block_read_full_page()
  ext4: Add error handling for io_end_vec struct allocation
  jbd2: Fine tune estimate of necessary descriptor blocks
  jbd2: Provide trace event for handle restarts
  ext4: Reserve revoke credits for freed blocks
  jbd2: Make credit checking more strict
  jbd2: Rename h_buffer_credits to h_total_credits
  jbd2: Reserve space for revoke descriptor blocks
  jbd2: Drop jbd2_space_needed()
  jbd2: Account descriptor blocks into t_outstanding_credits
  jbd2: Factor out common parts of stopping and restarting a handle
  ...

30 files changed:
Documentation/filesystems/fscrypt.rst
fs/buffer.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/fsync.c
fs/ext4/ialloc.c
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/migrate.c
fs/ext4/namei.c
fs/ext4/page-io.c
fs/ext4/readpage.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/revoke.c
fs/jbd2/transaction.c
fs/ocfs2/alloc.c
fs/ocfs2/journal.c
fs/ocfs2/suballoc.c
include/linux/jbd2.h
include/linux/journal-head.h
include/trace/events/ext4.h
include/trace/events/jbd2.h

index 471a511..68c2bc8 100644 (file)
@@ -342,8 +342,8 @@ Contents encryption
 -------------------
 
 For file contents, each filesystem block is encrypted independently.
-Currently, only the case where the filesystem block size is equal to
-the system's page size (usually 4096 bytes) is supported.
+Starting from Linux kernel 5.5, encryption of filesystems with block
+size less than system's page size is supported.
 
 Each block's IV is set to the logical block number within the file as
 a little endian number, except that:
index 86a38b9..d398380 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/pagevec.h>
 #include <linux/sched/mm.h>
 #include <trace/events/block.h>
+#include <linux/fscrypt.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
@@ -246,10 +247,6 @@ out:
        return ret;
 }
 
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
        unsigned long flags;
@@ -307,6 +304,47 @@ still_busy:
        return;
 }
 
+struct decrypt_bh_ctx {        /* per-buffer context handed to decrypt_bh() */
+       struct work_struct work;        /* work item initialised with decrypt_bh() as its handler */
+       struct buffer_head *bh;         /* buffer head whose data decrypt_bh() decrypts */
+};
+
+static void decrypt_bh(struct work_struct *work)       /* work handler: decrypt one buffer, then complete its read */
+{
+       struct decrypt_bh_ctx *ctx =
+               container_of(work, struct decrypt_bh_ctx, work);        /* recover the context embedding @work */
+       struct buffer_head *bh = ctx->bh;
+       int err;
+
+       err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
+                                              bh_offset(bh));  /* decrypt b_size bytes at this buffer's offset in its page */
+       end_buffer_async_read(bh, err == 0);    /* report the buffer uptodate only if decryption returned 0 */
+       kfree(ctx);     /* ctx was kmalloc'ed by end_buffer_async_read_io() */
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ *
+ * This is the b_end_io callback installed by mark_buffer_async_read().
+ * When the read succeeded (@uptodate != 0), CONFIG_FS_ENCRYPTION is enabled
+ * and the page's host inode is an encrypted regular file, completion is
+ * deferred: a decrypt_bh_ctx is allocated and queued with
+ * fscrypt_enqueue_decrypt_work(), and decrypt_bh() later calls
+ * end_buffer_async_read() with the decryption result.
+ *
+ * In every other case end_buffer_async_read() is called directly.  If the
+ * context allocation fails, the buffer is completed with uptodate == 0.
+ */
+static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
+{
+       /* Decrypt if needed */
+       if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) &&
+           IS_ENCRYPTED(bh->b_page->mapping->host) &&
+           S_ISREG(bh->b_page->mapping->host->i_mode)) {
+               struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+
+               if (ctx) {
+                       INIT_WORK(&ctx->work, decrypt_bh);
+                       ctx->bh = bh;
+                       fscrypt_enqueue_decrypt_work(&ctx->work);
+                       return; /* decrypt_bh() finishes the read and frees ctx */
+               }
+               uptodate = 0;   /* no memory for the context: fail this buffer's read */
+       }
+       end_buffer_async_read(bh, uptodate);
+}
+
 /*
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
@@ -379,7 +417,7 @@ EXPORT_SYMBOL(end_buffer_async_write);
  */
 static void mark_buffer_async_read(struct buffer_head *bh)
 {
-       bh->b_end_io = end_buffer_async_read;
+       bh->b_end_io = end_buffer_async_read_io;
        set_buffer_async_read(bh);
 }
 
index b3a2cc7..f8578ca 100644 (file)
@@ -198,6 +198,12 @@ struct ext4_system_blocks {
  */
 #define        EXT4_IO_END_UNWRITTEN   0x0001
 
+struct ext4_io_end_vec {       /* one file range recorded on an ext4_io_end */
+       struct list_head list;          /* link in ext4_io_end.list_vec */
+       loff_t offset;                  /* offset in the file where the range starts */
+       ssize_t size;                   /* size of the extent (presumably in bytes - confirm) */
+};
+
 /*
  * For converting unwritten extents on a work queue. 'handle' is used for
  * buffered writeback.
@@ -211,8 +217,7 @@ typedef struct ext4_io_end {
                                                 * bios covering the extent */
        unsigned int            flag;           /* unwritten or not */
        atomic_t                count;          /* reference counter */
-       loff_t                  offset;         /* offset in the file */
-       ssize_t                 size;           /* size of the extent */
+       struct list_head        list_vec;       /* list of ext4_io_end_vec */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -1579,7 +1584,6 @@ enum {
        EXT4_STATE_NO_EXPAND,           /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE,      /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
-       EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
        EXT4_STATE_NEWENTRY,            /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
@@ -2562,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh_result, int create);
 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
 int ext4_walk_page_buffers(handle_t *handle,
@@ -2606,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode);
 extern int ext4_truncate(struct inode *);
 extern int ext4_break_layouts(struct inode *);
 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
@@ -3266,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
 extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+                                            ext4_io_end_t *io_end);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -3298,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
 extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+                                      int check_cred, int restart_cred,
+                                      int revoke_cred);
+
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3324,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               int len,
                               struct writeback_control *wbc,
                               bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
 
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@ -3381,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 }
 
 extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_report_ops;
 
 static inline int ext4_buffer_uptodate(struct buffer_head *bh)
 {
index 7c70b08..d3b8cde 100644 (file)
@@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb)
 }
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-                                 int type, int blocks, int rsv_blocks)
+                                 int type, int blocks, int rsv_blocks,
+                                 int revoke_creds)
 {
        journal_t *journal;
        int err;
 
-       trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+       trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
+                                _RET_IP_);
        err = ext4_journal_check_start(sb);
        if (err < 0)
                return ERR_PTR(err);
@@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
        journal = EXT4_SB(sb)->s_journal;
        if (!journal)
                return ext4_get_nojournal();
-       return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
-                                  type, line);
+       return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
+                                  GFP_NOFS, type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                return ext4_get_nojournal();
 
        sb = handle->h_journal->j_private;
-       trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
-                                         _RET_IP_);
+       trace_ext4_journal_start_reserved(sb,
+                               jbd2_handle_buffer_credits(handle), _RET_IP_);
        err = ext4_journal_check_start(sb);
        if (err < 0) {
                jbd2_journal_free_reserved(handle);
@@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
        return handle;
 }
 
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+                                 int extend_cred, int revoke_cred)     /* returns 0 if nothing to do, else ext4_journal_extend()'s result */
+{
+       if (!ext4_handle_valid(handle))
+               return 0;       /* ext4_handle_valid() is false: report success without touching credits */
+       if (jbd2_handle_buffer_credits(handle) >= check_cred &&
+           handle->h_revoke_credits >= revoke_cred)
+               return 0;       /* both buffer and revoke credits already suffice */
+       extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle));        /* ask only for the shortfall, never a negative amount */
+       revoke_cred = max(0, revoke_cred - handle->h_revoke_credits);   /* same for revoke credits */
+       return ext4_journal_extend(handle, extend_cred, revoke_cred);
+}
+
 static void ext4_journal_abort_handle(const char *caller, unsigned int line,
                                      const char *err_fn,
                                      struct buffer_head *bh,
@@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                       handle->h_type,
                                       handle->h_line_no,
                                       handle->h_requested_credits,
-                                      handle->h_buffer_credits, err);
+                                      jbd2_handle_buffer_credits(handle), err);
                                return err;
                        }
                        ext4_error_inode(inode, where, line,
@@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                         handle->h_type,
                                         handle->h_line_no,
                                         handle->h_requested_credits,
-                                        handle->h_buffer_credits, err);
+                                        jbd2_handle_buffer_credits(handle),
+                                        err);
                }
        } else {
                if (inode)
index ef8fcf7..a6b9b66 100644 (file)
@@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
        __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-                                 int type, int blocks, int rsv_blocks);
+                                 int type, int blocks, int rsv_blocks,
+                                 int revoke_creds);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle)
        return 0;
 }
 
-static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
+                                                   int blocks)
 {
-       if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
-               return 0;
-       return 1;
+       /* Freeing each metadata block can result in freeing one cluster */
+       return blocks * EXT4_SB(sb)->s_cluster_ratio;
+}
+
+static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
+{
+       return ext4_free_metadata_revoke_credits(sb, 8);
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)                       \
-       __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+       __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0,   \
+                               ext4_trans_default_revoke_credits(sb))
 
 #define ext4_journal_start(inode, type, nblocks)                       \
-       __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+       __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0,   \
+                            ext4_trans_default_revoke_credits((inode)->i_sb))
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
+       __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
+                            ext4_trans_default_revoke_credits((inode)->i_sb))
 
-#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
-       __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
+       __ext4_journal_start((inode), __LINE__, (type), (blocks), 0,    \
+                            (revoke_creds))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
                                             unsigned int line, int type,
-                                            int blocks, int rsv_blocks)
+                                            int blocks, int rsv_blocks,
+                                            int revoke_creds)
 {
        return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
-                                      rsv_blocks);
+                                      rsv_blocks, revoke_creds);
 }
 
 #define ext4_journal_stop(handle) \
@@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void)
        return journal_current_handle();
 }
 
-static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
 {
        if (ext4_handle_valid(handle))
-               return jbd2_journal_extend(handle, nblocks);
+               return jbd2_journal_extend(handle, nblocks, revoke);
        return 0;
 }
 
-static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+static inline int ext4_journal_restart(handle_t *handle, int nblocks,
+                                      int revoke)
 {
        if (ext4_handle_valid(handle))
-               return jbd2_journal_restart(handle, nblocks);
+               return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
        return 0;
 }
 
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+                                 int extend_cred, int revoke_cred);
+
+
+/*
+ * Ensure @handle has at least @check_cred buffer credits and @revoke_cred
+ * revoke credits available. If not, the transaction will be extended or
+ * restarted to contain at least @extend_cred credits. @fn is an expression
+ * that is evaluated only when the extension attempt returned a positive
+ * value, right before the transaction is restarted, to allow for cleanup;
+ * if it evaluates to a negative value the restart is skipped and that value
+ * is returned.
+ *
+ * The return value is < 0 in case of error, 0 in case the handle has enough
+ * credits or transaction extension succeeded, 1 in case transaction had to be
+ * restarted.
+ *
+ * NOTE(review): the statement expression declares a local named 'err', so an
+ * argument expression that refers to a caller variable called 'err' would
+ * bind to the macro's local instead.
+ */
+#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred,        \
+                                      revoke_cred, fn) \
+({                                                                     \
+       __label__ __ensure_end;                                         \
+       int err = __ext4_journal_ensure_credits((handle), (check_cred), \
+                                       (extend_cred), (revoke_cred));  \
+                                                                       \
+       if (err <= 0)                                                   \
+               goto __ensure_end;                                      \
+       err = (fn);                                                     \
+       if (err < 0)                                                    \
+               goto __ensure_end;                                      \
+       err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
+       if (err == 0)                                                   \
+               err = 1;                                                \
+__ensure_end:                                                          \
+       err;                                                            \
+})
+
+/*
+ * Ensure given handle has at least @credits buffer credits and @revoke_creds
+ * revoke credits available, possibly restarting transaction if needed. The
+ * same @credits value is used both as the amount to check for and as the
+ * amount to extend/restart with, and no cleanup action runs before a restart
+ * (the 'fn' argument is the constant 0).
+ *
+ * The revoke credit count is taken verbatim from the caller; callers
+ * presumably pass at least ext4_trans_default_revoke_credits(sb), as freeing
+ * one or two blocks is very common pattern and requesting this is very cheap,
+ * but nothing here enforces that.
+ *
+ * Returns the value of ext4_journal_ensure_credits_fn(): < 0 on error, 0 if
+ * the handle had or was extended to enough credits, 1 if it was restarted.
+ */
+static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
+                                             int revoke_creds)
+{
+       return ext4_journal_ensure_credits_fn(handle, credits, credits,
+                               revoke_creds, 0);
+}
+
 static inline int ext4_journal_blocks_per_page(struct inode *inode)
 {
        if (EXT4_JOURNAL(inode) != NULL)
@@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
                return EXT4_INODE_WRITEBACK_DATA_MODE;  /* writeback */
        /* We do not support data journalling with delayed allocation */
        if (!S_ISREG(inode->i_mode) ||
+           ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
            test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
            !test_opt(inode->i_sb, DELALLOC))) {
@@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode)
        return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
 }
 
+static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)       /* revoke credits to reserve for freeing @blocks data blocks */
+{
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+               return 0;       /* mount-wide data=journal: no revoke credits requested */
+       if (!ext4_should_journal_data(inode))
+               return 0;       /* this inode's data is not journalled: nothing to reserve */
+       /*
+        * Data blocks in one extent are contiguous, just account for partial
+        * clusters at extent boundaries: one credit per block plus up to
+        * (s_cluster_ratio - 1) extra at each of the two ends.
+        */
+       return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
+}
+
 /*
  * This function controls whether or not we should try to go down the
  * dioread_nolock code paths, which makes it safe to avoid taking
index fb0f99d..0e8708b 100644 (file)
@@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle,
 static int ext4_find_delayed_extent(struct inode *inode,
                                    struct extent_status *newes);
 
-static int ext4_ext_truncate_extend_restart(handle_t *handle,
-                                           struct inode *inode,
-                                           int needed)
+static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
 {
-       int err;
-
-       if (!ext4_handle_valid(handle))
-               return 0;
-       if (handle->h_buffer_credits >= needed)
-               return 0;
        /*
-        * If we need to extend the journal get a few extra blocks
-        * while we're at it for efficiency's sake.
+        * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
+        * moment, get_block can be called only for blocks inside i_size since
+        * page cache has been already dropped and writes are blocked by
+        * i_mutex. So we can safely drop the i_data_sem here.
         */
-       needed += 3;
-       err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
-       if (err <= 0)
-               return err;
-       err = ext4_truncate_restart_trans(handle, inode, needed);
-       if (err == 0)
-               err = -EAGAIN;
+       BUG_ON(EXT4_JOURNAL(inode) == NULL);
+       ext4_discard_preallocations(inode);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       *dropped = 1;
+       return 0;
+}
 
-       return err;
+/*
+ * Make sure 'handle' has at least 'check_cred' credits. If not, restart
+ * transaction with 'restart_cred' credits. The function drops i_data_sem
+ * when restarting transaction and gets it after transaction is restarted.
+ *
+ * The function returns 0 on success, 1 if transaction had to be restarted,
+ * and < 0 in case of fatal error.
+ */
+int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+                               int check_cred, int restart_cred,
+                               int revoke_cred)
+{
+       int ret;
+       int dropped = 0;
+
+       ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
+               revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
+       if (dropped)
+               down_write(&EXT4_I(inode)->i_data_sem);
+       return ret;
 }
 
 /*
@@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
         */
        if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
                return 0;
-       /*
-        * The check for IO to unwritten extent is somewhat racy as we
-        * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
-        * dropping i_data_sem. But reserved blocks should save us in that
-        * case.
-        */
+
        if (ext4_ext_is_unwritten(ex1) &&
-           (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
-            atomic_read(&EXT4_I(inode)->i_unwritten) ||
-            (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+           ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
                return 0;
 #ifdef AGGRESSIVE_TEST
        if (ext1_ee_len >= 4)
@@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
         * group descriptor to release the extent tree block.  If we
         * can't get the journal credits, give up.
         */
-       if (ext4_journal_extend(handle, 2))
+       if (ext4_journal_extend(handle, 2,
+                       ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
                return;
 
        /*
@@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int err = 0, correct_index = 0;
-       int depth = ext_depth(inode), credits;
+       int depth = ext_depth(inode), credits, revoke_credits;
        struct ext4_extent_header *eh;
        ext4_lblk_t a, b;
        unsigned num;
@@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                        credits += (ext_depth(inode)) + 1;
                }
                credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
-               err = ext4_ext_truncate_extend_restart(handle, inode, credits);
-               if (err)
+               /*
+                * We may end up freeing some index blocks and data from the
+                * punched range. Note that partial clusters are accounted for
+                * by ext4_free_data_revoke_credits().
+                */
+               revoke_credits =
+                       ext4_free_metadata_revoke_credits(inode->i_sb,
+                                                         ext_depth(inode)) +
+                       ext4_free_data_revoke_credits(inode, b - a + 1);
+
+               err = ext4_datasem_ensure_credits(handle, inode, credits,
+                                                 credits, revoke_credits);
+               if (err) {
+                       if (err > 0)
+                               err = -EAGAIN;
                        goto out;
+               }
 
                err = ext4_ext_get_access(handle, inode, path + depth);
                if (err)
@@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
        ext_debug("truncate since %u to %u\n", start, end);
 
        /* probably first extent we're gonna free will be last in block */
-       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+       handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
+                       depth + 1,
+                       ext4_free_metadata_revoke_credits(inode->i_sb, depth));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
 
@@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
        int ret = 0;
        int ret2 = 0;
        struct ext4_map_blocks map;
-       unsigned int credits, blkbits = inode->i_blkbits;
+       unsigned int blkbits = inode->i_blkbits;
+       unsigned int credits = 0;
 
        map.m_lblk = offset >> blkbits;
        max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
 
-       /*
-        * This is somewhat ugly but the idea is clear: When transaction is
-        * reserved, everything goes into it. Otherwise we rather start several
-        * smaller transactions for conversion of each extent separately.
-        */
-       if (handle) {
-               handle = ext4_journal_start_reserved(handle,
-                                                    EXT4_HT_EXT_CONVERT);
-               if (IS_ERR(handle))
-                       return PTR_ERR(handle);
-               credits = 0;
-       } else {
+       if (!handle) {
                /*
                 * credits to insert 1 extent into extent tree
                 */
@@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                if (ret <= 0 || ret2)
                        break;
        }
-       if (!credits)
-               ret2 = ext4_journal_stop(handle);
        return ret > 0 ? ret2 : ret;
 }
 
+/*
+ * Convert every range recorded on @io_end->list_vec by calling
+ * ext4_convert_unwritten_extents() on it, stopping at the first range for
+ * which that call returns non-zero.
+ *
+ * If @handle is non-NULL it is a reserved handle: it is started here, shared
+ * by all conversions, and stopped before returning. If @handle is NULL it is
+ * passed through as NULL to each conversion.
+ *
+ * Returns a negative error from starting the handle or from a conversion;
+ * otherwise the result of ext4_journal_stop() (0 when no handle was used).
+ * An empty list_vec is not an error.
+ */
+int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
+{
+       /* ret must start at 0: the loop body never runs for an empty list */
+       int ret = 0, err = 0;
+       struct ext4_io_end_vec *io_end_vec;
+
+       /*
+        * This is somewhat ugly but the idea is clear: When transaction is
+        * reserved, everything goes into it. Otherwise we rather start several
+        * smaller transactions for conversion of each extent separately.
+        */
+       if (handle) {
+               handle = ext4_journal_start_reserved(handle,
+                                                    EXT4_HT_EXT_CONVERT);
+               if (IS_ERR(handle))
+                       return PTR_ERR(handle);
+       }
+
+       list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+               ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+                                                    io_end_vec->offset,
+                                                    io_end_vec->size);
+               if (ret)
+                       break;
+       }
+
+       if (handle)
+               err = ext4_journal_stop(handle);
+
+       /* A conversion error takes precedence over the journal-stop result */
+       return ret < 0 ? ret : err;
+}
+
 /*
  * If newes is not existing extent (newes->ec_pblk equals zero) find
  * delayed extent at start of newes and update newes accordingly and
@@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode,
         * descriptor) for each block group; assume two block
         * groups
         */
-       if (handle->h_buffer_credits < 7) {
-               credits = ext4_writepage_trans_blocks(inode);
-               err = ext4_ext_truncate_extend_restart(handle, inode, credits);
-               /* EAGAIN is success */
-               if (err && err != -EAGAIN)
-                       return err;
-       }
+       credits = ext4_writepage_trans_blocks(inode);
+       err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
+       if (err < 0)
+               return err;
 
        err = ext4_ext_get_access(handle, inode, path);
        return err;
index 8d2bbcc..6a7293a 100644 (file)
 #include <linux/pagevec.h>
 #include <linux/uio.h>
 #include <linux/mman.h>
+#include <linux/backing-dev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)    /* true unless one of the checks below rules out direct I/O for @inode */
+{
+       if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+               return false;   /* encrypted inode (only checked when encryption is built in) */
+       if (fsverity_active(inode))
+               return false;   /* fsverity_active() reports true for this inode */
+       if (ext4_should_journal_data(inode))
+               return false;   /* inode's data goes through the journal */
+       if (ext4_has_inline_data(inode))
+               return false;   /* inode has inline data */
+       return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)     /* direct I/O read path, called from ext4_file_read_iter() for IOCB_DIRECT */
+{
+       ssize_t ret;
+       struct inode *inode = file_inode(iocb->ki_filp);
+
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+               if (!inode_trylock_shared(inode))
+                       return -EAGAIN; /* IOCB_NOWAIT: do not block waiting for the inode lock */
+       } else {
+               inode_lock_shared(inode);
+       }
+
+       if (!ext4_dio_supported(inode)) {
+               inode_unlock_shared(inode);
+               /*
+                * Fallback to buffered I/O if the operation being performed on
+                * the inode is not supported by direct I/O. The IOCB_DIRECT
+                * flag needs to be cleared here in order to ensure that the
+                * direct I/O path within generic_file_read_iter() is not
+                * taken. The shared inode lock has already been dropped above,
+                * so the buffered read runs without it.
+                */
+               iocb->ki_flags &= ~IOCB_DIRECT;
+               return generic_file_read_iter(iocb, to);
+       }
+
+       ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+                          is_sync_kiocb(iocb));
+       inode_unlock_shared(inode);     /* lock is held only across the iomap_dio_rw() call */
+
+       file_accessed(iocb->ki_filp);   /* NOTE(review): presumably the atime update - confirm */
+       return ret;
+}
 
 #ifdef CONFIG_FS_DAX
 static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
+/*
+ * Read entry point: rejects I/O on a filesystem that has been forcibly shut
+ * down, returns early for zero-length reads, and then dispatches to the DAX,
+ * direct I/O or generic buffered read path, in that order of precedence.
+ */
 static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-       if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+       struct inode *inode = file_inode(iocb->ki_filp);
+
+       if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;
 
        if (!iov_iter_count(to))
                return 0; /* skip atime */
 
 #ifdef CONFIG_FS_DAX
-       if (IS_DAX(file_inode(iocb->ki_filp)))
+       if (IS_DAX(inode))
                return ext4_dax_read_iter(iocb, to);
 #endif
+       /* DAX inodes were handled above; IOCB_DIRECT selects the dio path. */
+       if (iocb->ki_flags & IOCB_DIRECT)
+               return ext4_dio_read_iter(iocb, to);
+
        return generic_file_read_iter(iocb, to);
 }
 
@@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
 
-static void ext4_unwritten_wait(struct inode *inode)
-{
-       wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
-       wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
 /*
  * This tests whether the IO in question is block-aligned or not.
  * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;
 
+       if (unlikely(IS_IMMUTABLE(inode)))
+               return -EPERM;
+
        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                return ret;
 
-       if (unlikely(IS_IMMUTABLE(inode)))
-               return -EPERM;
-
        /*
         * If we have encountered a bitmap-format file, the size limit
         * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
                        return -EFBIG;
                iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
        }
+
+       ret = file_modified(iocb->ki_filp);
+       if (ret)
+               return ret;
+
        return iov_iter_count(from);
 }
 
-#ifdef CONFIG_FS_DAX
-static ssize_t
-ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+/*
+ * Buffered write path.
+ *
+ * Refuses IOCB_NOWAIT with -EOPNOTSUPP, then, with the inode locked
+ * exclusively, runs ext4_write_checks() and generic_perform_write(). On a
+ * positive result the file position is advanced and generic_write_sync() is
+ * called after the lock has been released.
+ *
+ * Returns the value produced by the last step that ran: a byte count, zero,
+ * or a negative errno.
+ */
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+                                       struct iov_iter *from)
 {
-       struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;
+       struct inode *inode = file_inode(iocb->ki_filp);
 
-       if (!inode_trylock(inode)) {
-               if (iocb->ki_flags & IOCB_NOWAIT)
-                       return -EAGAIN;
-               inode_lock(inode);
-       }
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               return -EOPNOTSUPP;
+
+       inode_lock(inode);
        ret = ext4_write_checks(iocb, from);
        if (ret <= 0)
                goto out;
-       ret = file_remove_privs(iocb->ki_filp);
-       if (ret)
-               goto out;
-       ret = file_update_time(iocb->ki_filp);
-       if (ret)
-               goto out;
 
-       ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+       /* backing_dev_info is set only for the duration of the write. */
+       current->backing_dev_info = inode_to_bdi(inode);
+       ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+       current->backing_dev_info = NULL;
+
 out:
        inode_unlock(inode);
-       if (ret > 0)
+       /* Advance the position and sync only when something was written. */
+       if (likely(ret > 0)) {
+               iocb->ki_pos += ret;
                ret = generic_write_sync(iocb, ret);
+       }
+
        return ret;
 }
-#endif
 
-static ssize_t
-ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+/*
+ * Finish a write that may have extended the file.
+ *
+ * @inode:   inode that was written
+ * @offset:  file position at which the write started
+ * @written: bytes actually written, or a negative errno from the I/O
+ * @count:   bytes the write was asked to transfer
+ *
+ * The writers in this file call this after they have put the inode on the
+ * orphan list because offset + count exceeded i_disksize. Depending on the
+ * outcome this updates the inode size, truncates blocks left behind by a
+ * failed or short write, and takes the inode off the orphan list again.
+ *
+ * Returns @written, or a negative errno if a journal handle could not be
+ * started.
+ */
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+                                          ssize_t written, size_t count)
 {
+       handle_t *handle;
+       bool truncate = false;
+       u8 blkbits = inode->i_blkbits;
+       /*
+        * Block-aligned *byte* offsets, so they must be loff_t. Storing them
+        * in ext4_lblk_t (a 32-bit block number) would truncate any offset at
+        * or beyond 4 GiB and make the short-write comparison below wrong.
+        */
+       loff_t written_end, io_end;
+
+       /*
+        * Note that EXT4_I(inode)->i_disksize can get extended up to
+        * inode->i_size while the I/O was running due to writeback of delalloc
+        * blocks. But, the code in ext4_iomap_alloc() is careful to use
+        * zeroed/unwritten extents if this is possible; thus we won't leave
+        * uninitialized blocks in a file even if we didn't succeed in writing
+        * as much as we intended.
+        */
+       WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
+       if (offset + count <= EXT4_I(inode)->i_disksize) {
+               /*
+                * We need to ensure that the inode is removed from the orphan
+                * list if it has been added prematurely, due to writeback of
+                * delalloc blocks.
+                */
+               if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+                       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+
+                       if (IS_ERR(handle)) {
+                               /* No handle: call ext4_orphan_del() without one. */
+                               ext4_orphan_del(NULL, inode);
+                               return PTR_ERR(handle);
+                       }
+
+                       ext4_orphan_del(handle, inode);
+                       ext4_journal_stop(handle);
+               }
+
+               return written;
+       }
+
+       /* The I/O itself failed: only clean up, never update the size. */
+       if (written < 0)
+               goto truncate;
+
+       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+       if (IS_ERR(handle)) {
+               written = PTR_ERR(handle);
+               goto truncate;
+       }
+
+       if (ext4_update_inode_size(inode, offset + written))
+               ext4_mark_inode_dirty(handle, inode);
+
+       /*
+        * We may need to truncate allocated but not written blocks beyond EOF.
+        */
+       written_end = ALIGN(offset + written, 1 << blkbits);
+       io_end = ALIGN(offset + count, 1 << blkbits);
+       if (written_end < io_end && ext4_can_truncate(inode))
+               truncate = true;
+
+       /*
+        * Remove the inode from the orphan list if it has been extended and
+        * everything went OK.
+        */
+       if (!truncate && inode->i_nlink)
+               ext4_orphan_del(handle, inode);
+       ext4_journal_stop(handle);
+
+       if (truncate) {
+truncate:
+               ext4_truncate_failed_write(inode);
+               /*
+                * If the truncate operation failed early, then the inode may
+                * still be on the orphan list. In that case, we need to try to
+                * remove the inode from the in-memory linked list.
+                */
+               if (inode->i_nlink)
+                       ext4_orphan_del(NULL, inode);
+       }
+
+       return written;
+}
+
+/*
+ * Completion callback for direct I/O writes, installed through
+ * ext4_dio_write_ops.
+ *
+ * Passes a non-zero @error straight back. Otherwise, when @size is non-zero
+ * and @flags has IOMAP_DIO_UNWRITTEN set, converts the range starting at
+ * iocb->ki_pos and @size bytes long with ext4_convert_unwritten_extents()
+ * and returns its result. Returns 0 in every other case.
+ */
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+                                int error, unsigned int flags)
+{
+       loff_t offset = iocb->ki_pos;
        struct inode *inode = file_inode(iocb->ki_filp);
-       int o_direct = iocb->ki_flags & IOCB_DIRECT;
-       int unaligned_aio = 0;
-       int overwrite = 0;
+
+       if (error)
+               return error;
+
+       /* No journal handle is supplied for the conversion (NULL). */
+       if (size && flags & IOMAP_DIO_UNWRITTEN)
+               return ext4_convert_unwritten_extents(NULL, inode,
+                                                     offset, size);
+
+       return 0;
+}
+
+/* Direct I/O write callbacks handed to iomap_dio_rw(); only end_io is set. */
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+       .end_io = ext4_dio_write_end_io,
+};
+
+/*
+ * Direct I/O write path.
+ *
+ * Sequence: take the inode lock exclusively; fall back to
+ * ext4_buffered_write_iter() if ext4_dio_supported() refuses the inode; run
+ * ext4_write_checks(); serialize unaligned asynchronous writes; downgrade
+ * the lock for eligible overwrites; add the inode to the orphan list when
+ * the write reaches past i_disksize; issue the I/O through iomap_dio_rw();
+ * and finally push any bytes still left in @from through the buffered path.
+ *
+ * Returns the number of bytes written (direct plus buffered) or a negative
+ * errno; -EAGAIN when IOCB_NOWAIT is set and the lock was contended.
+ *
+ * NOTE(review): the code this replaces returned -EAGAIN for IOCB_NOWAIT
+ * writes that were not plain overwrites. Nothing below tests IOCB_NOWAIT
+ * once the lock is held, so inode_dio_wait(), ext4_journal_start() and
+ * block allocation can presumably still sleep - confirm this is intended.
+ */
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
        ssize_t ret;
+       size_t count;
+       loff_t offset;
+       handle_t *handle;
+       struct inode *inode = file_inode(iocb->ki_filp);
+       bool extend = false, overwrite = false, unaligned_aio = false;
 
-       if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
-               return -EIO;
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+               if (!inode_trylock(inode))
+                       return -EAGAIN;
+       } else {
+               inode_lock(inode);
+       }
+
+       if (!ext4_dio_supported(inode)) {
+               inode_unlock(inode);
+               /*
+                * Fallback to buffered I/O if the inode does not support
+                * direct I/O.
+                */
+               return ext4_buffered_write_iter(iocb, from);
+       }
+
+       ret = ext4_write_checks(iocb, from);
+       if (ret <= 0) {
+               inode_unlock(inode);
+               return ret;
+       }
+
+       /*
+        * Unaligned asynchronous direct I/O must be serialized among each
+        * other as the zeroing of partial blocks of two competing unaligned
+        * asynchronous direct I/O writes can result in data corruption.
+        */
+       offset = iocb->ki_pos;
+       count = iov_iter_count(from);
+       if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+           !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+               unaligned_aio = true;
+               inode_dio_wait(inode);
+       }
+
+       /*
+        * Determine whether the I/O will overwrite allocated and initialized
+        * blocks. If so, check to see whether it is possible to take the
+        * dioread_nolock path.
+        */
+       if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+           ext4_should_dioread_nolock(inode)) {
+               overwrite = true;
+               /* From here on the lock is held shared; see the unlock below. */
+               downgrade_write(&inode->i_rwsem);
+       }
+
+       /*
+        * A write reaching past i_disksize puts the inode on the orphan list
+        * first; ext4_handle_inode_extension() undoes or completes that once
+        * the I/O has finished.
+        */
+       if (offset + count > EXT4_I(inode)->i_disksize) {
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+
+               ret = ext4_orphan_add(handle, inode);
+               if (ret) {
+                       ext4_journal_stop(handle);
+                       goto out;
+               }
+
+               extend = true;
+               ext4_journal_stop(handle);
+       }
+
+       /*
+        * NOTE(review): the last argument is presumably iomap_dio_rw()'s
+        * "wait for completion" flag, forced on for unaligned and extending
+        * writes - confirm against its prototype.
+        */
+       ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
+                          is_sync_kiocb(iocb) || unaligned_aio || extend);
+
+       if (extend)
+               ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+       /* Match the unlock to the lock mode left by downgrade_write(). */
+       if (overwrite)
+               inode_unlock_shared(inode);
+       else
+               inode_unlock(inode);
+
+       /* Bytes still left in @from are written through the page cache. */
+       if (ret >= 0 && iov_iter_count(from)) {
+               ssize_t err;
+               loff_t endbyte;
+
+               offset = iocb->ki_pos;
+               err = ext4_buffered_write_iter(iocb, from);
+               if (err < 0)
+                       return err;
+
+               /*
+                * We need to ensure that the pages within the page cache for
+                * the range covered by this I/O are written to disk and
+                * invalidated. This is an attempt to preserve the expected
+                * direct I/O semantics in the case we fall back to buffered
+                * I/O to complete the I/O request.
+                */
+               ret += err;
+               endbyte = offset + err - 1;
+               err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+                                                  offset, endbyte);
+               if (!err)
+                       invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+                                                offset >> PAGE_SHIFT,
+                                                endbyte >> PAGE_SHIFT);
+       }
+
+       return ret;
+}
 
 #ifdef CONFIG_FS_DAX
-       if (IS_DAX(inode))
-               return ext4_dax_write_iter(iocb, from);
-#endif
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       ssize_t ret;
+       size_t count;
+       loff_t offset;
+       handle_t *handle;
+       bool extend = false;
+       struct inode *inode = file_inode(iocb->ki_filp);
 
        if (!inode_trylock(inode)) {
                if (iocb->ki_flags & IOCB_NOWAIT)
@@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if (ret <= 0)
                goto out;
 
-       /*
-        * Unaligned direct AIO must be serialized among each other as zeroing
-        * of partial blocks of two competing unaligned AIOs can result in data
-        * corruption.
-        */
-       if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-           !is_sync_kiocb(iocb) &&
-           ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
-               unaligned_aio = 1;
-               ext4_unwritten_wait(inode);
-       }
+       offset = iocb->ki_pos;
+       count = iov_iter_count(from);
 
-       iocb->private = &overwrite;
-       /* Check whether we do a DIO overwrite or not */
-       if (o_direct && !unaligned_aio) {
-               if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-                       if (ext4_should_dioread_nolock(inode))
-                               overwrite = 1;
-               } else if (iocb->ki_flags & IOCB_NOWAIT) {
-                       ret = -EAGAIN;
+       if (offset + count > EXT4_I(inode)->i_disksize) {
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
                        goto out;
                }
-       }
 
-       ret = __generic_file_write_iter(iocb, from);
-       /*
-        * Unaligned direct AIO must be the only IO in flight. Otherwise
-        * overlapping aligned IO after unaligned might result in data
-        * corruption.
-        */
-       if (ret == -EIOCBQUEUED && unaligned_aio)
-               ext4_unwritten_wait(inode);
-       inode_unlock(inode);
+               ret = ext4_orphan_add(handle, inode);
+               if (ret) {
+                       ext4_journal_stop(handle);
+                       goto out;
+               }
 
-       if (ret > 0)
-               ret = generic_write_sync(iocb, ret);
+               extend = true;
+               ext4_journal_stop(handle);
+       }
 
-       return ret;
+       ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
 
+       if (extend)
+               ret = ext4_handle_inode_extension(inode, offset, ret, count);
 out:
        inode_unlock(inode);
+       if (ret > 0)
+               ret = generic_write_sync(iocb, ret);
        return ret;
 }
+#endif
+
+/*
+ * Write entry point: fails with -EIO on a forcibly shut down filesystem and
+ * otherwise dispatches to the DAX writer (when CONFIG_FS_DAX is enabled and
+ * the inode is DAX), the direct I/O writer (IOCB_DIRECT) or the buffered
+ * writer, in that order of precedence.
+ */
+static ssize_t
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+       if (unlikely(ext4_forced_shutdown(sbi)))
+               return -EIO;
+
+#ifdef CONFIG_FS_DAX
+       if (IS_DAX(inode))
+               return ext4_dax_write_iter(iocb, from);
+#endif
+       return (iocb->ki_flags & IOCB_DIRECT) ?
+               ext4_dio_write_iter(iocb, from) :
+               ext4_buffered_write_iter(iocb, from);
+}
 
 #ifdef CONFIG_FS_DAX
 static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
@@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
                                                maxbytes, i_size_read(inode));
        case SEEK_HOLE:
                inode_lock_shared(inode);
-               offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+               offset = iomap_seek_hole(inode, offset,
+                                        &ext4_iomap_report_ops);
                inode_unlock_shared(inode);
                break;
        case SEEK_DATA:
                inode_lock_shared(inode);
-               offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+               offset = iomap_seek_data(inode, offset,
+                                        &ext4_iomap_report_ops);
                inode_unlock_shared(inode);
                break;
        }
index 5508baa..e10206e 100644 (file)
@@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode)
        return ret;
 }
 
+/*
+ * fsync helper used when the filesystem has no journal.
+ *
+ * Always calls sync_mapping_buffers(). If the inode has dirty state that
+ * matters for this kind of sync, it also writes the inode metadata and, if
+ * everything succeeded so far, calls ext4_sync_parent(). In that case
+ * *needs_barrier is set to true when the BARRIER mount option is on.
+ *
+ * Returns the first error encountered, or 0.
+ */
+static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
+                               bool *needs_barrier)
+{
+       int ret = sync_mapping_buffers(inode->i_mapping);
+       int err;
+
+       /* Nothing dirty at all, or datasync with no datasync-dirty state. */
+       if (!(inode->i_state & I_DIRTY_ALL) ||
+           (datasync && !(inode->i_state & I_DIRTY_DATASYNC)))
+               return ret;
+
+       err = sync_inode_metadata(inode, 1);
+       if (!ret) {
+               /* First error wins; the parent is synced only on success. */
+               ret = err;
+               if (!ret)
+                       ret = ext4_sync_parent(inode);
+       }
+
+       if (test_opt(inode->i_sb, BARRIER))
+               *needs_barrier = true;
+
+       return ret;
+}
+
+/*
+ * fsync helper used when the filesystem has a journal.
+ *
+ * Picks the inode's datasync or full-sync transaction id according to
+ * @datasync and completes that transaction. *needs_barrier is set to true
+ * when the journal has JBD2_BARRIER set and
+ * jbd2_trans_will_send_data_barrier() returns false for that transaction.
+ *
+ * Returns the result of jbd2_complete_transaction().
+ */
+static int ext4_fsync_journal(struct inode *inode, bool datasync,
+                            bool *needs_barrier)
+{
+       journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       tid_t commit_tid;
+
+       if (datasync)
+               commit_tid = ei->i_datasync_tid;
+       else
+               commit_tid = ei->i_sync_tid;
+
+       if ((journal->j_flags & JBD2_BARRIER) &&
+           !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+               *needs_barrier = true;
+
+       return jbd2_complete_transaction(journal, commit_tid);
+}
+
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode)
  * What we do is just kick off a commit and wait on it.  This will snapshot the
  * inode to disk.
  */
-
 int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        int ret = 0, err;
-       tid_t commit_tid;
        bool needs_barrier = false;
+       struct inode *inode = file->f_mapping->host;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
-       if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+       if (unlikely(ext4_forced_shutdown(sbi)))
                return -EIO;
 
        J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        if (sb_rdonly(inode->i_sb)) {
                /* Make sure that we read updated s_mount_flags value */
                smp_rmb();
-               if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+               if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
                        ret = -EROFS;
                goto out;
        }
 
-       if (!journal) {
-               ret = __generic_file_fsync(file, start, end, datasync);
-               if (!ret)
-                       ret = ext4_sync_parent(inode);
-               if (test_opt(inode->i_sb, BARRIER))
-                       goto issue_flush;
-               goto out;
-       }
-
        ret = file_write_and_wait_range(file, start, end);
        if (ret)
                return ret;
+
        /*
         * data=writeback,ordered:
         *  The caller's filemap_fdatawrite()/wait will sync the data.
@@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         *  (they were dirtied by commit).  But that's OK - the blocks are
         *  safe in-journal, which is all fsync() needs to ensure.
         */
-       if (ext4_should_journal_data(inode)) {
+       if (!sbi->s_journal)
+               ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
+       else if (ext4_should_journal_data(inode))
                ret = ext4_force_commit(inode->i_sb);
-               goto out;
-       }
+       else
+               ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
 
-       commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-       if (journal->j_flags & JBD2_BARRIER &&
-           !jbd2_trans_will_send_data_barrier(journal, commit_tid))
-               needs_barrier = true;
-       ret = jbd2_complete_transaction(journal, commit_tid);
        if (needs_barrier) {
-       issue_flush:
                err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
                if (!ret)
                        ret = err;
index 764ff4c..dc333e8 100644 (file)
@@ -265,13 +265,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        ext4_debug("freeing inode %lu\n", ino);
        trace_ext4_free_inode(inode);
 
-       /*
-        * Note: we must free any quota before locking the superblock,
-        * as writing the quota to disk may need the lock as well.
-        */
        dquot_initialize(inode);
        dquot_free_inode(inode);
-       dquot_drop(inode);
 
        is_directory = S_ISDIR(inode->i_mode);
 
@@ -927,7 +922,7 @@ repeat_in_this_group:
                        BUG_ON(nblocks <= 0);
                        handle = __ext4_journal_start_sb(dir->i_sb, line_no,
                                                         handle_type, nblocks,
-                                                        0);
+                                                        0, 0);
                        if (IS_ERR(handle)) {
                                err = PTR_ERR(handle);
                                ext4_std_error(sb, err);
index 36699a1..3a4ab70 100644 (file)
@@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle,
        for (i = 0; i <= indirect_blks; i++) {
                if (i == indirect_blks) {
                        new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
-               } else
+               } else {
                        ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
                                        ar->inode, ar->goal,
                                        ar->flags & EXT4_MB_DELALLOC_RESERVED,
                                        NULL, &err);
+                       /* Simplify error cleanup... */
+                       branch[i+1].bh = NULL;
+               }
                if (err) {
                        i--;
                        goto failed;
@@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle,
        }
        return 0;
 failed:
+       if (i == indirect_blks) {
+               /* Free data blocks */
+               ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+                                ar->len, 0);
+               i--;
+       }
        for (; i >= 0; i--) {
                /*
                 * We want to ext4_forget() only freshly allocated indirect
-                * blocks.  Buffer for new_blocks[i-1] is at branch[i].bh and
-                * buffer at branch[0].bh is indirect block / inode already
-                * existing before ext4_alloc_branch() was called.
+                * blocks. Buffer for new_blocks[i] is at branch[i+1].bh
+                * (buffer at branch[0].bh is indirect block / inode already
+                * existing before ext4_alloc_branch() was called). Also
+                * because blocks are freshly allocated, we don't need to
+                * revoke them which is why we don't set
+                * EXT4_FREE_BLOCKS_METADATA.
                 */
-               if (i > 0 && i != indirect_blks && branch[i].bh)
-                       ext4_forget(handle, 1, ar->inode, branch[i].bh,
-                                   branch[i].bh->b_blocknr);
-               ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
-                                (i == indirect_blks) ? ar->len : 1, 0);
+               ext4_free_blocks(handle, ar->inode, branch[i+1].bh,
+                                new_blocks[i], 1,
+                                branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0);
        }
        return err;
 }
@@ -689,27 +699,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
        return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
+/*
+ * Preparation step passed to ext4_journal_ensure_credits_fn() by
+ * ext4_ind_truncate_ensure_credits().
+ *
+ * Marks @bh (if any) and the inode dirty in @handle, discards the inode's
+ * preallocations and releases i_data_sem. *dropped is set to 1 only once the
+ * semaphore has been released, so the caller knows whether to take it again.
+ *
+ * Returns 0 on success or the error from the failing dirtying call; on
+ * error i_data_sem is still held and *dropped is left untouched.
+ */
+static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+                                    struct buffer_head *bh, int *dropped)
+{
+       int err;
+
+       if (bh) {
+               BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+               err = ext4_handle_dirty_metadata(handle, inode, bh);
+               if (unlikely(err))
+                       return err;
+       }
+       err = ext4_mark_inode_dirty(handle, inode);
+       if (unlikely(err))
+               return err;
+       /*
+        * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
+        * moment, get_block can be called only for blocks inside i_size since
+        * page cache has been already dropped and writes are blocked by
+        * i_mutex. So we can safely drop the i_data_sem here.
+        */
+       BUG_ON(EXT4_JOURNAL(inode) == NULL);
+       ext4_discard_preallocations(inode);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       *dropped = 1;
+       return 0;
+}
+
 /*
  * Truncate transactions can be complex and absolutely huge.  So we need to
  * be able to restart the transaction at a conventient checkpoint to make
  * sure we don't overflow the journal.
  *
  * Try to extend this transaction for the purposes of truncation.  If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- *
- * Returns 0 if we managed to create more room.  If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * extend fails, we restart transaction.
  */
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_ind_truncate_ensure_credits(handle_t *handle,
+                                           struct inode *inode,
+                                           struct buffer_head *bh,
+                                           int revoke_creds)
 {
-       if (!ext4_handle_valid(handle))
-               return 0;
-       if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
-               return 0;
-       if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
-               return 0;
-       return 1;
+       int ret;
+       int dropped = 0;
+
+       /*
+        * NOTE(review): ext4_journal_ensure_credits_fn() is presumably a
+        * macro that evaluates the ext4_ind_trunc_restart_fn() expression
+        * only when the handle really has to be restarted - confirm in
+        * ext4_jbd2.h.
+        */
+       ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS,
+                       ext4_blocks_for_truncate(inode), revoke_creds,
+                       ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped));
+       /* The restart step released i_data_sem; take it back. */
+       if (dropped)
+               down_write(&EXT4_I(inode)->i_data_sem);
+       /* Errors (< 0) and 0 are returned unchanged. */
+       if (ret <= 0)
+               return ret;
+       /* Positive result: write access to @bh has to be obtained again. */
+       if (bh) {
+               BUFFER_TRACE(bh, "retaking write access");
+               ret = ext4_journal_get_write_access(handle, bh);
+               if (unlikely(ret))
+                       return ret;
+       }
+       return 0;
 }
 
 /*
@@ -844,27 +890,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
                return 1;
        }
 
-       if (try_to_extend_transaction(handle, inode)) {
-               if (bh) {
-                       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                       err = ext4_handle_dirty_metadata(handle, inode, bh);
-                       if (unlikely(err))
-                               goto out_err;
-               }
-               err = ext4_mark_inode_dirty(handle, inode);
-               if (unlikely(err))
-                       goto out_err;
-               err = ext4_truncate_restart_trans(handle, inode,
-                                       ext4_blocks_for_truncate(inode));
-               if (unlikely(err))
-                       goto out_err;
-               if (bh) {
-                       BUFFER_TRACE(bh, "retaking write access");
-                       err = ext4_journal_get_write_access(handle, bh);
-                       if (unlikely(err))
-                               goto out_err;
-               }
-       }
+       err = ext4_ind_truncate_ensure_credits(handle, inode, bh,
+                               ext4_free_data_revoke_credits(inode, count));
+       if (err < 0)
+               goto out_err;
 
        for (p = first; p < last; p++)
                *p = 0;
@@ -1047,11 +1076,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                         */
                        if (ext4_handle_is_aborted(handle))
                                return;
-                       if (try_to_extend_transaction(handle, inode)) {
-                               ext4_mark_inode_dirty(handle, inode);
-                               ext4_truncate_restart_trans(handle, inode,
-                                           ext4_blocks_for_truncate(inode));
-                       }
+                       if (ext4_ind_truncate_ensure_credits(handle, inode,
+                                       NULL,
+                                       ext4_free_metadata_revoke_credits(
+                                                       inode->i_sb, 1)) < 0)
+                               return;
 
                        /*
                         * The forget flag here is critical because if
index 1d880ae..28f28de 100644 (file)
@@ -164,39 +164,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode)
 }
 
 /*
- * Restart the transaction associated with *handle.  This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
-                                int nblocks)
-{
-       int ret;
-
-       /*
-        * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
-        * moment, get_block can be called only for blocks inside i_size since
-        * page cache has been already dropped and writes are blocked by
-        * i_mutex. So we can safely drop the i_data_sem here.
-        */
-       BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       jbd_debug(2, "restarting handle %p\n", handle);
-       up_write(&EXT4_I(inode)->i_data_sem);
-       ret = ext4_journal_restart(handle, nblocks);
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-
-       return ret;
-}
-
-/*
  * Called at the last iput() if i_nlink is zero.
  */
 void ext4_evict_inode(struct inode *inode)
 {
        handle_t *handle;
        int err;
-       int extra_credits = 3;
+       /*
+        * Credits for final inode cleanup and freeing:
+        * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+        * (xattr block freeing), bitmap, group descriptor (inode freeing)
+        */
+       int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
 
        trace_ext4_evict_inode(inode);
@@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode)
        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
 
+       /*
+        * Block bitmap, group descriptor, and inode are accounted in both
+        * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+        */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
-                                ext4_blocks_for_truncate(inode)+extra_credits);
+                        ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
@@ -827,136 +810,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 #define DIO_MAX_BLOCKS 4096
 
 /*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-                               struct buffer_head *bh_result, int flags)
-{
-       int dio_credits;
-       handle_t *handle;
-       int retries = 0;
-       int ret;
-
-       /* Trim mapping request to maximum we can map at once for DIO */
-       if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-               bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-       dio_credits = ext4_chunk_trans_blocks(inode,
-                                     bh_result->b_size >> inode->i_blkbits);
-retry:
-       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
-
-       ret = _ext4_get_block(inode, iblock, bh_result, flags);
-       ext4_journal_stop(handle);
-
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-       return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh, int create)
-{
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       if (!create)
-               return _ext4_get_block(inode, iblock, bh, 0);
-       return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * When doing DIO using unwritten extents, we need io_end to convert
-        * unwritten extents to written on IO completion. We allocate io_end
-        * once we spot unwritten extent and store it in b_private. Generic
-        * DIO code keeps b_private set and furthermore passes the value to
-        * our completion callback in 'private' argument.
-        */
-       if (!ret && buffer_unwritten(bh_result)) {
-               if (!bh_result->b_private) {
-                       ext4_io_end_t *io_end;
-
-                       io_end = ext4_init_io_end(inode, GFP_KERNEL);
-                       if (!io_end)
-                               return -ENOMEM;
-                       bh_result->b_private = io_end;
-                       ext4_set_io_unwritten_flag(inode, io_end);
-               }
-               set_buffer_defer_completion(bh_result);
-       }
-
-       return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * Mark inode as having pending DIO writes to unwritten extents.
-        * ext4_direct_IO_write() checks this flag and converts extents to
-        * written.
-        */
-       if (!ret && buffer_unwritten(bh_result))
-               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
-       return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = _ext4_get_block(inode, iblock, bh_result, 0);
-       /*
-        * Blocks should have been preallocated! ext4_file_write_iter() checks
-        * that.
-        */
-       WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
-       return ret;
-}
-
-
-/*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -2341,6 +2194,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 }
 
 /*
+ * mpage_process_page - update page buffers corresponding to changed extent and
+ *                    may submit fully mapped page for IO
+ *
+ * @mpd                - description of extent to map, on return next extent to map
+ * @page       - page whose buffer heads are scanned
+ * @m_lblk     - logical block mapping.
+ * @m_pblk     - corresponding physical mapping.
+ * @map_bh     - determines on return whether this page requires any further
+ *               mapping or not.
+ * Scan given page buffers corresponding to changed extent and update buffer
+ * state according to new extent state: map delalloc buffers to their physical
+ * location and clear unwritten bits. If the given page is not fully mapped,
+ * update @mpd->map to the next extent in the page that needs mapping and set
+ * @map_bh to true. Returns 0 on success or a negative error code.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+                             ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+                             bool *map_bh)
+{
+       struct buffer_head *head, *bh;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       ext4_lblk_t lblk = *m_lblk;
+       ext4_fsblk_t pblock = *m_pblk;
+       int err = 0;
+       int blkbits = mpd->inode->i_blkbits;
+       ssize_t io_end_size = 0;
+       struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+       bh = head = page_buffers(page);
+       do {
+               if (lblk < mpd->map.m_lblk)
+                       continue;
+               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                       /*
+                        * Buffer after end of mapped extent.
+                        * Find next buffer in the page to map.
+                        */
+                       mpd->map.m_len = 0;
+                       mpd->map.m_flags = 0;
+                       io_end_vec->size += io_end_size;
+
+                       err = mpage_process_page_bufs(mpd, head, bh, lblk);
+                       if (err > 0)
+                               err = 0;
+                       if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+                               io_end_vec = ext4_alloc_io_end_vec(io_end);
+                               if (IS_ERR(io_end_vec)) {
+                                       err = PTR_ERR(io_end_vec);
+                                       goto out;
+                               }
+                               /* Widen first: m_lblk is 32-bit, offset is in bytes */
+                               io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+                       }
+                       *map_bh = true;
+                       goto out;
+               }
+               if (buffer_delay(bh)) {
+                       clear_buffer_delay(bh);
+                       bh->b_blocknr = pblock++;
+               }
+               clear_buffer_unwritten(bh);
+               io_end_size += (1 << blkbits);
+       } while (lblk++, (bh = bh->b_this_page) != head);
+
+       io_end_vec->size += io_end_size;
+       *map_bh = false;
+out:
+       *m_lblk = lblk;
+       *m_pblk = pblock;
+       return err;
+}
+
+/*
  * mpage_map_buffers - update buffers corresponding to changed extent and
  *                    submit fully mapped pages for IO
  *
@@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
        struct pagevec pvec;
        int nr_pages, i;
        struct inode *inode = mpd->inode;
-       struct buffer_head *head, *bh;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
-       sector_t pblock;
+       ext4_fsblk_t pblock;
        int err;
+       bool map_bh = false;
 
        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
-                       bh = head = page_buffers(page);
-                       do {
-                               if (lblk < mpd->map.m_lblk)
-                                       continue;
-                               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-                                       /*
-                                        * Buffer after end of mapped extent.
-                                        * Find next buffer in the page to map.
-                                        */
-                                       mpd->map.m_len = 0;
-                                       mpd->map.m_flags = 0;
-                                       /*
-                                        * FIXME: If dioread_nolock supports
-                                        * blocksize < pagesize, we need to make
-                                        * sure we add size mapped so far to
-                                        * io_end->size as the following call
-                                        * can submit the page for IO.
-                                        */
-                                       err = mpage_process_page_bufs(mpd, head,
-                                                                     bh, lblk);
-                                       pagevec_release(&pvec);
-                                       if (err > 0)
-                                               err = 0;
-                                       return err;
-                               }
-                               if (buffer_delay(bh)) {
-                                       clear_buffer_delay(bh);
-                                       bh->b_blocknr = pblock++;
-                               }
-                               clear_buffer_unwritten(bh);
-                       } while (lblk++, (bh = bh->b_this_page) != head);
-
+                       err = mpage_process_page(mpd, page, &lblk, &pblock,
+                                                &map_bh);
                        /*
-                        * FIXME: This is going to break if dioread_nolock
-                        * supports blocksize < pagesize as we will try to
-                        * convert potentially unmapped parts of inode.
+                        * If map_bh is true, means page may require further bh
+                        * mapping, or maybe the page was submitted for IO.
+                        * So we return to call further extent mapping.
                         */
-                       mpd->io_submit.io_end->size += PAGE_SIZE;
+                       if (err < 0 || map_bh == true)
+                               goto out;
                        /* Page fully mapped - let IO run! */
                        err = mpage_submit_page(mpd, page);
-                       if (err < 0) {
-                               pagevec_release(&pvec);
-                               return err;
-                       }
+                       if (err < 0)
+                               goto out;
                }
                pagevec_release(&pvec);
        }
@@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
+out:
+       pagevec_release(&pvec);
+       return err;
 }
 
 static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle,
        int err;
        loff_t disksize;
        int progress = 0;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       struct ext4_io_end_vec *io_end_vec;
 
-       mpd->io_submit.io_end->offset =
-                               ((loff_t)map->m_lblk) << inode->i_blkbits;
+       io_end_vec = ext4_alloc_io_end_vec(io_end);
+       if (IS_ERR(io_end_vec))
+               return PTR_ERR(io_end_vec);
+       io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
@@ -3406,473 +3308,235 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
        return inode->i_state & I_DIRTY_DATASYNC;
 }
 
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+                          struct ext4_map_blocks *map, loff_t offset,
+                          loff_t length)
 {
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned int blkbits = inode->i_blkbits;
-       unsigned long first_block, last_block;
-       struct ext4_map_blocks map;
-       bool delalloc = false;
-       int ret;
-
-       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
-               return -EINVAL;
-       first_block = offset >> blkbits;
-       last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
-                          EXT4_MAX_LOGICAL_BLOCK);
-
-       if (flags & IOMAP_REPORT) {
-               if (ext4_has_inline_data(inode)) {
-                       ret = ext4_inline_data_iomap(inode, iomap);
-                       if (ret != -EAGAIN) {
-                               if (ret == 0 && offset >= iomap->length)
-                                       ret = -ENOENT;
-                               return ret;
-                       }
-               }
-       } else {
-               if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
-                       return -ERANGE;
-       }
-
-       map.m_lblk = first_block;
-       map.m_len = last_block - first_block + 1;
-
-       if (flags & IOMAP_REPORT) {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-
-               if (ret == 0) {
-                       ext4_lblk_t end = map.m_lblk + map.m_len - 1;
-                       struct extent_status es;
-
-                       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
-                                                 map.m_lblk, end, &es);
-
-                       if (!es.es_len || es.es_lblk > end) {
-                               /* entire range is a hole */
-                       } else if (es.es_lblk > map.m_lblk) {
-                               /* range starts with a hole */
-                               map.m_len = es.es_lblk - map.m_lblk;
-                       } else {
-                               ext4_lblk_t offs = 0;
-
-                               if (es.es_lblk < map.m_lblk)
-                                       offs = map.m_lblk - es.es_lblk;
-                               map.m_lblk = es.es_lblk + offs;
-                               map.m_len = es.es_len - offs;
-                               delalloc = true;
-                       }
-               }
-       } else if (flags & IOMAP_WRITE) {
-               int dio_credits;
-               handle_t *handle;
-               int retries = 0;
-
-               /* Trim mapping request to maximum we can map at once for DIO */
-               if (map.m_len > DIO_MAX_BLOCKS)
-                       map.m_len = DIO_MAX_BLOCKS;
-               dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-retry:
-               /*
-                * Either we allocate blocks and then we don't get unwritten
-                * extent so we have reserved enough credits, or the blocks
-                * are already allocated and unwritten and in that case
-                * extent conversion fits in the credits as well.
-                */
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           dio_credits);
-               if (IS_ERR(handle))
-                       return PTR_ERR(handle);
-
-               ret = ext4_map_blocks(handle, inode, &map,
-                                     EXT4_GET_BLOCKS_CREATE_ZERO);
-               if (ret < 0) {
-                       ext4_journal_stop(handle);
-                       if (ret == -ENOSPC &&
-                           ext4_should_retry_alloc(inode->i_sb, &retries))
-                               goto retry;
-                       return ret;
-               }
-
-               /*
-                * If we added blocks beyond i_size, we need to make sure they
-                * will get truncated if we crash before updating i_size in
-                * ext4_iomap_end(). For faults we don't need to do that (and
-                * even cannot because for orphan list operations inode_lock is
-                * required) - if we happen to instantiate block beyond i_size,
-                * it is because we race with truncate which has already added
-                * the inode to the orphan list.
-                */
-               if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
-                   (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
-                       int err;
-
-                       err = ext4_orphan_add(handle, inode);
-                       if (err < 0) {
-                               ext4_journal_stop(handle);
-                               return err;
-                       }
-               }
-               ext4_journal_stop(handle);
-       } else {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-       }
+       u8 blkbits = inode->i_blkbits;
 
+       /*
+        * Writes that span EOF might trigger an I/O size update on completion,
+        * so consider them to be dirty for the purpose of O_DSYNC, even if
+        * there is no other metadata changes being made or are pending.
+        */
        iomap->flags = 0;
-       if (ext4_inode_datasync_dirty(inode))
+       if (ext4_inode_datasync_dirty(inode) ||
+           offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;
+
+       if (map->m_flags & EXT4_MAP_NEW)
+               iomap->flags |= IOMAP_F_NEW;
+
        iomap->bdev = inode->i_sb->s_bdev;
-       iomap->dax_dev = sbi->s_daxdev;
-       iomap->offset = (u64)first_block << blkbits;
-       iomap->length = (u64)map.m_len << blkbits;
+       iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+       iomap->offset = (u64) map->m_lblk << blkbits;
+       iomap->length = (u64) map->m_len << blkbits;
 
-       if (ret == 0) {
-               iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
-               iomap->addr = IOMAP_NULL_ADDR;
+       /*
+        * Flags passed to ext4_map_blocks() for direct I/O writes can result
+        * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+        * set. In order for any allocated unwritten extents to be converted
+        * into written extents correctly within the ->end_io() handler, we
+        * need to ensure that the iomap->type is set appropriately. Hence, the
+        * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
+        * been set first.
+        */
+       if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+               iomap->type = IOMAP_UNWRITTEN;
+               iomap->addr = (u64) map->m_pblk << blkbits;
+       } else if (map->m_flags & EXT4_MAP_MAPPED) {
+               iomap->type = IOMAP_MAPPED;
+               iomap->addr = (u64) map->m_pblk << blkbits;
        } else {
-               if (map.m_flags & EXT4_MAP_MAPPED) {
-                       iomap->type = IOMAP_MAPPED;
-               } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       iomap->type = IOMAP_UNWRITTEN;
-               } else {
-                       WARN_ON_ONCE(1);
-                       return -EIO;
-               }
-               iomap->addr = (u64)map.m_pblk << blkbits;
+               iomap->type = IOMAP_HOLE;
+               iomap->addr = IOMAP_NULL_ADDR;
        }
-
-       if (map.m_flags & EXT4_MAP_NEW)
-               iomap->flags |= IOMAP_F_NEW;
-
-       return 0;
 }
 
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
-                         ssize_t written, unsigned flags, struct iomap *iomap)
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+                           unsigned int flags)
 {
-       int ret = 0;
        handle_t *handle;
-       int blkbits = inode->i_blkbits;
-       bool truncate = false;
+       u8 blkbits = inode->i_blkbits;
+       int ret, dio_credits, m_flags = 0, retries = 0;
 
-       if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-               return 0;
-
-       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto orphan_del;
-       }
-       if (ext4_update_inode_size(inode, offset + written))
-               ext4_mark_inode_dirty(handle, inode);
        /*
-        * We may need to truncate allocated but not written blocks beyond EOF.
+        * Trim the mapping request to the maximum value that we can map at
+        * once for direct I/O.
         */
-       if (iomap->offset + iomap->length > 
-           ALIGN(inode->i_size, 1 << blkbits)) {
-               ext4_lblk_t written_blk, end_blk;
+       if (map->m_len > DIO_MAX_BLOCKS)
+               map->m_len = DIO_MAX_BLOCKS;
+       dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
 
-               written_blk = (offset + written) >> blkbits;
-               end_blk = (offset + length) >> blkbits;
-               if (written_blk < end_blk && ext4_can_truncate(inode))
-                       truncate = true;
-       }
+retry:
        /*
-        * Remove inode from orphan list if we were extending a inode and
-        * everything went fine.
+        * Either we allocate blocks and then don't get an unwritten extent, so
+        * in that case we have reserved enough credits. Or, the blocks are
+        * already allocated and unwritten. In that case, the extent conversion
+        * fits into the credits as well.
         */
-       if (!truncate && inode->i_nlink &&
-           !list_empty(&EXT4_I(inode)->i_orphan))
-               ext4_orphan_del(handle, inode);
-       ext4_journal_stop(handle);
-       if (truncate) {
-               ext4_truncate_failed_write(inode);
-orphan_del:
-               /*
-                * If truncate failed early the inode might still be on the
-                * orphan list; we need to make sure the inode is removed from
-                * the orphan list in that case.
-                */
-               if (inode->i_nlink)
-                       ext4_orphan_del(NULL, inode);
-       }
-       return ret;
-}
-
-const struct iomap_ops ext4_iomap_ops = {
-       .iomap_begin            = ext4_iomap_begin,
-       .iomap_end              = ext4_iomap_end,
-};
-
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-                           ssize_t size, void *private)
-{
-        ext4_io_end_t *io_end = private;
+       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
 
-       /* if not async direct IO just return */
-       if (!io_end)
-               return 0;
+       /*
+        * DAX and direct I/O are the only two operations that are currently
+        * supported with IOMAP_WRITE.
+        */
+       WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+       if (IS_DAX(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+       /*
+        * Compare against i_size rather than i_disksize: delalloc writeback
+        * may complete mid-I/O and push i_disksize out to i_size, possibly
+        * beyond the direct I/O, exposing allocated blocks to direct I/O
+        * reads. Widen m_lblk before shifting so the byte offset cannot wrap.
+        */
+       else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE;
+       else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
 
-       ext_debug("ext4_end_io_dio(): io_end 0x%p "
-                 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-                 io_end, io_end->inode->i_ino, iocb, offset, size);
+       ret = ext4_map_blocks(handle, inode, map, m_flags);
 
        /*
-        * Error during AIO DIO. We cannot convert unwritten extents as the
-        * data was not written. Just clear the unwritten flag and drop io_end.
+        * We cannot fill holes in indirect tree based inodes as that could
+        * expose stale data in the case of a crash. Use the magic error code
+        * to fallback to buffered I/O.
         */
-       if (size <= 0) {
-               ext4_clear_io_unwritten_flag(io_end);
-               size = 0;
-       }
-       io_end->offset = offset;
-       io_end->size = size;
-       ext4_put_io_end(io_end);
+       if (!m_flags && !ret)
+               ret = -ENOTBLK;
 
-       return 0;
+       ext4_journal_stop(handle);
+       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
+
+       return ret;
 }
 
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       ssize_t ret;
-       loff_t offset = iocb->ki_pos;
-       size_t count = iov_iter_count(iter);
-       int overwrite = 0;
-       get_block_t *get_block_func = NULL;
-       int dio_flags = 0;
-       loff_t final_size = offset + count;
-       int orphan = 0;
-       handle_t *handle;
+       int ret;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
 
-       if (final_size > inode->i_size || final_size > ei->i_disksize) {
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
-               ret = ext4_orphan_add(handle, inode);
-               if (ret) {
-                       ext4_journal_stop(handle);
-                       goto out;
-               }
-               orphan = 1;
-               ext4_update_i_disksize(inode, inode->i_size);
-               ext4_journal_stop(handle);
-       }
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
 
-       BUG_ON(iocb->private == NULL);
+       if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+               return -ERANGE;
 
        /*
-        * Make all waiters for direct IO properly wait also for extent
-        * conversion. This also disallows race between truncate() and
-        * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+        * Calculate the first and last logical blocks respectively.
         */
-       inode_dio_begin(inode);
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
 
-       /* If we do a overwrite dio, i_mutex locking can be released */
-       overwrite = *((int *)iocb->private);
+       if (flags & IOMAP_WRITE)
+               ret = ext4_iomap_alloc(inode, &map, flags);
+       else
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+       if (ret < 0)
+               return ret;
 
-       if (overwrite)
-               inode_unlock(inode);
+       ext4_set_iomap(inode, iomap, &map, offset, length);
 
+       return 0;
+}
+
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+                         ssize_t written, unsigned flags, struct iomap *iomap)
+{
        /*
-        * For extent mapped files we could direct write to holes and fallocate.
-        *
-        * Allocated blocks to fill the hole are marked as unwritten to prevent
-        * parallel buffered read to expose the stale data before DIO complete
-        * the data IO.
-        *
-        * As to previously fallocated extents, ext4 get_block will just simply
-        * mark the buffer mapped but still keep the extents unwritten.
-        *
-        * For non AIO case, we will convert those unwritten extents to written
-        * after return back from blockdev_direct_IO. That way we save us from
-        * allocating io_end structure and also the overhead of offloading
-        * the extent convertion to a workqueue.
-        *
-        * For async DIO, the conversion needs to be deferred when the
-        * IO is completed. The ext4 end_io callback function will be
-        * called to take care of the conversion work.  Here for async
-        * case, we allocate an io_end structure to hook to the iocb.
+        * Check to see whether an error occurred while writing out the data to
+        * the allocated blocks. If so, return the magic error code so that we
+        * fallback to buffered I/O and attempt to complete the remainder of
+        * the I/O. Any blocks that may have been allocated in preparation for
+        * the direct I/O will be reused during buffered I/O.
         */
-       iocb->private = NULL;
-       if (overwrite)
-               get_block_func = ext4_dio_get_block_overwrite;
-       else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-                  round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-               get_block_func = ext4_dio_get_block;
-               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-       } else if (is_sync_kiocb(iocb)) {
-               get_block_func = ext4_dio_get_block_unwritten_sync;
-               dio_flags = DIO_LOCKING;
-       } else {
-               get_block_func = ext4_dio_get_block_unwritten_async;
-               dio_flags = DIO_LOCKING;
-       }
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-                                  get_block_func, ext4_end_io_dio, NULL,
-                                  dio_flags);
+       if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+               return -ENOTBLK;
 
-       if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-                                               EXT4_STATE_DIO_UNWRITTEN)) {
-               int err;
-               /*
-                * for non AIO case, since the IO is already
-                * completed, we could do the conversion right here
-                */
-               err = ext4_convert_unwritten_extents(NULL, inode,
-                                                    offset, ret);
-               if (err < 0)
-                       ret = err;
-               ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-       }
+       return 0;
+}
 
-       inode_dio_end(inode);
-       /* take i_mutex locking again if we do a ovewrite dio */
-       if (overwrite)
-               inode_lock(inode);
+const struct iomap_ops ext4_iomap_ops = {
+       .iomap_begin            = ext4_iomap_begin,
+       .iomap_end              = ext4_iomap_end,
+};
 
-       if (ret < 0 && final_size > inode->i_size)
-               ext4_truncate_failed_write(inode);
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+                                  struct ext4_map_blocks *map)
+{
+       struct extent_status es;
+       ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
 
-       /* Handle extending of i_size after direct IO write */
-       if (orphan) {
-               int err;
+       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+                                 map->m_lblk, end, &es);
 
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       /*
-                        * We wrote the data but cannot extend
-                        * i_size. Bail out. In async io case, we do
-                        * not return error here because we have
-                        * already submmitted the corresponding
-                        * bio. Returning error here makes the caller
-                        * think that this IO is done and failed
-                        * resulting in race with bio's completion
-                        * handler.
-                        */
-                       if (!ret)
-                               ret = PTR_ERR(handle);
-                       if (inode->i_nlink)
-                               ext4_orphan_del(NULL, inode);
+       if (!es.es_len || es.es_lblk > end)
+               return false;
 
-                       goto out;
-               }
-               if (inode->i_nlink)
-                       ext4_orphan_del(handle, inode);
-               if (ret > 0) {
-                       loff_t end = offset + ret;
-                       if (end > inode->i_size || end > ei->i_disksize) {
-                               ext4_update_i_disksize(inode, end);
-                               if (end > inode->i_size)
-                                       i_size_write(inode, end);
-                               /*
-                                * We're going to return a positive `ret'
-                                * here due to non-zero-length I/O, so there's
-                                * no way of reporting error returns from
-                                * ext4_mark_inode_dirty() to userspace.  So
-                                * ignore it.
-                                */
-                               ext4_mark_inode_dirty(handle, inode);
-                       }
-               }
-               err = ext4_journal_stop(handle);
-               if (ret == 0)
-                       ret = err;
+       if (es.es_lblk > map->m_lblk) {
+               map->m_len = es.es_lblk - map->m_lblk;
+               return false;
        }
-out:
-       return ret;
-}
 
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct address_space *mapping = iocb->ki_filp->f_mapping;
-       struct inode *inode = mapping->host;
-       size_t count = iov_iter_count(iter);
-       ssize_t ret;
+       offset = map->m_lblk - es.es_lblk;
+       map->m_len = es.es_len - offset;
 
-       /*
-        * Shared inode_lock is enough for us - it protects against concurrent
-        * writes & truncates and since we take care of writing back page cache,
-        * we are protected against page writeback as well.
-        */
-       inode_lock_shared(inode);
-       ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-                                          iocb->ki_pos + count - 1);
-       if (ret)
-               goto out_unlock;
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-                                  iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
-       inode_unlock_shared(inode);
-       return ret;
+       return true;
 }
 
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+                                  loff_t length, unsigned int flags,
+                                  struct iomap *iomap, struct iomap *srcmap)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       size_t count = iov_iter_count(iter);
-       loff_t offset = iocb->ki_pos;
-       ssize_t ret;
+       int ret;
+       bool delalloc = false;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
 
-#ifdef CONFIG_FS_ENCRYPTION
-       if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
-               return 0;
-#endif
-       if (fsverity_active(inode))
-               return 0;
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
+
+       if (ext4_has_inline_data(inode)) {
+               ret = ext4_inline_data_iomap(inode, iomap);
+               if (ret != -EAGAIN) {
+                       if (ret == 0 && offset >= iomap->length)
+                               ret = -ENOENT;
+                       return ret;
+               }
+       }
 
        /*
-        * If we are doing data journalling we don't support O_DIRECT
+        * Calculate the first and last logical block respectively.
         */
-       if (ext4_should_journal_data(inode))
-               return 0;
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
 
-       /* Let buffer I/O handle the inline data case. */
-       if (ext4_has_inline_data(inode))
-               return 0;
+       ret = ext4_map_blocks(NULL, inode, &map, 0);
+       if (ret < 0)
+               return ret;
+       if (ret == 0)
+               delalloc = ext4_iomap_is_delalloc(inode, &map);
 
-       trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (iov_iter_rw(iter) == READ)
-               ret = ext4_direct_IO_read(iocb, iter);
-       else
-               ret = ext4_direct_IO_write(iocb, iter);
-       trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-       return ret;
+       ext4_set_iomap(inode, iomap, &map, offset, length);
+       if (delalloc && iomap->type == IOMAP_HOLE)
+               iomap->type = IOMAP_DELALLOC;
+
+       return 0;
 }
 
+const struct iomap_ops ext4_iomap_report_ops = {
+       .iomap_begin = ext4_iomap_begin_report,
+};
+
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
@@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = {
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
@@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = {
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_journalled_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
 };
@@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = {
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
@@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 
        offset = inode->i_size & (PAGE_SIZE - 1);
        /*
-        * All buffers in the last page remain valid? Then there's nothing to
-        * do. We do the check mainly to optimize the common PAGE_SIZE ==
-        * blocksize case
+        * If the page is fully truncated, we don't need to wait for any commit
+        * (and we even should not as __ext4_journalled_invalidatepage() may
+        * strip all buffers from the page but keep the page dirty which can then
+        * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+        * buffers). Also we don't need to wait for any commit if all buffers in
+        * the page remain valid. This is most beneficial for the common case of
+        * blocksize == PAGESIZE.
         */
-       if (offset > PAGE_SIZE - i_blocksize(inode))
+       if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
                return;
        while (1) {
                page = find_lock_page(inode->i_mapping,
@@ -5915,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode,
 {
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
+       unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
        int error;
 
+       /* this was checked at iget time, but double check for good measure */
+       if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+           (ei->i_extra_isize & 3)) {
+               EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+                                ei->i_extra_isize,
+                                EXT4_INODE_SIZE(inode->i_sb));
+               return -EFSCORRUPTED;
+       }
+       if ((new_extra_isize < ei->i_extra_isize) ||
+           (new_extra_isize < 4) ||
+           (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+               return -EINVAL; /* Should never happen */
+
        raw_inode = ext4_raw_inode(iloc);
 
        header = IHDR(inode, raw_inode);
@@ -5968,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode,
         * If this is felt to be critical, then e2fsck should be run to
         * force a large enough s_min_extra_isize.
         */
-       if (ext4_handle_valid(handle) &&
-           jbd2_journal_extend(handle,
-                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+       if (ext4_journal_extend(handle,
+                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
                return -ENOSPC;
 
        if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
index b1e4d35..89725fa 100644 (file)
@@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
        needed = ext4_ext_calc_credits_for_single_extent(inode,
                    lb->last_block - lb->first_block + 1, path);
 
-       /*
-        * Make sure the credit we accumalated is not really high
-        */
-       if (needed && ext4_handle_has_enough_credits(handle,
-                                               EXT4_RESERVE_TRANS_BLOCKS)) {
-               up_write((&EXT4_I(inode)->i_data_sem));
-               retval = ext4_journal_restart(handle, needed);
-               down_write((&EXT4_I(inode)->i_data_sem));
-               if (retval)
-                       goto err_out;
-       } else if (needed) {
-               retval = ext4_journal_extend(handle, needed);
-               if (retval) {
-                       /*
-                        * IF not able to extend the journal restart the journal
-                        */
-                       up_write((&EXT4_I(inode)->i_data_sem));
-                       retval = ext4_journal_restart(handle, needed);
-                       down_write((&EXT4_I(inode)->i_data_sem));
-                       if (retval)
-                               goto err_out;
-               }
-       }
+       retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
+       if (retval < 0)
+               goto err_out;
        retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
 err_out:
        up_write((&EXT4_I(inode)->i_data_sem));
@@ -196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
 
 }
 
-static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
-{
-       int retval = 0, needed;
-
-       if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
-               return 0;
-       /*
-        * We are freeing a blocks. During this we touch
-        * superblock, group descriptor and block bitmap.
-        * So allocate a credit of 3. We may update
-        * quota (user and group).
-        */
-       needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
-       if (ext4_journal_extend(handle, needed) != 0)
-               retval = ext4_journal_restart(handle, needed);
-
-       return retval;
-}
-
 static int free_dind_blocks(handle_t *handle,
                                struct inode *inode, __le32 i_data)
 {
        int i;
        __le32 *tmp_idata;
        struct buffer_head *bh;
+       struct super_block *sb = inode->i_sb;
        unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+       int err;
 
-       bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+       bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
 
        tmp_idata = (__le32 *)bh->b_data;
        for (i = 0; i < max_entries; i++) {
                if (tmp_idata[i]) {
-                       extend_credit_for_blkdel(handle, inode);
+                       err = ext4_journal_ensure_credits(handle,
+                               EXT4_RESERVE_TRANS_BLOCKS,
+                               ext4_free_metadata_revoke_credits(sb, 1));
+                       if (err < 0) {
+                               put_bh(bh);
+                               return err;
+                       }
                        ext4_free_blocks(handle, inode, NULL,
                                         le32_to_cpu(tmp_idata[i]), 1,
                                         EXT4_FREE_BLOCKS_METADATA |
@@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle,
                }
        }
        put_bh(bh);
-       extend_credit_for_blkdel(handle, inode);
+       err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+                               ext4_free_metadata_revoke_credits(sb, 1));
+       if (err < 0)
+               return err;
        ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
                         EXT4_FREE_BLOCKS_METADATA |
                         EXT4_FREE_BLOCKS_FORGET);
@@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle,
                }
        }
        put_bh(bh);
-       extend_credit_for_blkdel(handle, inode);
+       retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+                       ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+       if (retval < 0)
+               return retval;
        ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
                         EXT4_FREE_BLOCKS_METADATA |
                         EXT4_FREE_BLOCKS_FORGET);
@@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
 
        /* ei->i_data[EXT4_IND_BLOCK] */
        if (i_data[0]) {
-               extend_credit_for_blkdel(handle, inode);
+               retval = ext4_journal_ensure_credits(handle,
+                       EXT4_RESERVE_TRANS_BLOCKS,
+                       ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+               if (retval < 0)
+                       return retval;
                ext4_free_blocks(handle, inode, NULL,
                                le32_to_cpu(i_data[0]), 1,
                                 EXT4_FREE_BLOCKS_METADATA |
@@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
         * One credit accounted for writing the
         * i_data field of the original inode
         */
-       retval = ext4_journal_extend(handle, 1);
-       if (retval) {
-               retval = ext4_journal_restart(handle, 1);
-               if (retval)
-                       goto err_out;
-       }
+       retval = ext4_journal_ensure_credits(handle, 1, 0);
+       if (retval < 0)
+               goto err_out;
 
        i_data[0] = ei->i_data[EXT4_IND_BLOCK];
        i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
@@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
                ix = EXT_FIRST_INDEX(eh);
                for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
                        retval = free_ext_idx(handle, inode, ix);
-                       if (retval)
-                               break;
+                       if (retval) {
+                               put_bh(bh);
+                               return retval;
+                       }
                }
        }
        put_bh(bh);
-       extend_credit_for_blkdel(handle, inode);
+       retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+                       ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+       if (retval < 0)
+               return retval;
        ext4_free_blocks(handle, inode, NULL, block, 1,
                         EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
-       return retval;
+       return 0;
 }
 
 /*
@@ -574,9 +554,9 @@ err_out:
        }
 
        /* We mark the tmp_inode dirty via ext4_ext_tree_init. */
-       if (ext4_journal_extend(handle, 1) != 0)
-               ext4_journal_restart(handle, 1);
-
+       retval = ext4_journal_ensure_credits(handle, 1, 0);
+       if (retval < 0)
+               goto out_stop;
        /*
         * Mark the tmp_inode as of size zero
         */
@@ -594,6 +574,7 @@ err_out:
 
        /* Reset the extent details */
        ext4_ext_tree_init(handle, tmp_inode);
+out_stop:
        ext4_journal_stop(handle);
 out:
        unlock_new_inode(tmp_inode);
index a427d20..a856997 100644 (file)
@@ -2547,18 +2547,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode)
 }
 
 
+/*
+ * Add non-directory inode to a directory. On success, the inode reference is
+ * consumed by dentry instantiation. This is also indicated by clearing the
+ * *inodep pointer. On failure, the caller is responsible for dropping the
+ * inode reference in a safe context.
+ */
 static int ext4_add_nondir(handle_t *handle,
-               struct dentry *dentry, struct inode *inode)
+               struct dentry *dentry, struct inode **inodep)
 {
+       struct inode *dir = d_inode(dentry->d_parent);
+       struct inode *inode = *inodep;
        int err = ext4_add_entry(handle, dentry, inode);
        if (!err) {
                ext4_mark_inode_dirty(handle, inode);
+               if (IS_DIRSYNC(dir))
+                       ext4_handle_sync(handle);
                d_instantiate_new(dentry, inode);
+               *inodep = NULL;
                return 0;
        }
        drop_nlink(inode);
+       ext4_orphan_add(handle, inode);
        unlock_new_inode(inode);
-       iput(inode);
        return err;
 }
 
@@ -2592,12 +2603,12 @@ retry:
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
-               err = ext4_add_nondir(handle, dentry, inode);
-               if (!err && IS_DIRSYNC(dir))
-                       ext4_handle_sync(handle);
+               err = ext4_add_nondir(handle, dentry, &inode);
        }
        if (handle)
                ext4_journal_stop(handle);
+       if (!IS_ERR_OR_NULL(inode))
+               iput(inode);
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        return err;
@@ -2624,12 +2635,12 @@ retry:
        if (!IS_ERR(inode)) {
                init_special_inode(inode, inode->i_mode, rdev);
                inode->i_op = &ext4_special_inode_operations;
-               err = ext4_add_nondir(handle, dentry, inode);
-               if (!err && IS_DIRSYNC(dir))
-                       ext4_handle_sync(handle);
+               err = ext4_add_nondir(handle, dentry, &inode);
        }
        if (handle)
                ext4_journal_stop(handle);
+       if (!IS_ERR_OR_NULL(inode))
+               iput(inode);
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        return err;
@@ -2779,10 +2790,12 @@ retry:
        if (err) {
 out_clear_inode:
                clear_nlink(inode);
+               ext4_orphan_add(handle, inode);
                unlock_new_inode(inode);
                ext4_mark_inode_dirty(handle, inode);
+               ext4_journal_stop(handle);
                iput(inode);
-               goto out_stop;
+               goto out_retry;
        }
        ext4_inc_count(handle, dir);
        ext4_update_dx_flag(dir);
@@ -2796,6 +2809,7 @@ out_clear_inode:
 out_stop:
        if (handle)
                ext4_journal_stop(handle);
+out_retry:
        if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
                goto retry;
        return err;
@@ -3182,18 +3196,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
        if (IS_DIRSYNC(dir))
                ext4_handle_sync(handle);
 
-       if (inode->i_nlink == 0) {
-               ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
-                                  dentry->d_name.len, dentry->d_name.name);
-               set_nlink(inode, 1);
-       }
        retval = ext4_delete_entry(handle, dir, de, bh);
        if (retval)
                goto end_unlink;
        dir->i_ctime = dir->i_mtime = current_time(dir);
        ext4_update_dx_flag(dir);
        ext4_mark_inode_dirty(handle, dir);
-       drop_nlink(inode);
+       if (inode->i_nlink == 0)
+               ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+                                  dentry->d_name.len, dentry->d_name.name);
+       else
+               drop_nlink(inode);
        if (!inode->i_nlink)
                ext4_orphan_add(handle, inode);
        inode->i_ctime = current_time(inode);
@@ -3328,12 +3341,11 @@ static int ext4_symlink(struct inode *dir,
                inode->i_size = disk_link.len - 1;
        }
        EXT4_I(inode)->i_disksize = inode->i_size;
-       err = ext4_add_nondir(handle, dentry, inode);
-       if (!err && IS_DIRSYNC(dir))
-               ext4_handle_sync(handle);
-
+       err = ext4_add_nondir(handle, dentry, &inode);
        if (handle)
                ext4_journal_stop(handle);
+       if (inode)
+               iput(inode);
        goto out_free_encrypted_link;
 
 err_drop_inode:
index 12ceade..24aeedb 100644 (file)
 #include "acl.h"
 
 static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
 
 int __init ext4_init_pageio(void)
 {
        io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
        if (io_end_cachep == NULL)
                return -ENOMEM;
+
+       io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+       if (io_end_vec_cachep == NULL) {
+               kmem_cache_destroy(io_end_cachep);
+               return -ENOMEM;
+       }
        return 0;
 }
 
 void ext4_exit_pageio(void)
 {
        kmem_cache_destroy(io_end_cachep);
+       kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+       struct ext4_io_end_vec *io_end_vec;
+
+       io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+       if (!io_end_vec)
+               return ERR_PTR(-ENOMEM);
+       INIT_LIST_HEAD(&io_end_vec->list);
+       list_add_tail(&io_end_vec->list, &io_end->list_vec);
+       return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+       struct ext4_io_end_vec *io_end_vec, *tmp;
+
+       if (list_empty(&io_end->list_vec))
+               return;
+       list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+               list_del(&io_end_vec->list);
+               kmem_cache_free(io_end_vec_cachep, io_end_vec);
+       }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+       BUG_ON(list_empty(&io_end->list_vec));
+       return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
 }
 
 /*
@@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
                ext4_finish_bio(bio);
                bio_put(bio);
        }
+       ext4_free_io_end_vec(io_end);
        kmem_cache_free(io_end_cachep, io_end);
 }
 
@@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
  * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
  * completed (happens from ext4_free_ioend()).
  */
-static int ext4_end_io(ext4_io_end_t *io)
+static int ext4_end_io_end(ext4_io_end_t *io_end)
 {
-       struct inode *inode = io->inode;
-       loff_t offset = io->offset;
-       ssize_t size = io->size;
-       handle_t *handle = io->handle;
+       struct inode *inode = io_end->inode;
+       handle_t *handle = io_end->handle;
        int ret = 0;
 
-       ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+       ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
                   "list->prev 0x%p\n",
-                  io, inode->i_ino, io->list.next, io->list.prev);
+                  io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
 
-       io->handle = NULL;      /* Following call will use up the handle */
-       ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+       io_end->handle = NULL;  /* Following call will use up the handle */
+       ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
        if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
                ext4_msg(inode->i_sb, KERN_EMERG,
                         "failed to convert unwritten extents to written "
                         "extents -- potential data loss!  "
-                        "(inode %lu, offset %llu, size %zd, error %d)",
-                        inode->i_ino, offset, size, ret);
+                        "(inode %lu, error %d)", inode->i_ino, ret);
        }
-       ext4_clear_io_unwritten_flag(io);
-       ext4_release_io_end(io);
+       ext4_clear_io_unwritten_flag(io_end);
+       ext4_release_io_end(io_end);
        return ret;
 }
 
@@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
 {
 #ifdef EXT4FS_DEBUG
        struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
+       ext4_io_end_t *io_end, *io_end0, *io_end1;
 
        if (list_empty(head))
                return;
 
        ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
-       list_for_each_entry(io, head, list) {
-               cur = &io->list;
+       list_for_each_entry(io_end, head, list) {
+               cur = &io_end->list;
                before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
+               io_end0 = container_of(before, ext4_io_end_t, list);
                after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
+               io_end1 = container_of(after, ext4_io_end_t, list);
 
                ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
+                           io_end, inode->i_ino, io_end0, io_end1);
        }
 #endif
 }
@@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
 static int ext4_do_flush_completed_IO(struct inode *inode,
                                      struct list_head *head)
 {
-       ext4_io_end_t *io;
+       ext4_io_end_t *io_end;
        struct list_head unwritten;
        unsigned long flags;
        struct ext4_inode_info *ei = EXT4_I(inode);
@@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
        spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 
        while (!list_empty(&unwritten)) {
-               io = list_entry(unwritten.next, ext4_io_end_t, list);
-               BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
-               list_del_init(&io->list);
+               io_end = list_entry(unwritten.next, ext4_io_end_t, list);
+               BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+               list_del_init(&io_end->list);
 
-               err = ext4_end_io(io);
+               err = ext4_end_io_end(io_end);
                if (unlikely(!ret && err))
                        ret = err;
        }
@@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work)
 
 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 {
-       ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
-       if (io) {
-               io->inode = inode;
-               INIT_LIST_HEAD(&io->list);
-               atomic_set(&io->count, 1);
+       ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+       if (io_end) {
+               io_end->inode = inode;
+               INIT_LIST_HEAD(&io_end->list);
+               INIT_LIST_HEAD(&io_end->list_vec);
+               atomic_set(&io_end->count, 1);
        }
-       return io;
+       return io_end;
 }
 
 void ext4_put_io_end_defer(ext4_io_end_t *io_end)
 {
        if (atomic_dec_and_test(&io_end->count)) {
-               if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+               if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+                               list_empty(&io_end->list_vec)) {
                        ext4_release_io_end(io_end);
                        return;
                }
@@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
 
        if (atomic_dec_and_test(&io_end->count)) {
                if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
-                       err = ext4_convert_unwritten_extents(io_end->handle,
-                                               io_end->inode, io_end->offset,
-                                               io_end->size);
+                       err = ext4_convert_unwritten_io_end_vec(io_end->handle,
+                                                               io_end);
                        io_end->handle = NULL;
                        ext4_clear_io_unwritten_flag(io_end);
                }
@@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio)
                struct inode *inode = io_end->inode;
 
                ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
-                            "(offset %llu size %ld starting block %llu)",
+                            "starting block %llu)",
                             bio->bi_status, inode->i_ino,
-                            (unsigned long long) io_end->offset,
-                            (long) io_end->size,
                             (unsigned long long)
                             bi_sector >> (inode->i_blkbits - 9));
                mapping_set_error(inode->i_mapping,
@@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io,
        io->io_end = NULL;
 }
 
-static int io_submit_init_bio(struct ext4_io_submit *io,
-                             struct buffer_head *bh)
+static void io_submit_init_bio(struct ext4_io_submit *io,
+                              struct buffer_head *bh)
 {
        struct bio *bio;
 
+       /*
+        * bio_alloc will _always_ be able to allocate a bio if
+        * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
+        */
        bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
-       if (!bio)
-               return -ENOMEM;
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio_set_dev(bio, bh->b_bdev);
        bio->bi_end_io = ext4_end_bio;
@@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
        io->io_bio = bio;
        io->io_next_block = bh->b_blocknr;
        wbc_init_bio(io->io_wbc, bio);
-       return 0;
 }
 
-static int io_submit_add_bh(struct ext4_io_submit *io,
-                           struct inode *inode,
-                           struct page *page,
-                           struct buffer_head *bh)
+static void io_submit_add_bh(struct ext4_io_submit *io,
+                            struct inode *inode,
+                            struct page *page,
+                            struct buffer_head *bh)
 {
        int ret;
 
@@ -388,9 +425,7 @@ submit_and_retry:
                ext4_io_submit(io);
        }
        if (io->io_bio == NULL) {
-               ret = io_submit_init_bio(io, bh);
-               if (ret)
-                       return ret;
+               io_submit_init_bio(io, bh);
                io->io_bio->bi_write_hint = inode->i_write_hint;
        }
        ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
@@ -398,7 +433,6 @@ submit_and_retry:
                goto submit_and_retry;
        wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
        io->io_next_block++;
-       return 0;
 }
 
 int ext4_bio_write_page(struct ext4_io_submit *io,
@@ -491,8 +525,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
                                gfp_flags |= __GFP_NOFAIL;
                                goto retry_encrypt;
                        }
-                       bounce_page = NULL;
-                       goto out;
+
+                       printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+                       redirty_page_for_writepage(wbc, page);
+                       do {
+                               clear_buffer_async_write(bh);
+                               bh = bh->b_this_page;
+                       } while (bh != head);
+                       goto unlock;
                }
        }
 
@@ -500,30 +540,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
        do {
                if (!buffer_async_write(bh))
                        continue;
-               ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
-               if (ret) {
-                       /*
-                        * We only get here on ENOMEM.  Not much else
-                        * we can do but mark the page as dirty, and
-                        * better luck next time.
-                        */
-                       break;
-               }
+               io_submit_add_bh(io, inode,
+                                bounce_page ? bounce_page : page, bh);
                nr_submitted++;
                clear_buffer_dirty(bh);
        } while ((bh = bh->b_this_page) != head);
 
-       /* Error stopped previous loop? Clean up buffers... */
-       if (ret) {
-       out:
-               fscrypt_free_bounce_page(bounce_page);
-               printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
-               redirty_page_for_writepage(wbc, page);
-               do {
-                       clear_buffer_async_write(bh);
-                       bh = bh->b_this_page;
-               } while (bh != head);
-       }
+unlock:
        unlock_page(page);
        /* Nothing submitted - we have to end page writeback */
        if (!nr_submitted)
index a30b203..fef7755 100644 (file)
@@ -360,10 +360,12 @@ int ext4_mpage_readpages(struct address_space *mapping,
                if (bio == NULL) {
                        struct bio_post_read_ctx *ctx;
 
+                       /*
+                        * bio_alloc will _always_ be able to allocate a bio if
+                        * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
+                        */
                        bio = bio_alloc(GFP_KERNEL,
                                min_t(int, nr_pages, BIO_MAX_PAGES));
-                       if (!bio)
-                               goto set_error_page;
                        ctx = get_bio_post_read_ctx(inode, bio, page->index);
                        if (IS_ERR(ctx)) {
                                bio_put(bio);
index c0e9aef..a8c0f2b 100644 (file)
@@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
        return bh;
 }
 
-/*
- * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA.
- * If that fails, restart the transaction & regain write access for the
- * buffer head which is used for block_bitmap modifications.
- */
-static int extend_or_restart_transaction(handle_t *handle, int thresh)
+static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits)
 {
-       int err;
-
-       if (ext4_handle_has_enough_credits(handle, thresh))
-               return 0;
-
-       err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
-       if (err < 0)
-               return err;
-       if (err) {
-               err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
-               if (err)
-                       return err;
-       }
-
-       return 0;
+       return ext4_journal_ensure_credits_fn(handle, credits,
+               EXT4_MAX_TRANS_DATA, 0, 0);
 }
 
 /*
@@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
                        continue;
                }
 
-               err = extend_or_restart_transaction(handle, 1);
-               if (err)
+               err = ext4_resize_ensure_credits_batch(handle, 1);
+               if (err < 0)
                        return err;
 
                bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
@@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
                        struct buffer_head *gdb;
 
                        ext4_debug("update backup group %#04llx\n", block);
-                       err = extend_or_restart_transaction(handle, 1);
-                       if (err)
+                       err = ext4_resize_ensure_credits_batch(handle, 1);
+                       if (err < 0)
                                goto out;
 
                        gdb = sb_getblk(sb, block);
@@ -602,8 +584,8 @@ handle_bb:
 
                /* Initialize block bitmap of the @group */
                block = group_data[i].block_bitmap;
-               err = extend_or_restart_transaction(handle, 1);
-               if (err)
+               err = ext4_resize_ensure_credits_batch(handle, 1);
+               if (err < 0)
                        goto out;
 
                bh = bclean(handle, sb, block);
@@ -631,8 +613,8 @@ handle_ib:
 
                /* Initialize inode bitmap of the @group */
                block = group_data[i].inode_bitmap;
-               err = extend_or_restart_transaction(handle, 1);
-               if (err)
+               err = ext4_resize_ensure_credits_batch(handle, 1);
+               if (err < 0)
                        goto out;
                /* Mark unused entries in inode bitmap used */
                bh = bclean(handle, sb, block);
@@ -1109,10 +1091,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
                ext4_fsblk_t backup_block;
 
                /* Out of journal space, and can't get more - abort - so sad */
-               if (ext4_handle_valid(handle) &&
-                   handle->h_buffer_credits == 0 &&
-                   ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
-                   (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+               err = ext4_resize_ensure_credits_batch(handle, 1);
+               if (err < 0)
                        break;
 
                if (meta_bg == 0)
index b3cbf86..c540199 100644 (file)
@@ -1172,9 +1172,9 @@ void ext4_clear_inode(struct inode *inode)
 {
        invalidate_inode_buffers(inode);
        clear_inode(inode);
-       dquot_drop(inode);
        ext4_discard_preallocations(inode);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+       dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
                                               EXT4_I(inode)->jinode);
@@ -1388,7 +1388,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags);
 static int ext4_enable_quotas(struct super_block *sb);
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
 
 static struct dquot **ext4_get_dquots(struct inode *inode)
 {
@@ -1406,7 +1405,7 @@ static const struct dquot_operations ext4_quota_operations = {
        .destroy_dquot          = dquot_destroy,
        .get_projid             = ext4_get_projid,
        .get_inode_usage        = ext4_get_inode_usage,
-       .get_next_id            = ext4_get_next_id,
+       .get_next_id            = dquot_get_next_id,
 };
 
 static const struct quotactl_ops ext4_qctl_operations = {
@@ -2065,7 +2064,7 @@ static int parse_options(char *options, struct super_block *sb,
                         unsigned int *journal_ioprio,
                         int is_remount)
 {
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
        char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
        substring_t args[MAX_OPT_ARGS];
        int token;
@@ -2119,16 +2118,6 @@ static int parse_options(char *options, struct super_block *sb,
                }
        }
 #endif
-       if (test_opt(sb, DIOREAD_NOLOCK)) {
-               int blocksize =
-                       BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-
-               if (blocksize < PAGE_SIZE) {
-                       ext4_msg(sb, KERN_ERR, "can't mount with "
-                                "dioread_nolock if block size != PAGE_SIZE");
-                       return 0;
-               }
-       }
        return 1;
 }
 
@@ -3569,12 +3558,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
+       unsigned def_extra_isize = sizeof(struct ext4_inode) -
+                                               EXT4_GOOD_OLD_INODE_SIZE;
 
-       /* determine the minimum size of new large inodes, if present */
-       if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
-           sbi->s_want_extra_isize == 0) {
-               sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-                                                    EXT4_GOOD_OLD_INODE_SIZE;
+       if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+               sbi->s_want_extra_isize = 0;
+               return;
+       }
+       if (sbi->s_want_extra_isize < 4) {
+               sbi->s_want_extra_isize = def_extra_isize;
                if (ext4_has_feature_extra_isize(sb)) {
                        if (sbi->s_want_extra_isize <
                            le16_to_cpu(es->s_want_extra_isize))
@@ -3587,10 +3579,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
                }
        }
        /* Check if enough inode space is available */
-       if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
-                                                       sbi->s_inode_size) {
-               sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-                                                      EXT4_GOOD_OLD_INODE_SIZE;
+       if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+           (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+                                                       sbi->s_inode_size)) {
+               sbi->s_want_extra_isize = def_extra_isize;
                ext4_msg(sb, KERN_INFO,
                         "required extra inode space not available");
        }
@@ -4453,13 +4445,6 @@ no_journal:
                }
        }
 
-       if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
-           (blocksize != PAGE_SIZE)) {
-               ext4_msg(sb, KERN_ERR,
-                        "Unsupported blocksize for fs encryption");
-               goto failed_mount_wq;
-       }
-
        if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
                ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
                goto failed_mount_wq;
@@ -6033,18 +6018,6 @@ out:
        }
        return len;
 }
-
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
-{
-       const struct quota_format_ops   *ops;
-
-       if (!sb_has_quota_loaded(sb, qid->type))
-               return -ESRCH;
-       ops = sb_dqopt(sb)->ops[qid->type];
-       if (!ops || !ops->get_next_id)
-               return -ENOSYS;
-       return dquot_get_next_id(sb, qid);
-}
 #endif
 
 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
index 491f9ee..8966a54 100644 (file)
@@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
        return credits;
 }
 
-static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
-                                    int credits, struct buffer_head *bh,
-                                    bool dirty, bool block_csum)
-{
-       int error;
-
-       if (!ext4_handle_valid(handle))
-               return 0;
-
-       if (handle->h_buffer_credits >= credits)
-               return 0;
-
-       error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
-       if (!error)
-               return 0;
-       if (error < 0) {
-               ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
-               return error;
-       }
-
-       if (bh && dirty) {
-               if (block_csum)
-                       ext4_xattr_block_csum_set(inode, bh);
-               error = ext4_handle_dirty_metadata(handle, NULL, bh);
-               if (error) {
-                       ext4_warning(inode->i_sb, "Handle metadata (error %d)",
-                                    error);
-                       return error;
-               }
-       }
-
-       error = ext4_journal_restart(handle, credits);
-       if (error) {
-               ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
-               return error;
-       }
-
-       if (bh) {
-               error = ext4_journal_get_write_access(handle, bh);
-               if (error) {
-                       ext4_warning(inode->i_sb,
-                                    "Get write access failed (error %d)",
-                                    error);
-                       return error;
-               }
-       }
-       return 0;
-}
-
 static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
                                       int ref_change)
 {
@@ -1149,6 +1100,24 @@ cleanup:
        return saved_err;
 }
 
+static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
+                       struct buffer_head *bh, bool block_csum, bool dirty)
+{
+       int error;
+
+       if (bh && dirty) {
+               if (block_csum)
+                       ext4_xattr_block_csum_set(inode, bh);
+               error = ext4_handle_dirty_metadata(handle, NULL, bh);
+               if (error) {
+                       ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+                                    error);
+                       return error;
+               }
+       }
+       return 0;
+}
+
 static void
 ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
                             struct buffer_head *bh,
@@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
                        continue;
                }
 
-               err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
-                                               dirty, block_csum);
-               if (err) {
+               err = ext4_journal_ensure_credits_fn(handle, credits, credits,
+                       ext4_free_metadata_revoke_credits(parent->i_sb, 1),
+                       ext4_xattr_restart_fn(handle, parent, bh, block_csum,
+                                             dirty));
+               if (err < 0) {
                        ext4_warning_inode(ea_inode, "Ensure credits err=%d",
                                           err);
                        continue;
                }
+               if (err > 0) {
+                       err = ext4_journal_get_write_access(handle, bh);
+                       if (err) {
+                               ext4_warning_inode(ea_inode,
+                                               "Re-get write access err=%d",
+                                               err);
+                               continue;
+                       }
+               }
 
                err = ext4_xattr_inode_dec_ref(handle, ea_inode);
                if (err) {
@@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
                                                   flags & XATTR_CREATE);
                brelse(bh);
 
-               if (!ext4_handle_has_enough_credits(handle, credits)) {
+               if (jbd2_handle_buffer_credits(handle) < credits) {
                        error = -ENOSPC;
                        goto cleanup;
                }
@@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
        struct inode *ea_inode;
        int error;
 
-       error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
-                                         NULL /* bh */,
-                                         false /* dirty */,
-                                         false /* block_csum */);
-       if (error) {
+       error = ext4_journal_ensure_credits(handle, extra_credits,
+                       ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+       if (error < 0) {
                EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
                goto cleanup;
        }
index a190906..8fff667 100644 (file)
@@ -110,7 +110,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
        int nblocks, space_left;
        /* assert_spin_locked(&journal->j_state_lock); */
 
-       nblocks = jbd2_space_needed(journal);
+       nblocks = journal->j_max_transaction_buffers;
        while (jbd2_log_space_left(journal) < nblocks) {
                write_unlock(&journal->j_state_lock);
                mutex_lock_io(&journal->j_checkpoint_mutex);
index 132fb92..7f0b362 100644 (file)
@@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);
 
-                       jbd_lock_bh_state(bh);
+                       spin_lock(&jh->b_state_lock);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }
@@ -560,8 +560,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        stats.run.rs_logging = jiffies;
        stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
                                               stats.run.rs_logging);
-       stats.run.rs_blocks =
-               atomic_read(&commit_transaction->t_outstanding_credits);
+       stats.run.rs_blocks = commit_transaction->t_nr_buffers;
        stats.run.rs_blocks_logged = 0;
 
        J_ASSERT(commit_transaction->t_nr_buffers <=
@@ -642,8 +641,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
                /*
                 * start_this_handle() uses t_outstanding_credits to determine
-                * the free space in the log, but this counter is changed
-                * by jbd2_journal_next_log_block() also.
+                * the free space in the log.
                 */
                atomic_dec(&commit_transaction->t_outstanding_credits);
 
@@ -727,7 +725,6 @@ start_journal_io:
                                submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
                        }
                        cond_resched();
-                       stats.run.rs_blocks_logged += bufs;
 
                        /* Force a new descriptor to be generated next
                            time round the loop. */
@@ -814,6 +811,7 @@ start_journal_io:
                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;
                jbd2_unfile_log_bh(bh);
+               stats.run.rs_blocks_logged++;
 
                /*
                 * The list contains temporary buffer heads created by
@@ -859,6 +857,7 @@ start_journal_io:
                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_unfile_log_bh(bh);
+               stats.run.rs_blocks_logged++;
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }
@@ -880,6 +879,7 @@ start_journal_io:
        }
        if (cbh)
                err = journal_wait_on_commit_record(journal, cbh);
+       stats.run.rs_blocks_logged++;
        if (jbd2_has_feature_async_commit(journal) &&
            journal->j_flags & JBD2_BARRIER) {
                blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
@@ -888,6 +888,9 @@ start_journal_io:
        if (err)
                jbd2_journal_abort(journal, err);
 
+       WARN_ON_ONCE(
+               atomic_read(&commit_transaction->t_outstanding_credits) < 0);
+
        /*
         * Now disk caches for filesystem device are flushed so we are safe to
         * erase checkpointed transactions from the log by updating journal
@@ -918,6 +921,7 @@ restart_loop:
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                int try_to_free = 0;
+               bool drop_ref;
 
                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
@@ -927,7 +931,7 @@ restart_loop:
                 * done with it.
                 */
                get_bh(bh);
-               jbd_lock_bh_state(bh);
+               spin_lock(&jh->b_state_lock);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 
                /*
@@ -1022,8 +1026,10 @@ restart_loop:
                                try_to_free = 1;
                }
                JBUFFER_TRACE(jh, "refile or unfile buffer");
-               __jbd2_journal_refile_buffer(jh);
-               jbd_unlock_bh_state(bh);
+               drop_ref = __jbd2_journal_refile_buffer(jh);
+               spin_unlock(&jh->b_state_lock);
+               if (drop_ref)
+                       jbd2_journal_put_journal_head(jh);
                if (try_to_free)
                        release_buffer_page(bh);        /* Drops bh reference */
                else
index 1c58859..5e408ee 100644 (file)
@@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
        /* keep subsequent assertions sane */
        atomic_set(&new_bh->b_count, 1);
 
-       jbd_lock_bh_state(bh_in);
+       spin_lock(&jh_in->b_state_lock);
 repeat:
        /*
         * If a new transaction has already done a buffer copy-out, then
@@ -405,13 +405,13 @@ repeat:
        if (need_copy_out && !done_copy_out) {
                char *tmp;
 
-               jbd_unlock_bh_state(bh_in);
+               spin_unlock(&jh_in->b_state_lock);
                tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
                if (!tmp) {
                        brelse(new_bh);
                        return -ENOMEM;
                }
-               jbd_lock_bh_state(bh_in);
+               spin_lock(&jh_in->b_state_lock);
                if (jh_in->b_frozen_data) {
                        jbd2_free(tmp, bh_in->b_size);
                        goto repeat;
@@ -464,7 +464,7 @@ repeat:
        __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
        spin_unlock(&journal->j_list_lock);
        set_buffer_shadow(bh_in);
-       jbd_unlock_bh_state(bh_in);
+       spin_unlock(&jh_in->b_state_lock);
 
        return do_escape | (done_copy_out << 1);
 }
@@ -840,6 +840,7 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
        bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
        if (!bh)
                return NULL;
+       atomic_dec(&transaction->t_outstanding_credits);
        lock_buffer(bh);
        memset(bh->b_data, 0, journal->j_blocksize);
        header = (journal_header_t *)bh->b_data;
@@ -1098,6 +1099,16 @@ static void jbd2_stats_proc_exit(journal_t *journal)
        remove_proc_entry(journal->j_devname, proc_jbd2_stats);
 }
 
+/* Minimum size of descriptor tag */
+static int jbd2_min_tag_size(void)
+{
+       /*
+        * Tag with 32-bit block numbers does not use last four bytes of the
+        * structure
+        */
+       return sizeof(journal_block_tag_t) - 4;
+}
+
 /*
  * Management for journal control blocks: functions to create and
  * destroy journal_t structures, and to initialise and read existing
@@ -1156,7 +1167,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
        journal->j_fs_dev = fs_dev;
        journal->j_blk_offset = start;
        journal->j_maxlen = len;
-       n = journal->j_blocksize / sizeof(journal_block_tag_t);
+       /* We need enough buffers to write out full descriptor block. */
+       n = journal->j_blocksize / jbd2_min_tag_size();
        journal->j_wbufsize = n;
        journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
                                        GFP_KERNEL);
@@ -1488,6 +1500,21 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 }
 EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 
+static int journal_revoke_records_per_block(journal_t *journal)
+{
+       int record_size;
+       int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
+
+       if (jbd2_has_feature_64bit(journal))
+               record_size = 8;
+       else
+               record_size = 4;
+
+       if (jbd2_journal_has_csum_v2or3(journal))
+               space -= sizeof(struct jbd2_journal_block_tail);
+       return space / record_size;
+}
+
 /*
  * Read the superblock for a given journal, performing initial
  * validation of the format.
@@ -1596,6 +1623,8 @@ static int journal_get_superblock(journal_t *journal)
                                                   sizeof(sb->s_uuid));
        }
 
+       journal->j_revoke_records_per_block =
+                               journal_revoke_records_per_block(journal);
        set_buffer_verified(bh);
 
        return 0;
@@ -1916,6 +1945,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
        sb->s_feature_ro_compat |= cpu_to_be32(ro);
        sb->s_feature_incompat  |= cpu_to_be32(incompat);
        unlock_buffer(journal->j_sb_buffer);
+       journal->j_revoke_records_per_block =
+                               journal_revoke_records_per_block(journal);
 
        return 1;
 #undef COMPAT_FEATURE_ON
@@ -1946,6 +1977,8 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
        sb->s_feature_compat    &= ~cpu_to_be32(compat);
        sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
        sb->s_feature_incompat  &= ~cpu_to_be32(incompat);
+       journal->j_revoke_records_per_block =
+                               journal_revoke_records_per_block(journal);
 }
 EXPORT_SYMBOL(jbd2_journal_clear_features);
 
@@ -2410,6 +2443,8 @@ static struct journal_head *journal_alloc_journal_head(void)
                ret = kmem_cache_zalloc(jbd2_journal_head_cache,
                                GFP_NOFS | __GFP_NOFAIL);
        }
+       if (ret)
+               spin_lock_init(&ret->b_state_lock);
        return ret;
 }
 
@@ -2529,17 +2564,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
        J_ASSERT_BH(bh, buffer_jbd(bh));
        J_ASSERT_BH(bh, jh2bh(jh) == bh);
        BUFFER_TRACE(bh, "remove journal_head");
+
+       /* Unlink before dropping the lock */
+       bh->b_private = NULL;
+       jh->b_bh = NULL;        /* debug, really */
+       clear_buffer_jbd(bh);
+}
+
+static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+{
        if (jh->b_frozen_data) {
                printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
-               jbd2_free(jh->b_frozen_data, bh->b_size);
+               jbd2_free(jh->b_frozen_data, b_size);
        }
        if (jh->b_committed_data) {
                printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
-               jbd2_free(jh->b_committed_data, bh->b_size);
+               jbd2_free(jh->b_committed_data, b_size);
        }
-       bh->b_private = NULL;
-       jh->b_bh = NULL;        /* debug, really */
-       clear_buffer_jbd(bh);
        journal_free_journal_head(jh);
 }
 
@@ -2557,9 +2598,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
        if (!jh->b_jcount) {
                __journal_remove_journal_head(bh);
                jbd_unlock_bh_journal_head(bh);
+               journal_release_journal_head(jh, bh->b_size);
                __brelse(bh);
-       } else
+       } else {
                jbd_unlock_bh_journal_head(bh);
+       }
 }
 
 /*
index f08073d..fa60878 100644 (file)
@@ -371,6 +371,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
        }
 #endif
 
+       if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) {
+               if (!bh_in)
+                       brelse(bh);
+               return -EIO;
+       }
        /* We really ought not ever to revoke twice in a row without
            first having the revoke cancelled: it's illegal to free a
            block twice without allocating it in between! */
@@ -391,6 +396,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
                        __brelse(bh);
                }
        }
+       handle->h_revoke_credits--;
 
        jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
        err = insert_revoke_hash(journal, blocknr,
index b25ebdc..27b9f9d 100644 (file)
@@ -63,6 +63,28 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
 }
 
 /*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+       int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+       int tags_per_block;     /* how many block tags fit into tag_space */
+
+       /* Subtract the 16 bytes taken by the UUID */
+       tag_space -= 16;
+       if (jbd2_journal_has_csum_v2or3(journal))
+               tag_space -= sizeof(struct jbd2_journal_block_tail);
+       /* Commit code leaves a slack space of 16 bytes at the end of block */
+       tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+       /*
+        * Revoke descriptors are accounted separately so we need to reserve
+        * space for commit block and normal transaction descriptor blocks.
+        */
+       return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
+                               tags_per_block);
+}
+
+/*
  * jbd2_get_transaction: obtain a new transaction_t object.
  *
  * Simply initialise a new transaction. Initialize it in
@@ -88,7 +110,9 @@ static void jbd2_get_transaction(journal_t *journal,
        spin_lock_init(&transaction->t_handle_lock);
        atomic_set(&transaction->t_updates, 0);
        atomic_set(&transaction->t_outstanding_credits,
+                  jbd2_descriptor_blocks_per_trans(journal) +
                   atomic_read(&journal->j_reserved_credits));
+       atomic_set(&transaction->t_outstanding_revokes, 0);
        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);
@@ -258,12 +282,13 @@ static int add_transaction_credits(journal_t *journal, int blocks,
         * *before* starting to dirty potentially checkpointed buffers
         * in the new transaction.
         */
-       if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+       if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                write_lock(&journal->j_state_lock);
-               if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+               if (jbd2_log_space_left(journal) <
+                                       journal->j_max_transaction_buffers)
                        __jbd2_log_wait_for_space(journal);
                write_unlock(&journal->j_state_lock);
                return 1;
@@ -299,12 +324,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
                             gfp_t gfp_mask)
 {
        transaction_t   *transaction, *new_transaction = NULL;
-       int             blocks = handle->h_buffer_credits;
+       int             blocks = handle->h_total_credits;
        int             rsv_blocks = 0;
        unsigned long ts = jiffies;
 
        if (handle->h_rsv_handle)
-               rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+               rsv_blocks = handle->h_rsv_handle->h_total_credits;
 
        /*
         * Limit the number of reserved credits to 1/2 of maximum transaction
@@ -405,6 +430,7 @@ repeat:
        update_t_max_wait(transaction, ts);
        handle->h_transaction = transaction;
        handle->h_requested_credits = blocks;
+       handle->h_revoke_credits_requested = handle->h_revoke_credits;
        handle->h_start_jiffies = jiffies;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
@@ -431,15 +457,15 @@ static handle_t *new_handle(int nblocks)
        handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
        if (!handle)
                return NULL;
-       handle->h_buffer_credits = nblocks;
+       handle->h_total_credits = nblocks;
        handle->h_ref = 1;
 
        return handle;
 }
 
 handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
-                             gfp_t gfp_mask, unsigned int type,
-                             unsigned int line_no)
+                             int revoke_records, gfp_t gfp_mask,
+                             unsigned int type, unsigned int line_no)
 {
        handle_t *handle = journal_current_handle();
        int err;
@@ -453,6 +479,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
                return handle;
        }
 
+       nblocks += DIV_ROUND_UP(revoke_records,
+                               journal->j_revoke_records_per_block);
        handle = new_handle(nblocks);
        if (!handle)
                return ERR_PTR(-ENOMEM);
@@ -468,6 +496,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
                rsv_handle->h_journal = journal;
                handle->h_rsv_handle = rsv_handle;
        }
+       handle->h_revoke_credits = revoke_records;
 
        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0) {
@@ -508,16 +537,21 @@ EXPORT_SYMBOL(jbd2__journal_start);
  */
 handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 {
-       return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+       return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);     /* rsv_blocks = 0, revoke_records = 0 */
 }
 EXPORT_SYMBOL(jbd2_journal_start);
 
-void jbd2_journal_free_reserved(handle_t *handle)
+static void __jbd2_journal_unreserve_handle(handle_t *handle)
 {
        journal_t *journal = handle->h_journal;
 
        WARN_ON(!handle->h_reserved);
-       sub_reserved_credits(journal, handle->h_buffer_credits);
+       sub_reserved_credits(journal, handle->h_total_credits);
+}
+
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+       __jbd2_journal_unreserve_handle(handle);        /* drops the reserved credits only; the handle is freed below */
        jbd2_free_handle(handle);
 }
 EXPORT_SYMBOL(jbd2_journal_free_reserved);
@@ -571,7 +605,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
-                               line_no, handle->h_buffer_credits);
+                               line_no, handle->h_total_credits);
        return 0;
 }
 EXPORT_SYMBOL(jbd2_journal_start_reserved);
@@ -580,6 +614,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
  * int jbd2_journal_extend() - extend buffer credits.
  * @handle:  handle to 'extend'
  * @nblocks: nr blocks to try to extend by.
+ * @revoke_records: number of revoke records to try to extend by.
  *
  * Some transactions, such as large extends and truncates, can be done
  * atomically all at once or in several stages.  The operation requests
@@ -596,7 +631,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
  * return code < 0 implies an error
  * return code > 0 implies normal transaction-full status.
  */
-int jbd2_journal_extend(handle_t *handle, int nblocks)
+int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
@@ -618,6 +653,12 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
                goto error_out;
        }
 
+       nblocks += DIV_ROUND_UP(
+                       handle->h_revoke_credits_requested + revoke_records,
+                       journal->j_revoke_records_per_block) -
+               DIV_ROUND_UP(
+                       handle->h_revoke_credits_requested,
+                       journal->j_revoke_records_per_block);
        spin_lock(&transaction->t_handle_lock);
        wanted = atomic_add_return(nblocks,
                                   &transaction->t_outstanding_credits);
@@ -629,22 +670,16 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
                goto unlock;
        }
 
-       if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
-           jbd2_log_space_left(journal)) {
-               jbd_debug(3, "denied handle %p %d blocks: "
-                         "insufficient log space\n", handle, nblocks);
-               atomic_sub(nblocks, &transaction->t_outstanding_credits);
-               goto unlock;
-       }
-
        trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
                                 transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
-                                handle->h_buffer_credits,
+                                handle->h_total_credits,
                                 nblocks);
 
-       handle->h_buffer_credits += nblocks;
+       handle->h_total_credits += nblocks;
        handle->h_requested_credits += nblocks;
+       handle->h_revoke_credits += revoke_records;
+       handle->h_revoke_credits_requested += revoke_records;
        result = 0;
 
        jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -655,11 +690,55 @@ error_out:
        return result;
 }
 
+static void stop_this_handle(handle_t *handle) /* called by jbd2_journal_stop() and jbd2__journal_restart() */
+{
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int revokes;    /* revoke credits this handle used: requested minus remaining */
+
+       J_ASSERT(journal_current_handle() == handle);
+       J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+       current->journal_info = NULL;
+       /*
+        * Subtract necessary revoke descriptor blocks from handle credits. We
+        * take care to account only for revoke descriptor blocks the
+        * transaction will really need as large sequences of transactions with
+        * small numbers of revokes are relatively common.
+        */
+       revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
+       if (revokes) {
+               int t_revokes, revoke_descriptors;
+               int rr_per_blk = journal->j_revoke_records_per_block;
+
+               WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
+                               > handle->h_total_credits);
+               t_revokes = atomic_add_return(revokes,
+                               &transaction->t_outstanding_revokes);
+               revoke_descriptors =
+                       DIV_ROUND_UP(t_revokes, rr_per_blk) -
+                       DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
+               handle->h_total_credits -= revoke_descriptors;  /* so the atomic_sub() below leaves these charged to the transaction */
+       }
+       atomic_sub(handle->h_total_credits,
+                  &transaction->t_outstanding_credits);
+       if (handle->h_rsv_handle)
+               __jbd2_journal_unreserve_handle(handle->h_rsv_handle);  /* the reserved handle itself is not freed here */
+       if (atomic_dec_and_test(&transaction->t_updates))
+               wake_up(&journal->j_wait_updates);
+
+       rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+       /*
+        * Scope of the GFP_NOFS context is over here and so we can restore the
+        * original alloc context.
+        */
+       memalloc_nofs_restore(handle->saved_alloc_context);
+}
 
 /**
  * int jbd2_journal_restart() - restart a handle .
  * @handle:  handle to restart
  * @nblocks: nr credits requested
+ * @revoke_records: number of revoke record credits requested
  * @gfp_mask: memory allocation flags (for start_this_handle)
  *
  * Restart a handle for a multi-transaction filesystem
@@ -672,56 +751,48 @@ error_out:
  * credits. We preserve reserved handle if there's any attached to the
  * passed in handle.
  */
-int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+                         gfp_t gfp_mask)
 {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        tid_t           tid;
-       int             need_to_start, ret;
+       int             need_to_start;
+       int             ret;
 
        /* If we've had an abort of any type, don't even think about
         * actually doing the restart! */
        if (is_handle_aborted(handle))
                return 0;
        journal = transaction->t_journal;
+       tid = transaction->t_tid;
 
        /*
         * First unlink the handle from its current transaction, and start the
         * commit on that.
         */
-       J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-       J_ASSERT(journal_current_handle() == handle);
-
-       read_lock(&journal->j_state_lock);
-       spin_lock(&transaction->t_handle_lock);
-       atomic_sub(handle->h_buffer_credits,
-                  &transaction->t_outstanding_credits);
-       if (handle->h_rsv_handle) {
-               sub_reserved_credits(journal,
-                                    handle->h_rsv_handle->h_buffer_credits);
-       }
-       if (atomic_dec_and_test(&transaction->t_updates))
-               wake_up(&journal->j_wait_updates);
-       tid = transaction->t_tid;
-       spin_unlock(&transaction->t_handle_lock);
+       jbd_debug(2, "restarting handle %p\n", handle);
+       stop_this_handle(handle);
        handle->h_transaction = NULL;
-       current->journal_info = NULL;
 
-       jbd_debug(2, "restarting handle %p\n", handle);
+       /*
+        * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+        * get rid of pointless j_state_lock traffic like this.
+        */
+       read_lock(&journal->j_state_lock);
        need_to_start = !tid_geq(journal->j_commit_request, tid);
        read_unlock(&journal->j_state_lock);
        if (need_to_start)
                jbd2_log_start_commit(journal, tid);
-
-       rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
-       handle->h_buffer_credits = nblocks;
-       /*
-        * Restore the original nofs context because the journal restart
-        * is basically the same thing as journal stop and start.
-        * start_this_handle will start a new nofs context.
-        */
-       memalloc_nofs_restore(handle->saved_alloc_context);
+       handle->h_total_credits = nblocks +
+               DIV_ROUND_UP(revoke_records,
+                            journal->j_revoke_records_per_block);
+       handle->h_revoke_credits = revoke_records;
        ret = start_this_handle(journal, handle, gfp_mask);
+       trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+                                ret ? 0 : handle->h_transaction->t_tid,
+                                handle->h_type, handle->h_line_no,
+                                handle->h_total_credits);
        return ret;
 }
 EXPORT_SYMBOL(jbd2__journal_restart);
@@ -729,7 +800,7 @@ EXPORT_SYMBOL(jbd2__journal_restart);
 
 int jbd2_journal_restart(handle_t *handle, int nblocks)
 {
-       return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+       return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);     /* revoke_records = 0 */
 }
 EXPORT_SYMBOL(jbd2_journal_restart);
 
@@ -879,7 +950,7 @@ repeat:
 
        start_lock = jiffies;
        lock_buffer(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
 
        /* If it takes too long to lock the buffer, trace it */
        time_lock = jbd2_time_diff(start_lock, jiffies);
@@ -929,7 +1000,7 @@ repeat:
 
        error = -EROFS;
        if (is_handle_aborted(handle)) {
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                goto out;
        }
        error = 0;
@@ -993,7 +1064,7 @@ repeat:
         */
        if (buffer_shadow(bh)) {
                JBUFFER_TRACE(jh, "on shadow: sleep");
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }
@@ -1014,7 +1085,7 @@ repeat:
                JBUFFER_TRACE(jh, "generate frozen data");
                if (!frozen_buffer) {
                        JBUFFER_TRACE(jh, "allocate memory for buffer");
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
                                                   GFP_NOFS | __GFP_NOFAIL);
                        goto repeat;
@@ -1033,7 +1104,7 @@ attach_next:
        jh->b_next_transaction = transaction;
 
 done:
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
 
        /*
         * If we are about to journal a buffer, then any revoke pending on it is
@@ -1172,7 +1243,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
         * that case: the transaction must have deleted the buffer for it to be
         * reused here.
         */
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
                jh->b_transaction == NULL ||
                (jh->b_transaction == journal->j_committing_transaction &&
@@ -1207,7 +1278,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
                jh->b_next_transaction = transaction;
                spin_unlock(&journal->j_list_lock);
        }
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
 
        /*
         * akpm: I added this.  ext3_alloc_branch can pick up new indirect
@@ -1275,13 +1346,13 @@ repeat:
                committed_data = jbd2_alloc(jh2bh(jh)->b_size,
                                            GFP_NOFS|__GFP_NOFAIL);
 
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        if (!jh->b_committed_data) {
                /* Copy out the current buffer contents into the
                 * preserved, committed copy. */
                JBUFFER_TRACE(jh, "generate b_committed data");
                if (!committed_data) {
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        goto repeat;
                }
 
@@ -1289,7 +1360,7 @@ repeat:
                committed_data = NULL;
                memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
        }
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
 out:
        jbd2_journal_put_journal_head(jh);
        if (unlikely(committed_data))
@@ -1390,16 +1461,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
         */
        if (jh->b_transaction != transaction &&
            jh->b_next_transaction != transaction) {
-               jbd_lock_bh_state(bh);
+               spin_lock(&jh->b_state_lock);
                J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_next_transaction == transaction);
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
        }
        if (jh->b_modified == 1) {
                /* If it's in our transaction it must be in BJ_Metadata list. */
                if (jh->b_transaction == transaction &&
                    jh->b_jlist != BJ_Metadata) {
-                       jbd_lock_bh_state(bh);
+                       spin_lock(&jh->b_state_lock);
                        if (jh->b_transaction == transaction &&
                            jh->b_jlist != BJ_Metadata)
                                pr_err("JBD2: assertion failure: h_type=%u "
@@ -1409,13 +1480,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
                                       jh->b_jlist);
                        J_ASSERT_JH(jh, jh->b_transaction != transaction ||
                                        jh->b_jlist == BJ_Metadata);
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                }
                goto out;
        }
 
        journal = transaction->t_journal;
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
 
        if (jh->b_modified == 0) {
                /*
@@ -1423,12 +1494,12 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
                 * of the transaction. This needs to be done
                 * once a transaction -bzzz
                 */
-               if (handle->h_buffer_credits <= 0) {
+               if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
                        ret = -ENOSPC;
                        goto out_unlock_bh;
                }
                jh->b_modified = 1;
-               handle->h_buffer_credits--;
+               handle->h_total_credits--;
        }
 
        /*
@@ -1501,7 +1572,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
        __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
        spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
 out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
@@ -1539,18 +1610,20 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 
        BUFFER_TRACE(bh, "entry");
 
-       jbd_lock_bh_state(bh);
+       jh = jbd2_journal_grab_journal_head(bh);
+       if (!jh) {
+               __bforget(bh);
+               return 0;
+       }
 
-       if (!buffer_jbd(bh))
-               goto not_jbd;
-       jh = bh2jh(bh);
+       spin_lock(&jh->b_state_lock);
 
        /* Critical error: attempting to delete a bitmap buffer, maybe?
         * Don't do any jbd operations, and return an error. */
        if (!J_EXPECT_JH(jh, !jh->b_committed_data,
                         "inconsistent data on disk")) {
                err = -EIO;
-               goto not_jbd;
+               goto drop;
        }
 
        /* keep track of whether or not this transaction modified us */
@@ -1598,10 +1671,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
-                       if (!buffer_jbd(bh)) {
-                               spin_unlock(&journal->j_list_lock);
-                               goto not_jbd;
-                       }
+                       jbd2_journal_put_journal_head(jh);
                }
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction) {
@@ -1643,7 +1713,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "belongs to none transaction");
                        spin_unlock(&journal->j_list_lock);
-                       goto not_jbd;
+                       goto drop;
                }
 
                /*
@@ -1653,7 +1723,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                if (!buffer_dirty(bh)) {
                        __jbd2_journal_remove_checkpoint(jh);
                        spin_unlock(&journal->j_list_lock);
-                       goto not_jbd;
+                       goto drop;
                }
 
                /*
@@ -1666,20 +1736,15 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                spin_unlock(&journal->j_list_lock);
        }
-
-       jbd_unlock_bh_state(bh);
-       __brelse(bh);
 drop:
+       __brelse(bh);
+       spin_unlock(&jh->b_state_lock);
+       jbd2_journal_put_journal_head(jh);
        if (drop_reserve) {
                /* no need to reserve log space for this block -bzzz */
-               handle->h_buffer_credits++;
+               handle->h_total_credits++;
        }
        return err;
-
-not_jbd:
-       jbd_unlock_bh_state(bh);
-       __bforget(bh);
-       goto drop;
 }
 
 /**
@@ -1706,45 +1771,34 @@ int jbd2_journal_stop(handle_t *handle)
        tid_t tid;
        pid_t pid;
 
+       if (--handle->h_ref > 0) {
+               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+                                                handle->h_ref);
+               if (is_handle_aborted(handle))
+                       return -EIO;
+               return 0;
+       }
        if (!transaction) {
                /*
-                * Handle is already detached from the transaction so
-                * there is nothing to do other than decrease a refcount,
-                * or free the handle if refcount drops to zero
+                * Handle is already detached from the transaction so there is
+                * nothing to do other than free the handle.
                 */
-               if (--handle->h_ref > 0) {
-                       jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-                                                        handle->h_ref);
-                       return err;
-               } else {
-                       if (handle->h_rsv_handle)
-                               jbd2_free_handle(handle->h_rsv_handle);
-                       goto free_and_exit;
-               }
+               memalloc_nofs_restore(handle->saved_alloc_context);
+               goto free_and_exit;
        }
        journal = transaction->t_journal;
-
-       J_ASSERT(journal_current_handle() == handle);
+       tid = transaction->t_tid;
 
        if (is_handle_aborted(handle))
                err = -EIO;
-       else
-               J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-
-       if (--handle->h_ref > 0) {
-               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-                         handle->h_ref);
-               return err;
-       }
 
        jbd_debug(4, "Handle %p going down\n", handle);
        trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
-                               transaction->t_tid,
-                               handle->h_type, handle->h_line_no,
+                               tid, handle->h_type, handle->h_line_no,
                                jiffies - handle->h_start_jiffies,
                                handle->h_sync, handle->h_requested_credits,
                                (handle->h_requested_credits -
-                                handle->h_buffer_credits));
+                                handle->h_total_credits));
 
        /*
         * Implement synchronous transaction batching.  If the handle
@@ -1804,19 +1858,13 @@ int jbd2_journal_stop(handle_t *handle)
 
        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;
-       current->journal_info = NULL;
-       atomic_sub(handle->h_buffer_credits,
-                  &transaction->t_outstanding_credits);
 
        /*
         * If the handle is marked SYNC, we need to set another commit
-        * going!  We also want to force a commit if the current
-        * transaction is occupying too much of the log, or if the
-        * transaction is too old now.
+        * going!  We also want to force a commit if the transaction is too
+        * old now.
         */
        if (handle->h_sync ||
-           (atomic_read(&transaction->t_outstanding_credits) >
-            journal->j_max_transaction_buffers) ||
            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
@@ -1825,7 +1873,7 @@ int jbd2_journal_stop(handle_t *handle)
                jbd_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
-               jbd2_log_start_commit(journal, transaction->t_tid);
+               jbd2_log_start_commit(journal, tid);
 
                /*
                 * Special case: JBD2_SYNC synchronous updates require us
@@ -1836,31 +1884,19 @@ int jbd2_journal_stop(handle_t *handle)
        }
 
        /*
-        * Once we drop t_updates, if it goes to zero the transaction
-        * could start committing on us and eventually disappear.  So
-        * once we do this, we must not dereference transaction
-        * pointer again.
+        * Once stop_this_handle() drops t_updates, the transaction could start
+        * committing on us and eventually disappear.  So we must not
+        * dereference transaction pointer again after calling
+        * stop_this_handle().
         */
-       tid = transaction->t_tid;
-       if (atomic_dec_and_test(&transaction->t_updates)) {
-               wake_up(&journal->j_wait_updates);
-               if (journal->j_barrier_count)
-                       wake_up(&journal->j_wait_transaction_locked);
-       }
-
-       rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+       stop_this_handle(handle);
 
        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);
 
-       if (handle->h_rsv_handle)
-               jbd2_journal_free_reserved(handle->h_rsv_handle);
 free_and_exit:
-       /*
-        * Scope of the GFP_NOFS context is over here and so we can restore the
-        * original alloc context.
-        */
-       memalloc_nofs_restore(handle->saved_alloc_context);
+       if (handle->h_rsv_handle)
+               jbd2_free_handle(handle->h_rsv_handle);
        jbd2_free_handle(handle);
        return err;
 }
@@ -1878,7 +1914,7 @@ free_and_exit:
  *
  * j_list_lock is held.
  *
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
  */
 
 static inline void
@@ -1902,7 +1938,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
  *
  * Called with j_list_lock held, and the journal may not be locked.
  *
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
  */
 
 static inline void
@@ -1934,7 +1970,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
        transaction_t *transaction;
        struct buffer_head *bh = jh2bh(jh);
 
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        transaction = jh->b_transaction;
        if (transaction)
                assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1971,17 +2007,15 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 }
 
 /*
- * Remove buffer from all transactions.
+ * Remove buffer from all transactions. The caller is responsible for dropping
+ * the jh reference that belonged to the transaction.
  *
  * Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
  */
 static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
 {
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
-       jbd2_journal_put_journal_head(jh);
 }
 
 void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
@@ -1990,18 +2024,19 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
 
        /* Get reference so that buffer cannot be freed before we unlock it */
        get_bh(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
+       jbd2_journal_put_journal_head(jh);
        __brelse(bh);
 }
 
 /*
  * Called from jbd2_journal_try_to_free_buffers().
  *
- * Called under jbd_lock_bh_state(bh)
+ * Called under jh->b_state_lock
  */
 static void
 __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
@@ -2088,10 +2123,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
                if (!jh)
                        continue;
 
-               jbd_lock_bh_state(bh);
+               spin_lock(&jh->b_state_lock);
                __journal_try_to_free_buffer(journal, bh);
+               spin_unlock(&jh->b_state_lock);
                jbd2_journal_put_journal_head(jh);
-               jbd_unlock_bh_state(bh);
                if (buffer_jbd(bh))
                        goto busy;
        } while ((bh = bh->b_this_page) != head);
@@ -2112,7 +2147,7 @@ busy:
  *
  * Called under j_list_lock.
  *
- * Called under jbd_lock_bh_state(bh).
+ * Called under jh->b_state_lock.
  */
 static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 {
@@ -2133,6 +2168,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
                __jbd2_journal_unfile_buffer(jh);
+               jbd2_journal_put_journal_head(jh);
        }
        return may_free;
 }
@@ -2199,18 +2235,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
         * holding the page lock. --sct
         */
 
-       if (!buffer_jbd(bh))
+       jh = jbd2_journal_grab_journal_head(bh);
+       if (!jh)
                goto zap_buffer_unlocked;
 
        /* OK, we have data buffer in journaled mode */
        write_lock(&journal->j_state_lock);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
 
-       jh = jbd2_journal_grab_journal_head(bh);
-       if (!jh)
-               goto zap_buffer_no_jh;
-
        /*
         * We cannot remove the buffer from checkpoint lists until the
         * transaction adding inode to orphan list (let's call it T)
@@ -2289,10 +2322,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
                 * for commit and try again.
                 */
                if (partial_page) {
-                       jbd2_journal_put_journal_head(jh);
                        spin_unlock(&journal->j_list_lock);
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        write_unlock(&journal->j_state_lock);
+                       jbd2_journal_put_journal_head(jh);
                        return -EBUSY;
                }
                /*
@@ -2304,10 +2337,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
                set_buffer_freed(bh);
                if (journal->j_running_transaction && buffer_jbddirty(bh))
                        jh->b_next_transaction = journal->j_running_transaction;
-               jbd2_journal_put_journal_head(jh);
                spin_unlock(&journal->j_list_lock);
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                write_unlock(&journal->j_state_lock);
+               jbd2_journal_put_journal_head(jh);
                return 0;
        } else {
                /* Good, the buffer belongs to the running transaction.
@@ -2331,11 +2364,10 @@ zap_buffer:
         * here.
         */
        jh->b_modified = 0;
-       jbd2_journal_put_journal_head(jh);
-zap_buffer_no_jh:
        spin_unlock(&journal->j_list_lock);
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
        write_unlock(&journal->j_state_lock);
+       jbd2_journal_put_journal_head(jh);
 zap_buffer_unlocked:
        clear_buffer_dirty(bh);
        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2422,7 +2454,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
        int was_dirty = 0;
        struct buffer_head *bh = jh2bh(jh);
 
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        assert_spin_locked(&transaction->t_journal->j_list_lock);
 
        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2484,11 +2516,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
 void jbd2_journal_file_buffer(struct journal_head *jh,
                                transaction_t *transaction, int jlist)
 {
-       jbd_lock_bh_state(jh2bh(jh));
+       spin_lock(&jh->b_state_lock);
        spin_lock(&transaction->t_journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, transaction, jlist);
        spin_unlock(&transaction->t_journal->j_list_lock);
-       jbd_unlock_bh_state(jh2bh(jh));
+       spin_unlock(&jh->b_state_lock);
 }
 
 /*
@@ -2498,23 +2530,25 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
  * buffer on that transaction's metadata list.
  *
  * Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
+ * Called under jh->b_state_lock
  *
- * jh and bh may be already free when this function returns
+ * When this function returns true, there's no next transaction to refile to
+ * and the caller has to drop jh reference through
+ * jbd2_journal_put_journal_head().
  */
-void __jbd2_journal_refile_buffer(struct journal_head *jh)
+bool __jbd2_journal_refile_buffer(struct journal_head *jh)
 {
        int was_dirty, jlist;
        struct buffer_head *bh = jh2bh(jh);
 
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        if (jh->b_transaction)
                assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
 
        /* If the buffer is now unused, just drop it. */
        if (jh->b_next_transaction == NULL) {
                __jbd2_journal_unfile_buffer(jh);
-               return;
+               return true;
        }
 
        /*
@@ -2542,6 +2576,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
 
        if (was_dirty)
                set_buffer_jbddirty(bh);
+       return false;
 }
 
 /*
@@ -2552,16 +2587,15 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
  */
 void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 {
-       struct buffer_head *bh = jh2bh(jh);
+       bool drop;
 
-       /* Get reference so that buffer cannot be freed before we unlock it */
-       get_bh(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
-       __jbd2_journal_refile_buffer(jh);
-       jbd_unlock_bh_state(bh);
+       drop = __jbd2_journal_refile_buffer(jh);
+       spin_unlock(&jh->b_state_lock);
        spin_unlock(&journal->j_list_lock);
-       __brelse(bh);
+       if (drop)
+               jbd2_journal_put_journal_head(jh);
 }
 
 /*
index f9baefc..88534eb 100644 (file)
@@ -2288,9 +2288,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
        int ret = 0;
        int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
 
-       if (handle->h_buffer_credits < credits)
+       if (jbd2_handle_buffer_credits(handle) < credits)
                ret = ocfs2_extend_trans(handle,
-                                        credits - handle->h_buffer_credits);
+                               credits - jbd2_handle_buffer_credits(handle));
 
        return ret;
 }
@@ -2367,7 +2367,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
                                   struct ocfs2_path *right_path,
                                   struct ocfs2_path **ret_left_path)
 {
-       int ret, start, orig_credits = handle->h_buffer_credits;
+       int ret, start, orig_credits = jbd2_handle_buffer_credits(handle);
        u32 cpos;
        struct ocfs2_path *left_path = NULL;
        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
@@ -3148,7 +3148,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle,
                                  struct ocfs2_path *path,
                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
-       int ret, orig_credits = handle->h_buffer_credits;
+       int ret, orig_credits = jbd2_handle_buffer_credits(handle);
        struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
@@ -3386,8 +3386,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
                                                        right_path);
 
                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
-                                                     handle->h_buffer_credits,
-                                                     right_path);
+                                       jbd2_handle_buffer_credits(handle),
+                                       right_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3548,8 +3548,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
                                                        right_path);
 
                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
-                                                     handle->h_buffer_credits,
-                                                     left_path);
+                                       jbd2_handle_buffer_credits(handle),
+                                       left_path);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
@@ -3623,7 +3623,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
                    le16_to_cpu(el->l_next_free_rec) == 1) {
                        /* extend credit for ocfs2_remove_rightmost_path */
                        ret = ocfs2_extend_rotate_transaction(handle, 0,
-                                       handle->h_buffer_credits,
+                                       jbd2_handle_buffer_credits(handle),
                                        right_path);
                        if (ret) {
                                mlog_errno(ret);
@@ -3669,7 +3669,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
        if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
                /* extend credit for ocfs2_remove_rightmost_path */
                ret = ocfs2_extend_rotate_transaction(handle, 0,
-                               handle->h_buffer_credits,
+                               jbd2_handle_buffer_credits(handle),
                                path);
                if (ret) {
                        mlog_errno(ret);
@@ -3725,7 +3725,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
 
                /* extend credit for ocfs2_remove_rightmost_path */
                ret = ocfs2_extend_rotate_transaction(handle, 0,
-                                       handle->h_buffer_credits,
+                                       jbd2_handle_buffer_credits(handle),
                                        path);
                if (ret) {
                        mlog_errno(ret);
@@ -3755,7 +3755,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
 
                /* extend credit for ocfs2_remove_rightmost_path */
                ret = ocfs2_extend_rotate_transaction(handle, 0,
-                               handle->h_buffer_credits,
+                               jbd2_handle_buffer_credits(handle),
                                path);
                if (ret) {
                        mlog_errno(ret);
@@ -3799,7 +3799,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
                if (ctxt->c_split_covers_rec) {
                        /* extend credit for ocfs2_remove_rightmost_path */
                        ret = ocfs2_extend_rotate_transaction(handle, 0,
-                                       handle->h_buffer_credits,
+                                       jbd2_handle_buffer_credits(handle),
                                        path);
                        if (ret) {
                                mlog_errno(ret);
@@ -5358,7 +5358,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
        if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
                /* extend credit for ocfs2_remove_rightmost_path */
                ret = ocfs2_extend_rotate_transaction(handle, 0,
-                               handle->h_buffer_credits,
+                               jbd2_handle_buffer_credits(handle),
                                path);
                if (ret) {
                        mlog_errno(ret);
@@ -5427,8 +5427,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
        }
 
        ret = ocfs2_extend_rotate_transaction(handle, 0,
-                                             handle->h_buffer_credits,
-                                             path);
+                                       jbd2_handle_buffer_credits(handle),
+                                       path);
        if (ret) {
                mlog_errno(ret);
                goto out;
index 699a560..1afe57f 100644 (file)
@@ -420,14 +420,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
        if (!nblocks)
                return 0;
 
-       old_nblocks = handle->h_buffer_credits;
+       old_nblocks = jbd2_handle_buffer_credits(handle);
 
        trace_ocfs2_extend_trans(old_nblocks, nblocks);
 
 #ifdef CONFIG_OCFS2_DEBUG_FS
        status = 1;
 #else
-       status = jbd2_journal_extend(handle, nblocks);
+       status = jbd2_journal_extend(handle, nblocks, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -461,13 +461,13 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
 
        BUG_ON(!handle);
 
-       old_nblks = handle->h_buffer_credits;
+       old_nblks = jbd2_handle_buffer_credits(handle);
        trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
 
        if (old_nblks < thresh)
                return 0;
 
-       status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+       status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
index 69c21a3..4180c3e 100644 (file)
@@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
                                         int nr)
 {
        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+       struct journal_head *jh;
        int ret;
 
        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
@@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
        if (!buffer_jbd(bg_bh))
                return 1;
 
-       jbd_lock_bh_state(bg_bh);
-       bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+       jh = bh2jh(bg_bh);
+       spin_lock(&jh->b_state_lock);
+       bg = (struct ocfs2_group_desc *) jh->b_committed_data;
        if (bg)
                ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
        else
                ret = 1;
-       jbd_unlock_bh_state(bg_bh);
+       spin_unlock(&jh->b_state_lock);
 
        return ret;
 }
@@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
        int status;
        unsigned int tmp;
        struct ocfs2_group_desc *undo_bg = NULL;
+       struct journal_head *jh;
 
        /* The caller got this descriptor from
         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
@@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
                goto bail;
        }
 
+       jh = bh2jh(group_bh);
        if (undo_fn) {
-               jbd_lock_bh_state(group_bh);
-               undo_bg = (struct ocfs2_group_desc *)
-                                       bh2jh(group_bh)->b_committed_data;
+               spin_lock(&jh->b_state_lock);
+               undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
                BUG_ON(!undo_bg);
        }
 
@@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
                if (undo_fn)
-                       jbd_unlock_bh_state(group_bh);
+                       spin_unlock(&jh->b_state_lock);
                return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
                                   (unsigned long long)le64_to_cpu(bg->bg_blkno),
                                   le16_to_cpu(bg->bg_bits),
@@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
        }
 
        if (undo_fn)
-               jbd_unlock_bh_state(group_bh);
+               spin_unlock(&jh->b_state_lock);
 
        ocfs2_journal_dirty(handle, group_bh);
 bail:
index 564793c..29dce6f 100644 (file)
@@ -313,7 +313,6 @@ enum jbd_state_bits {
        BH_Revoked,             /* Has been revoked from the log */
        BH_RevokeValid,         /* Revoked flag is valid */
        BH_JBDDirty,            /* Is dirty but journaled */
-       BH_State,               /* Pins most journal_head state */
        BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
        BH_Shadow,              /* IO on shadow buffer is running */
        BH_Verified,            /* Metadata block has been verified ok */
@@ -342,26 +341,6 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
        return bh->b_private;
 }
 
-static inline void jbd_lock_bh_state(struct buffer_head *bh)
-{
-       bit_spin_lock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_trylock_bh_state(struct buffer_head *bh)
-{
-       return bit_spin_trylock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
-{
-       return bit_spin_is_locked(BH_State, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_state(struct buffer_head *bh)
-{
-       bit_spin_unlock(BH_State, &bh->b_state);
-}
-
 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
 {
        bit_spin_lock(BH_JournalHead, &bh->b_state);
@@ -477,7 +456,9 @@ struct jbd2_revoke_table_s;
  * @h_transaction: Which compound transaction is this update a part of?
  * @h_journal: Which journal handle belongs to - used iff h_reserved set.
  * @h_rsv_handle: Handle reserved for finishing the logical operation.
- * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
+ * @h_total_credits: Number of remaining buffers we are allowed to add to
+       journal. These are dirty buffers and revoke descriptor blocks.
+ * @h_revoke_credits: Number of remaining revoke records available for handle
  * @h_ref: Reference count on this handle.
  * @h_err: Field for caller's use to track errors through large fs operations.
  * @h_sync: Flag for sync-on-close.
@@ -487,7 +468,8 @@ struct jbd2_revoke_table_s;
  * @h_type: For handle statistics.
  * @h_line_no: For handle statistics.
  * @h_start_jiffies: Handle Start time.
- * @h_requested_credits: Holds @h_buffer_credits after handle is started.
+ * @h_requested_credits: Holds @h_total_credits after handle is started.
+ * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started.
  * @saved_alloc_context: Saved context while transaction is open.
  **/
 
@@ -504,7 +486,9 @@ struct jbd2_journal_handle
        };
 
        handle_t                *h_rsv_handle;
-       int                     h_buffer_credits;
+       int                     h_total_credits;
+       int                     h_revoke_credits;
+       int                     h_revoke_credits_requested;
        int                     h_ref;
        int                     h_err;
 
@@ -556,9 +540,9 @@ struct transaction_chp_stats_s {
  *      ->jbd_lock_bh_journal_head()   (This is "innermost")
  *
  *    j_state_lock
- *    ->jbd_lock_bh_state()
+ *    ->b_state_lock
  *
- *    jbd_lock_bh_state()
+ *    b_state_lock
  *    ->j_list_lock
  *
  *    j_state_lock
@@ -681,12 +665,25 @@ struct transaction_s
        atomic_t                t_updates;
 
        /*
-        * Number of buffers reserved for use by all handles in this transaction
-        * handle but not yet modified. [none]
+        * Number of blocks reserved for this transaction in the journal.
+        * This is including all credits reserved when starting transaction
+        * handles as well as all journal descriptor blocks needed for this
+        * transaction. [none]
         */
        atomic_t                t_outstanding_credits;
 
        /*
+        * Number of revoke records for this transaction added by already
+        * stopped handles. [none]
+        */
+       atomic_t                t_outstanding_revokes;
+
+       /*
+        * How many handles used this transaction? [none]
+        */
+       atomic_t                t_handle_count;
+
+       /*
         * Forward and backward links for the circular list of all transactions
         * awaiting checkpoint. [j_list_lock]
         */
@@ -704,11 +701,6 @@ struct transaction_s
        ktime_t                 t_start_time;
 
        /*
-        * How many handles used this transaction? [none]
-        */
-       atomic_t                t_handle_count;
-
-       /*
         * This transaction is being forced and some process is
         * waiting for it to finish.
         */
@@ -1025,6 +1017,13 @@ struct journal_s
        int                     j_max_transaction_buffers;
 
        /**
+        * @j_revoke_records_per_block:
+        *
+        * Number of revoke records that fit in one descriptor block.
+        */
+       int                     j_revoke_records_per_block;
+
+       /**
         * @j_commit_interval:
         *
         * What is the maximum transaction lifetime before we begin a commit?
@@ -1257,7 +1256,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3,                CSUM_V3)
 
 /* Filing buffers */
 extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
-extern void __jbd2_journal_refile_buffer(struct journal_head *);
+extern bool __jbd2_journal_refile_buffer(struct journal_head *);
 extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
 extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
 extern void __journal_free_buffer(struct journal_head *bh);
@@ -1358,14 +1357,16 @@ static inline handle_t *journal_current_handle(void)
 
 extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
 extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
-                                    gfp_t gfp_mask, unsigned int type,
-                                    unsigned int line_no);
+                                    int revoke_records, gfp_t gfp_mask,
+                                    unsigned int type, unsigned int line_no);
 extern int      jbd2_journal_restart(handle_t *, int nblocks);
-extern int      jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask);
+extern int      jbd2__journal_restart(handle_t *, int nblocks,
+                                      int revoke_records, gfp_t gfp_mask);
 extern int      jbd2_journal_start_reserved(handle_t *handle,
                                unsigned int type, unsigned int line_no);
 extern void     jbd2_journal_free_reserved(handle_t *handle);
-extern int      jbd2_journal_extend (handle_t *, int nblocks);
+extern int      jbd2_journal_extend(handle_t *handle, int nblocks,
+                                    int revoke_records);
 extern int      jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
 extern int      jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
 extern int      jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
@@ -1561,37 +1562,18 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
 }
 
 /*
- * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for
- * transaction control blocks.
- */
-#define JBD2_CONTROL_BLOCKS_SHIFT 5
-
-/*
- * Return the minimum number of blocks which must be free in the journal
- * before a new transaction may be started.  Must be called under j_state_lock.
- */
-static inline int jbd2_space_needed(journal_t *journal)
-{
-       int nblocks = journal->j_max_transaction_buffers;
-       return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT);
-}
-
-/*
  * Return number of free blocks in the log. Must be called under j_state_lock.
  */
 static inline unsigned long jbd2_log_space_left(journal_t *journal)
 {
        /* Allow for rounding errors */
-       unsigned long free = journal->j_free - 32;
+       long free = journal->j_free - 32;
 
        if (journal->j_committing_transaction) {
-               unsigned long committing = atomic_read(&journal->
-                       j_committing_transaction->t_outstanding_credits);
-
-               /* Transaction + control blocks */
-               free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT);
+               free -= atomic_read(&journal->
+                        j_committing_transaction->t_outstanding_credits);
        }
-       return free;
+       return max_t(long, free, 0);
 }
 
 /*
@@ -1645,6 +1627,20 @@ static inline tid_t  jbd2_get_latest_transaction(journal_t *journal)
        return tid;
 }
 
+static inline int jbd2_handle_buffer_credits(handle_t *handle)
+{
+       journal_t *journal;
+
+       if (!handle->h_reserved)
+               journal = handle->h_transaction->t_journal;
+       else
+               journal = handle->h_journal;
+
+       return handle->h_total_credits -
+               DIV_ROUND_UP(handle->h_revoke_credits_requested,
+                            journal->j_revoke_records_per_block);
+}
+
 #ifdef __KERNEL__
 
 #define buffer_trace_init(bh)  do {} while (0)
index 9fb8705..75bc561 100644 (file)
@@ -11,6 +11,8 @@
 #ifndef JOURNAL_HEAD_H_INCLUDED
 #define JOURNAL_HEAD_H_INCLUDED
 
+#include <linux/spinlock.h>
+
 typedef unsigned int           tid_t;          /* Unique transaction ID */
 typedef struct transaction_s   transaction_t;  /* Compound transaction type */
 
@@ -24,13 +26,18 @@ struct journal_head {
        struct buffer_head *b_bh;
 
        /*
+        * Protect the buffer head state
+        */
+       spinlock_t b_state_lock;
+
+       /*
         * Reference count - see description in journal.c
         * [jbd_lock_bh_journal_head()]
         */
        int b_jcount;
 
        /*
-        * Journalling list for this buffer [jbd_lock_bh_state()]
+        * Journalling list for this buffer [b_state_lock]
         * NOTE: We *cannot* combine this with b_modified into a bitfield
         * as gcc would then (which the C standard allows but which is
         * very unuseful) make 64-bit accesses to the bitfield and clobber
@@ -41,20 +48,20 @@ struct journal_head {
        /*
         * This flag signals the buffer has been modified by
         * the currently running transaction
-        * [jbd_lock_bh_state()]
+        * [b_state_lock]
         */
        unsigned b_modified;
 
        /*
         * Copy of the buffer data frozen for writing to the log.
-        * [jbd_lock_bh_state()]
+        * [b_state_lock]
         */
        char *b_frozen_data;
 
        /*
         * Pointer to a saved copy of the buffer containing no uncommitted
         * deallocation references, so that allocations can avoid overwriting
-        * uncommitted deletes. [jbd_lock_bh_state()]
+        * uncommitted deletes. [b_state_lock]
         */
        char *b_committed_data;
 
@@ -63,7 +70,7 @@ struct journal_head {
         * metadata: either the running transaction or the committing
         * transaction (if there is one).  Only applies to buffers on a
         * transaction's data or metadata journaling list.
-        * [j_list_lock] [jbd_lock_bh_state()]
+        * [j_list_lock] [b_state_lock]
         * Either of these locks is enough for reading, both are needed for
         * changes.
         */
@@ -73,13 +80,13 @@ struct journal_head {
         * Pointer to the running compound transaction which is currently
         * modifying the buffer's metadata, if there was already a transaction
         * committing it when the new transaction touched it.
-        * [t_list_lock] [jbd_lock_bh_state()]
+        * [t_list_lock] [b_state_lock]
         */
        transaction_t *b_next_transaction;
 
        /*
         * Doubly-linked list of buffers on a transaction's data, metadata or
-        * forget queue. [t_list_lock] [jbd_lock_bh_state()]
+        * forget queue. [t_list_lock] [b_state_lock]
         */
        struct journal_head *b_tnext, *b_tprev;
 
index d68e9e5..182c9fe 100644 (file)
@@ -1746,15 +1746,16 @@ TRACE_EVENT(ext4_load_inode,
 
 TRACE_EVENT(ext4_journal_start,
        TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks,
-                unsigned long IP),
+                int revoke_creds, unsigned long IP),
 
-       TP_ARGS(sb, blocks, rsv_blocks, IP),
+       TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, IP),
 
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
                __field(unsigned long,  ip                      )
                __field(          int,  blocks                  )
                __field(          int,  rsv_blocks              )
+               __field(          int,  revoke_creds            )
        ),
 
        TP_fast_assign(
@@ -1762,11 +1763,13 @@ TRACE_EVENT(ext4_journal_start,
                __entry->ip              = IP;
                __entry->blocks          = blocks;
                __entry->rsv_blocks      = rsv_blocks;
+               __entry->revoke_creds    = revoke_creds;
        ),
 
-       TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pS",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip)
+       TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d, "
+                 "caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds,
+                 (void *)__entry->ip)
 );
 
 TRACE_EVENT(ext4_journal_start_reserved,
index 2310b25..d16a328 100644 (file)
@@ -133,7 +133,7 @@ TRACE_EVENT(jbd2_submit_inode_data,
                  (unsigned long) __entry->ino)
 );
 
-TRACE_EVENT(jbd2_handle_start,
+DECLARE_EVENT_CLASS(jbd2_handle_start_class,
        TP_PROTO(dev_t dev, unsigned long tid, unsigned int type,
                 unsigned int line_no, int requested_blocks),
 
@@ -161,6 +161,20 @@ TRACE_EVENT(jbd2_handle_start,
                  __entry->type, __entry->line_no, __entry->requested_blocks)
 );
 
+DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start,
+       TP_PROTO(dev_t dev, unsigned long tid, unsigned int type,
+                unsigned int line_no, int requested_blocks),
+
+       TP_ARGS(dev, tid, type, line_no, requested_blocks)
+);
+
+DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart,
+       TP_PROTO(dev_t dev, unsigned long tid, unsigned int type,
+                unsigned int line_no, int requested_blocks),
+
+       TP_ARGS(dev, tid, type, line_no, requested_blocks)
+);
+
 TRACE_EVENT(jbd2_handle_extend,
        TP_PROTO(dev_t dev, unsigned long tid, unsigned int type,
                 unsigned int line_no, int buffer_credits,