Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 30 Nov 2019 18:53:02 +0000 (10:53 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sat, 30 Nov 2019 18:53:02 +0000 (10:53 -0800)
Pull ext4 updates from Ted Ts'o:
 "This merge window saw the the following new featuers added to ext4:

   - Direct I/O via iomap (required the iomap-for-next branch from
     Darrick as a prereq).

   - Support for using dioread_nolock where the block size < page size.

   - Support for encryption for file systems where the block size < page
     size.

   - Rework of journal credits handling so a revoke-heavy workload will
     not cause the journal to run out of space.

   - Replacement of bit-spinlocks with spinlocks in jbd2.

  Also included were some bug fixes and cleanups, mostly addressing
  corner cases found in fuzzed file systems and improving error path
  handling"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (59 commits)
  ext4: work around deleting a file with i_nlink == 0 safely
  ext4: add more paranoia checking in ext4_expand_extra_isize handling
  jbd2: make jbd2_handle_buffer_credits() handle reserved handles
  ext4: fix a bug in ext4_wait_for_tail_page_commit
  ext4: bio_alloc with __GFP_DIRECT_RECLAIM never fails
  ext4: code cleanup for get_next_id
  ext4: fix leak of quota reservations
  ext4: remove unused variable warning in parse_options()
  ext4: Enable encryption for subpage-sized blocks
  fs/buffer.c: support fscrypt in block_read_full_page()
  ext4: Add error handling for io_end_vec struct allocation
  jbd2: Fine tune estimate of necessary descriptor blocks
  jbd2: Provide trace event for handle restarts
  ext4: Reserve revoke credits for freed blocks
  jbd2: Make credit checking more strict
  jbd2: Rename h_buffer_credits to h_total_credits
  jbd2: Reserve space for revoke descriptor blocks
  jbd2: Drop jbd2_space_needed()
  jbd2: Account descriptor blocks into t_outstanding_credits
  jbd2: Factor out common parts of stopping and restarting a handle
  ...

Documentation/filesystems/fscrypt.rst
fs/ext4/ext4.h
fs/ext4/inode.c
fs/ext4/super.c
fs/jbd2/transaction.c
fs/ocfs2/journal.c
include/linux/jbd2.h

@@@ -256,8 -256,13 +256,8 @@@ alternative master keys or to support r
  the master keys may be wrapped in userspace, e.g. as is done by the
  `fscrypt <https://github.com/google/fscrypt>`_ tool.
  
 -Including the inode number in the IVs was considered.  However, it was
 -rejected as it would have prevented ext4 filesystems from being
 -resized, and by itself still wouldn't have been sufficient to prevent
 -the same key from being directly reused for both XTS and CTS-CBC.
 -
 -DIRECT_KEY and per-mode keys
 -----------------------------
 +DIRECT_KEY policies
 +-------------------
  
  The Adiantum encryption mode (see `Encryption modes and usage`_) is
  suitable for both contents and filenames encryption, and it accepts
@@@ -280,21 -285,6 +280,21 @@@ IV.  Moreover
    key derived using the KDF.  Users may use the same master key for
    other v2 encryption policies.
  
 +IV_INO_LBLK_64 policies
 +-----------------------
 +
 +When FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 is set in the fscrypt policy,
 +the encryption keys are derived from the master key, encryption mode
 +number, and filesystem UUID.  This normally results in all files
 +protected by the same master key sharing a single contents encryption
 +key and a single filenames encryption key.  To still encrypt different
 +files' data differently, inode numbers are included in the IVs.
 +Consequently, shrinking the filesystem may not be allowed.
 +
 +This format is optimized for use with inline encryption hardware
 +compliant with the UFS or eMMC standards, which support only 64 IV
 +bits per I/O request and may have only a small number of keyslots.
 +
  Key identifiers
  ---------------
  
@@@ -318,9 -308,8 +318,9 @@@ If unsure, you should use the (AES-256-
  
  AES-128-CBC was added only for low-powered embedded devices with
  crypto accelerators such as CAAM or CESA that do not support XTS.  To
 -use AES-128-CBC, CONFIG_CRYPTO_SHA256 (or another SHA-256
 -implementation) must be enabled so that ESSIV can be used.
 +use AES-128-CBC, CONFIG_CRYPTO_ESSIV and CONFIG_CRYPTO_SHA256 (or
 +another SHA-256 implementation) must be enabled so that ESSIV can be
 +used.
  
  Adiantum is a (primarily) stream cipher-based mode that is fast even
  on CPUs without dedicated crypto instructions.  It's also a true
@@@ -342,8 -331,8 +342,8 @@@ Contents encryptio
  -------------------
  
  For file contents, each filesystem block is encrypted independently.
- Currently, only the case where the filesystem block size is equal to
- the system's page size (usually 4096 bytes) is supported.
+ Starting from Linux kernel 5.5, encryption of filesystems with block
+ size less than system's page size is supported.
  
  Each block's IV is set to the logical block number within the file as
  a little endian number, except that:
    is encrypted with AES-256 where the AES-256 key is the SHA-256 hash
    of the file's data encryption key.
  
 -- In the "direct key" configuration (FSCRYPT_POLICY_FLAG_DIRECT_KEY
 -  set in the fscrypt_policy), the file's nonce is also appended to the
 -  IV.  Currently this is only allowed with the Adiantum encryption
 -  mode.
 +- With `DIRECT_KEY policies`_, the file's nonce is appended to the IV.
 +  Currently this is only allowed with the Adiantum encryption mode.
 +
 +- With `IV_INO_LBLK_64 policies`_, the logical block number is limited
 +  to 32 bits and is placed in bits 0-31 of the IV.  The inode number
 +  (which is also limited to 32 bits) is placed in bits 32-63.
 +
 +Note that because file logical block numbers are included in the IVs,
 +filesystems must enforce that blocks are never shifted around within
 +encrypted files, e.g. via "collapse range" or "insert range".
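As an illustration (not from this patch set), a minimal sketch of the
IV_INO_LBLK_64 IV layout described above; the helper name is
hypothetical:

    #include <stdint.h>

    /*
     * Hypothetical helper showing the IV_INO_LBLK_64 layout: logical
     * block number in bits 0-31, inode number in bits 32-63.  Both
     * values must fit in 32 bits for this policy to be usable.
     */
    static inline uint64_t iv_ino_lblk_64_pack(uint32_t ino, uint32_t lblk)
    {
            return ((uint64_t)ino << 32) | lblk;
    }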
  
  Filenames encryption
  --------------------
@@@ -371,10 -354,10 +371,10 @@@ the requirements to retain support for 
  filenames of up to 255 bytes, the same IV is used for every filename
  in a directory.
  
 -However, each encrypted directory still uses a unique key; or
 -alternatively (for the "direct key" configuration) has the file's
 -nonce included in the IVs.  Thus, IV reuse is limited to within a
 -single directory.
 +However, each encrypted directory still uses a unique key, or
 +alternatively has the file's nonce (for `DIRECT_KEY policies`_) or
 +inode number (for `IV_INO_LBLK_64 policies`_) included in the IVs.
 +Thus, IV reuse is limited to within a single directory.
  
  With CTS-CBC, the IV reuse means that when the plaintext filenames
  share a common prefix at least as long as the cipher block size (16
@@@ -448,15 -431,12 +448,15 @@@ This structure must be initialized as f
    (1) for ``contents_encryption_mode`` and FSCRYPT_MODE_AES_256_CTS
    (4) for ``filenames_encryption_mode``.
  
 -- ``flags`` must contain a value from ``<linux/fscrypt.h>`` which
 -  identifies the amount of NUL-padding to use when encrypting
 -  filenames.  If unsure, use FSCRYPT_POLICY_FLAGS_PAD_32 (0x3).
 -  Additionally, if the encryption modes are both
 -  FSCRYPT_MODE_ADIANTUM, this can contain
 -  FSCRYPT_POLICY_FLAG_DIRECT_KEY; see `DIRECT_KEY and per-mode keys`_.
 +- ``flags`` contains optional flags from ``<linux/fscrypt.h>``:
 +
 +  - FSCRYPT_POLICY_FLAGS_PAD_*: The amount of NUL padding to use when
 +    encrypting filenames.  If unsure, use FSCRYPT_POLICY_FLAGS_PAD_32
 +    (0x3).
 +  - FSCRYPT_POLICY_FLAG_DIRECT_KEY: See `DIRECT_KEY policies`_.
 +  - FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64: See `IV_INO_LBLK_64
 +    policies`_.  This is mutually exclusive with DIRECT_KEY and is not
 +    supported on v1 policies.
  
  - For v2 encryption policies, ``__reserved`` must be zeroed.
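As a usage sketch (not part of this diff), the policy struct and flags
above can be applied from userspace roughly as follows, assuming the
16-byte key identifier was already obtained via
FS_IOC_ADD_ENCRYPTION_KEY:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/fscrypt.h>

    /*
     * Sketch: apply a v2 policy with the IV_INO_LBLK_64 flag to an
     * (empty) directory.  All constants come from <linux/fscrypt.h>.
     */
    static int set_iv_ino_lblk_64_policy(int dirfd, const __u8 *key_id)
    {
            struct fscrypt_policy_v2 policy;

            memset(&policy, 0, sizeof(policy));  /* zeroes __reserved too */
            policy.version = FSCRYPT_POLICY_V2;
            policy.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
            policy.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
            policy.flags = FSCRYPT_POLICY_FLAGS_PAD_32 |
                           FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64;
            memcpy(policy.master_key_identifier, key_id,
                   FSCRYPT_KEY_IDENTIFIER_SIZE);

            return ioctl(dirfd, FS_IOC_SET_ENCRYPTION_POLICY, &policy);
    }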
  
@@@ -1109,7 -1089,7 +1109,7 @@@ policy structs (see `Setting an encrypt
  context structs also contain a nonce.  The nonce is randomly generated
  by the kernel and is used as KDF input or as a tweak to cause
  different files to be encrypted differently; see `Per-file keys`_ and
 -`DIRECT_KEY and per-mode keys`_.
 +`DIRECT_KEY policies`_.
  
  Data path changes
  -----------------
diff --combined fs/ext4/ext4.h
@@@ -198,6 -198,12 +198,12 @@@ struct ext4_system_blocks 
   */
  #define       EXT4_IO_END_UNWRITTEN   0x0001
  
+ struct ext4_io_end_vec {
+       struct list_head list;          /* list of io_end_vec */
+       loff_t offset;                  /* offset in the file */
+       ssize_t size;                   /* size of the extent */
+ };
  /*
   * For converting unwritten extents on a work queue. 'handle' is used for
   * buffered writeback.
@@@ -211,8 -217,7 +217,7 @@@ typedef struct ext4_io_end 
                                                 * bios covering the extent */
        unsigned int            flag;           /* unwritten or not */
        atomic_t                count;          /* reference counter */
-       loff_t                  offset;         /* offset in the file */
-       ssize_t                 size;           /* size of the extent */
+       struct list_head        list_vec;       /* list of ext4_io_end_vec */
  } ext4_io_end_t;
  
  struct ext4_io_submit {
@@@ -1579,7 -1584,6 +1584,6 @@@ enum 
        EXT4_STATE_NO_EXPAND,           /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE,      /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
-       EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
        EXT4_STATE_NEWENTRY,            /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
@@@ -1678,7 -1682,6 +1682,7 @@@ static inline bool ext4_verity_in_progr
  #define EXT4_FEATURE_COMPAT_RESIZE_INODE      0x0010
  #define EXT4_FEATURE_COMPAT_DIR_INDEX         0x0020
  #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2     0x0200
 +#define EXT4_FEATURE_COMPAT_STABLE_INODES     0x0800
  
  #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER   0x0001
  #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE     0x0002
@@@ -1780,7 -1783,6 +1784,7 @@@ EXT4_FEATURE_COMPAT_FUNCS(xattr,                EXT_A
  EXT4_FEATURE_COMPAT_FUNCS(resize_inode,               RESIZE_INODE)
  EXT4_FEATURE_COMPAT_FUNCS(dir_index,          DIR_INDEX)
  EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,      SPARSE_SUPER2)
 +EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,      STABLE_INODES)
  
  EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,    SPARSE_SUPER)
  EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,      LARGE_FILE)
@@@ -2562,8 -2564,6 +2566,6 @@@ int ext4_get_block_unwritten(struct ino
                             struct buffer_head *bh_result, int create);
  int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
- int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh_result, int create);
  int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
  int ext4_walk_page_buffers(handle_t *handle,
@@@ -2606,7 -2606,6 +2608,6 @@@ extern int ext4_can_truncate(struct ino
  extern int ext4_truncate(struct inode *);
  extern int ext4_break_layouts(struct inode *);
  extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
- extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
  extern void ext4_set_inode_flags(struct inode *);
  extern int ext4_alloc_da_blocks(struct inode *inode);
  extern void ext4_set_aops(struct inode *inode);
@@@ -3266,6 -3265,8 +3267,8 @@@ extern long ext4_fallocate(struct file 
                          loff_t len);
  extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
+ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+                                            ext4_io_end_t *io_end);
  extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
  extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@@ -3298,6 -3299,10 +3301,10 @@@ extern int ext4_swap_extents(handle_t *
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
  extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+ extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+                                      int check_cred, int restart_cred,
+                                      int revoke_cred);
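For illustration only (not from this series), a hypothetical caller of
ext4_datasem_ensure_credits() as declared above; the credit value 7 and
the use of EXT4_FREE_BLOCKS_METADATA are arbitrary choices for the
sketch:

    /*
     * Hypothetical sketch: free 'count' metadata blocks one at a time,
     * topping the handle up before each free.  The final argument
     * reserves one revoke record per freed block, matching the
     * revoke-credit reservation scheme added in this series.
     */
    static int example_free_range(handle_t *handle, struct inode *inode,
                                  ext4_fsblk_t first, unsigned int count)
    {
            unsigned int i;
            int err;

            for (i = 0; i < count; i++) {
                    err = ext4_datasem_ensure_credits(handle, inode,
                                                      7, 7, 1);
                    if (err < 0)
                            return err;
                    ext4_free_blocks(handle, inode, NULL, first + i, 1,
                                     EXT4_FREE_BLOCKS_METADATA);
            }
            return 0;
    }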
  
  /* move_extent.c */
  extern void ext4_double_down_write_data_sem(struct inode *first,
@@@ -3324,6 -3329,8 +3331,8 @@@ extern int ext4_bio_write_page(struct e
                               int len,
                               struct writeback_control *wbc,
                               bool keep_towrite);
+ extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+ extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
  
  /* mmp.c */
  extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@@ -3381,6 -3388,7 +3390,7 @@@ static inline void ext4_clear_io_unwrit
  }
  
  extern const struct iomap_ops ext4_iomap_ops;
+ extern const struct iomap_ops ext4_iomap_report_ops;
  
  static inline int ext4_buffer_uptodate(struct buffer_head *bh)
  {
diff --combined fs/ext4/inode.c
@@@ -164,39 -164,18 +164,18 @@@ int ext4_inode_is_fast_symlink(struct i
  }
  
  /*
-  * Restart the transaction associated with *handle.  This does a commit,
-  * so before we call here everything must be consistently dirtied against
-  * this transaction.
-  */
- int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
-                                int nblocks)
- {
-       int ret;
-       /*
-        * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
-        * moment, get_block can be called only for blocks inside i_size since
-        * page cache has been already dropped and writes are blocked by
-        * i_mutex. So we can safely drop the i_data_sem here.
-        */
-       BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       jbd_debug(2, "restarting handle %p\n", handle);
-       up_write(&EXT4_I(inode)->i_data_sem);
-       ret = ext4_journal_restart(handle, nblocks);
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-       return ret;
- }
- /*
   * Called at the last iput() if i_nlink is zero.
   */
  void ext4_evict_inode(struct inode *inode)
  {
        handle_t *handle;
        int err;
-       int extra_credits = 3;
+       /*
+        * Credits for final inode cleanup and freeing:
+        * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+        * (xattr block freeing), bitmap, group descriptor (inode freeing)
+        */
+       int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
  
        trace_ext4_evict_inode(inode);
        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
  
+       /*
+        * Block bitmap, group descriptor, and inode are accounted in both
+        * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+        */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
-                                ext4_blocks_for_truncate(inode)+extra_credits);
+                        ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
@@@ -827,136 -810,6 +810,6 @@@ int ext4_get_block_unwritten(struct ino
  #define DIO_MAX_BLOCKS 4096
  
  /*
-  * Get blocks function for the cases that need to start a transaction -
-  * generally difference cases of direct IO and DAX IO. It also handles retries
-  * in case of ENOSPC.
-  */
- static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-                               struct buffer_head *bh_result, int flags)
- {
-       int dio_credits;
-       handle_t *handle;
-       int retries = 0;
-       int ret;
-       /* Trim mapping request to maximum we can map at once for DIO */
-       if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-               bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-       dio_credits = ext4_chunk_trans_blocks(inode,
-                                     bh_result->b_size >> inode->i_blkbits);
- retry:
-       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
-       ret = _ext4_get_block(inode, iblock, bh_result, flags);
-       ext4_journal_stop(handle);
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-       return ret;
- }
- /* Get block function for DIO reads and writes to inodes without extents */
- int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh, int create)
- {
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       if (!create)
-               return _ext4_get_block(inode, iblock, bh, 0);
-       return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
- }
- /*
-  * Get block function for AIO DIO writes when we create unwritten extent if
-  * blocks are not allocated yet. The extent will be converted to written
-  * after IO is complete.
-  */
- static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
- {
-       int ret;
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-       /*
-        * When doing DIO using unwritten extents, we need io_end to convert
-        * unwritten extents to written on IO completion. We allocate io_end
-        * once we spot unwritten extent and store it in b_private. Generic
-        * DIO code keeps b_private set and furthermore passes the value to
-        * our completion callback in 'private' argument.
-        */
-       if (!ret && buffer_unwritten(bh_result)) {
-               if (!bh_result->b_private) {
-                       ext4_io_end_t *io_end;
-                       io_end = ext4_init_io_end(inode, GFP_KERNEL);
-                       if (!io_end)
-                               return -ENOMEM;
-                       bh_result->b_private = io_end;
-                       ext4_set_io_unwritten_flag(inode, io_end);
-               }
-               set_buffer_defer_completion(bh_result);
-       }
-       return ret;
- }
- /*
-  * Get block function for non-AIO DIO writes when we create unwritten extent if
-  * blocks are not allocated yet. The extent will be converted to written
-  * after IO is complete by ext4_direct_IO_write().
-  */
- static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
- {
-       int ret;
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-       /*
-        * Mark inode as having pending DIO writes to unwritten extents.
-        * ext4_direct_IO_write() checks this flag and converts extents to
-        * written.
-        */
-       if (!ret && buffer_unwritten(bh_result))
-               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-       return ret;
- }
- static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create)
- {
-       int ret;
-       ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-       ret = _ext4_get_block(inode, iblock, bh_result, 0);
-       /*
-        * Blocks should have been preallocated! ext4_file_write_iter() checks
-        * that.
-        */
-       WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-       return ret;
- }
- /*
   * `handle' can be NULL if create is zero
   */
  struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@@ -2341,6 -2194,79 +2194,79 @@@ static int mpage_process_page_bufs(stru
  }
  
  /*
+  * mpage_process_page - update page buffers corresponding to changed extent and
+  *                   may submit fully mapped page for IO
+  *
+  * @mpd               - description of extent to map, on return next extent to map
+  * @m_lblk    - logical block mapping.
+  * @m_pblk    - corresponding physical mapping.
+  * @map_bh    - determines on return whether this page requires any further
+  *              mapping or not.
+  * Scan given page buffers corresponding to changed extent and update buffer
+  * state according to new extent state.
+  * We map delalloc buffers to their physical location, clear unwritten bits.
+  * If the given page is not fully mapped, we update @map to the next extent in
+  * the given page that needs mapping & return @map_bh as true.
+  */
+ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+                             ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+                             bool *map_bh)
+ {
+       struct buffer_head *head, *bh;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       ext4_lblk_t lblk = *m_lblk;
+       ext4_fsblk_t pblock = *m_pblk;
+       int err = 0;
+       int blkbits = mpd->inode->i_blkbits;
+       ssize_t io_end_size = 0;
+       struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+       bh = head = page_buffers(page);
+       do {
+               if (lblk < mpd->map.m_lblk)
+                       continue;
+               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                       /*
+                        * Buffer after end of mapped extent.
+                        * Find next buffer in the page to map.
+                        */
+                       mpd->map.m_len = 0;
+                       mpd->map.m_flags = 0;
+                       io_end_vec->size += io_end_size;
+                       io_end_size = 0;
+                       err = mpage_process_page_bufs(mpd, head, bh, lblk);
+                       if (err > 0)
+                               err = 0;
+                       if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+                               io_end_vec = ext4_alloc_io_end_vec(io_end);
+                               if (IS_ERR(io_end_vec)) {
+                                       err = PTR_ERR(io_end_vec);
+                                       goto out;
+                               }
+                               io_end_vec->offset = mpd->map.m_lblk << blkbits;
+                       }
+                       *map_bh = true;
+                       goto out;
+               }
+               if (buffer_delay(bh)) {
+                       clear_buffer_delay(bh);
+                       bh->b_blocknr = pblock++;
+               }
+               clear_buffer_unwritten(bh);
+               io_end_size += (1 << blkbits);
+       } while (lblk++, (bh = bh->b_this_page) != head);
+       io_end_vec->size += io_end_size;
+       io_end_size = 0;
+       *map_bh = false;
+ out:
+       *m_lblk = lblk;
+       *m_pblk = pblock;
+       return err;
+ }
+ /*
   * mpage_map_buffers - update buffers corresponding to changed extent and
   *                   submit fully mapped pages for IO
   *
@@@ -2359,12 -2285,12 +2285,12 @@@ static int mpage_map_and_submit_buffers
        struct pagevec pvec;
        int nr_pages, i;
        struct inode *inode = mpd->inode;
-       struct buffer_head *head, *bh;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
-       sector_t pblock;
+       ext4_fsblk_t pblock;
        int err;
+       bool map_bh = false;
  
        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
  
-                       bh = head = page_buffers(page);
-                       do {
-                               if (lblk < mpd->map.m_lblk)
-                                       continue;
-                               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-                                       /*
-                                        * Buffer after end of mapped extent.
-                                        * Find next buffer in the page to map.
-                                        */
-                                       mpd->map.m_len = 0;
-                                       mpd->map.m_flags = 0;
-                                       /*
-                                        * FIXME: If dioread_nolock supports
-                                        * blocksize < pagesize, we need to make
-                                        * sure we add size mapped so far to
-                                        * io_end->size as the following call
-                                        * can submit the page for IO.
-                                        */
-                                       err = mpage_process_page_bufs(mpd, head,
-                                                                     bh, lblk);
-                                       pagevec_release(&pvec);
-                                       if (err > 0)
-                                               err = 0;
-                                       return err;
-                               }
-                               if (buffer_delay(bh)) {
-                                       clear_buffer_delay(bh);
-                                       bh->b_blocknr = pblock++;
-                               }
-                               clear_buffer_unwritten(bh);
-                       } while (lblk++, (bh = bh->b_this_page) != head);
+                       err = mpage_process_page(mpd, page, &lblk, &pblock,
+                                                &map_bh);
                        /*
-                        * FIXME: This is going to break if dioread_nolock
-                        * supports blocksize < pagesize as we will try to
-                        * convert potentially unmapped parts of inode.
+                        * If map_bh is true, means page may require further bh
+                        * mapping, or maybe the page was submitted for IO.
+                        * So we return to call further extent mapping.
                         */
-                       mpd->io_submit.io_end->size += PAGE_SIZE;
+                       if (err < 0 || map_bh == true)
+                               goto out;
                        /* Page fully mapped - let IO run! */
                        err = mpage_submit_page(mpd, page);
-                       if (err < 0) {
-                               pagevec_release(&pvec);
-                               return err;
-                       }
+                       if (err < 0)
+                               goto out;
                }
                pagevec_release(&pvec);
        }
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
+ out:
+       pagevec_release(&pvec);
+       return err;
  }
  
  static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@@ -2510,9 -2408,13 +2408,13 @@@ static int mpage_map_and_submit_extent(
        int err;
        loff_t disksize;
        int progress = 0;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       struct ext4_io_end_vec *io_end_vec;
  
-       mpd->io_submit.io_end->offset =
-                               ((loff_t)map->m_lblk) << inode->i_blkbits;
+       io_end_vec = ext4_alloc_io_end_vec(io_end);
+       if (IS_ERR(io_end_vec))
+               return PTR_ERR(io_end_vec);
+       io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
@@@ -3406,473 -3308,235 +3308,235 @@@ static bool ext4_inode_datasync_dirty(s
        return inode->i_state & I_DIRTY_DATASYNC;
  }
  
- static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+                          struct ext4_map_blocks *map, loff_t offset,
+                          loff_t length)
  {
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned int blkbits = inode->i_blkbits;
-       unsigned long first_block, last_block;
-       struct ext4_map_blocks map;
-       bool delalloc = false;
-       int ret;
-       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
-               return -EINVAL;
-       first_block = offset >> blkbits;
-       last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
-                          EXT4_MAX_LOGICAL_BLOCK);
-       if (flags & IOMAP_REPORT) {
-               if (ext4_has_inline_data(inode)) {
-                       ret = ext4_inline_data_iomap(inode, iomap);
-                       if (ret != -EAGAIN) {
-                               if (ret == 0 && offset >= iomap->length)
-                                       ret = -ENOENT;
-                               return ret;
-                       }
-               }
-       } else {
-               if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
-                       return -ERANGE;
-       }
-       map.m_lblk = first_block;
-       map.m_len = last_block - first_block + 1;
-       if (flags & IOMAP_REPORT) {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-               if (ret == 0) {
-                       ext4_lblk_t end = map.m_lblk + map.m_len - 1;
-                       struct extent_status es;
-                       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
-                                                 map.m_lblk, end, &es);
-                       if (!es.es_len || es.es_lblk > end) {
-                               /* entire range is a hole */
-                       } else if (es.es_lblk > map.m_lblk) {
-                               /* range starts with a hole */
-                               map.m_len = es.es_lblk - map.m_lblk;
-                       } else {
-                               ext4_lblk_t offs = 0;
-                               if (es.es_lblk < map.m_lblk)
-                                       offs = map.m_lblk - es.es_lblk;
-                               map.m_lblk = es.es_lblk + offs;
-                               map.m_len = es.es_len - offs;
-                               delalloc = true;
-                       }
-               }
-       } else if (flags & IOMAP_WRITE) {
-               int dio_credits;
-               handle_t *handle;
-               int retries = 0;
-               /* Trim mapping request to maximum we can map at once for DIO */
-               if (map.m_len > DIO_MAX_BLOCKS)
-                       map.m_len = DIO_MAX_BLOCKS;
-               dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- retry:
-               /*
-                * Either we allocate blocks and then we don't get unwritten
-                * extent so we have reserved enough credits, or the blocks
-                * are already allocated and unwritten and in that case
-                * extent conversion fits in the credits as well.
-                */
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           dio_credits);
-               if (IS_ERR(handle))
-                       return PTR_ERR(handle);
-               ret = ext4_map_blocks(handle, inode, &map,
-                                     EXT4_GET_BLOCKS_CREATE_ZERO);
-               if (ret < 0) {
-                       ext4_journal_stop(handle);
-                       if (ret == -ENOSPC &&
-                           ext4_should_retry_alloc(inode->i_sb, &retries))
-                               goto retry;
-                       return ret;
-               }
-               /*
-                * If we added blocks beyond i_size, we need to make sure they
-                * will get truncated if we crash before updating i_size in
-                * ext4_iomap_end(). For faults we don't need to do that (and
-                * even cannot because for orphan list operations inode_lock is
-                * required) - if we happen to instantiate block beyond i_size,
-                * it is because we race with truncate which has already added
-                * the inode to the orphan list.
-                */
-               if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
-                   (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
-                       int err;
-                       err = ext4_orphan_add(handle, inode);
-                       if (err < 0) {
-                               ext4_journal_stop(handle);
-                               return err;
-                       }
-               }
-               ext4_journal_stop(handle);
-       } else {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-       }
+       u8 blkbits = inode->i_blkbits;
  
+       /*
+        * Writes that span EOF might trigger an I/O size update on completion,
+        * so consider them to be dirty for the purpose of O_DSYNC, even if
+        * there is no other metadata changes being made or are pending.
+        */
        iomap->flags = 0;
-       if (ext4_inode_datasync_dirty(inode))
+       if (ext4_inode_datasync_dirty(inode) ||
+           offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;
+       if (map->m_flags & EXT4_MAP_NEW)
+               iomap->flags |= IOMAP_F_NEW;
        iomap->bdev = inode->i_sb->s_bdev;
-       iomap->dax_dev = sbi->s_daxdev;
-       iomap->offset = (u64)first_block << blkbits;
-       iomap->length = (u64)map.m_len << blkbits;
+       iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+       iomap->offset = (u64) map->m_lblk << blkbits;
+       iomap->length = (u64) map->m_len << blkbits;
  
-       if (ret == 0) {
-               iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
-               iomap->addr = IOMAP_NULL_ADDR;
+       /*
+        * Flags passed to ext4_map_blocks() for direct I/O writes can result
+        * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+        * set. In order for any allocated unwritten extents to be converted
+        * into written extents correctly within the ->end_io() handler, we
+        * need to ensure that the iomap->type is set appropriately. Hence, the
+        * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
+        * been set first.
+        */
+       if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+               iomap->type = IOMAP_UNWRITTEN;
+               iomap->addr = (u64) map->m_pblk << blkbits;
+       } else if (map->m_flags & EXT4_MAP_MAPPED) {
+               iomap->type = IOMAP_MAPPED;
+               iomap->addr = (u64) map->m_pblk << blkbits;
        } else {
-               if (map.m_flags & EXT4_MAP_MAPPED) {
-                       iomap->type = IOMAP_MAPPED;
-               } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       iomap->type = IOMAP_UNWRITTEN;
-               } else {
-                       WARN_ON_ONCE(1);
-                       return -EIO;
-               }
-               iomap->addr = (u64)map.m_pblk << blkbits;
+               iomap->type = IOMAP_HOLE;
+               iomap->addr = IOMAP_NULL_ADDR;
        }
-       if (map.m_flags & EXT4_MAP_NEW)
-               iomap->flags |= IOMAP_F_NEW;
-       return 0;
  }
  
- static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
-                         ssize_t written, unsigned flags, struct iomap *iomap)
+ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+                           unsigned int flags)
  {
-       int ret = 0;
        handle_t *handle;
-       int blkbits = inode->i_blkbits;
-       bool truncate = false;
+       u8 blkbits = inode->i_blkbits;
+       int ret, dio_credits, m_flags = 0, retries = 0;
  
-       if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-               return 0;
-       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto orphan_del;
-       }
-       if (ext4_update_inode_size(inode, offset + written))
-               ext4_mark_inode_dirty(handle, inode);
        /*
-        * We may need to truncate allocated but not written blocks beyond EOF.
+        * Trim the mapping request to the maximum value that we can map at
+        * once for direct I/O.
         */
-       if (iomap->offset + iomap->length > 
-           ALIGN(inode->i_size, 1 << blkbits)) {
-               ext4_lblk_t written_blk, end_blk;
+       if (map->m_len > DIO_MAX_BLOCKS)
+               map->m_len = DIO_MAX_BLOCKS;
+       dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
  
-               written_blk = (offset + written) >> blkbits;
-               end_blk = (offset + length) >> blkbits;
-               if (written_blk < end_blk && ext4_can_truncate(inode))
-                       truncate = true;
-       }
+ retry:
        /*
-        * Remove inode from orphan list if we were extending a inode and
-        * everything went fine.
+        * Either we allocate blocks and then don't get an unwritten extent, so
+        * in that case we have reserved enough credits. Or, the blocks are
+        * already allocated and unwritten. In that case, the extent conversion
+        * fits into the credits as well.
         */
-       if (!truncate && inode->i_nlink &&
-           !list_empty(&EXT4_I(inode)->i_orphan))
-               ext4_orphan_del(handle, inode);
-       ext4_journal_stop(handle);
-       if (truncate) {
-               ext4_truncate_failed_write(inode);
- orphan_del:
-               /*
-                * If truncate failed early the inode might still be on the
-                * orphan list; we need to make sure the inode is removed from
-                * the orphan list in that case.
-                */
-               if (inode->i_nlink)
-                       ext4_orphan_del(NULL, inode);
-       }
-       return ret;
- }
- const struct iomap_ops ext4_iomap_ops = {
-       .iomap_begin            = ext4_iomap_begin,
-       .iomap_end              = ext4_iomap_end,
- };
- static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-                           ssize_t size, void *private)
- {
-         ext4_io_end_t *io_end = private;
+       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
  
-       /* if not async direct IO just return */
-       if (!io_end)
-               return 0;
+       /*
+        * DAX and direct I/O are the only two operations that are currently
+        * supported with IOMAP_WRITE.
+        */
+       WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+       if (IS_DAX(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+       /*
+        * We use i_size instead of i_disksize here because delalloc writeback
+        * can complete at any point during the I/O and subsequently push the
+        * i_disksize out to i_size. This could be beyond where direct I/O is
+        * happening and thus expose allocated blocks to direct I/O reads.
+        */
+       else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE;
+       else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
  
-       ext_debug("ext4_end_io_dio(): io_end 0x%p "
-                 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-                 io_end, io_end->inode->i_ino, iocb, offset, size);
+       ret = ext4_map_blocks(handle, inode, map, m_flags);
  
        /*
-        * Error during AIO DIO. We cannot convert unwritten extents as the
-        * data was not written. Just clear the unwritten flag and drop io_end.
+        * We cannot fill holes in indirect tree based inodes as that could
+        * expose stale data in the case of a crash. Use the magic error code
+        * to fallback to buffered I/O.
         */
-       if (size <= 0) {
-               ext4_clear_io_unwritten_flag(io_end);
-               size = 0;
-       }
-       io_end->offset = offset;
-       io_end->size = size;
-       ext4_put_io_end(io_end);
+       if (!m_flags && !ret)
+               ret = -ENOTBLK;
  
-       return 0;
+       ext4_journal_stop(handle);
+       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
+       return ret;
  }
  
- /*
-  * Handling of direct IO writes.
-  *
-  * For ext4 extent files, ext4 will do direct-io write even to holes,
-  * preallocated extents, and those write extend the file, no need to
-  * fall back to buffered IO.
-  *
-  * For holes, we fallocate those blocks, mark them as unwritten
-  * If those blocks were preallocated, we mark sure they are split, but
-  * still keep the range to write as unwritten.
-  *
-  * The unwritten extents will be converted to written when DIO is completed.
-  * For async direct IO, since the IO may still pending when return, we
-  * set up an end_io call back function, which will do the conversion
-  * when async direct IO completed.
-  *
-  * If the O_DIRECT write will extend the file then add this inode to the
-  * orphan list.  So recovery will truncate it back to the original size
-  * if the machine crashes during the write.
-  *
-  */
- static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
  {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       ssize_t ret;
-       loff_t offset = iocb->ki_pos;
-       size_t count = iov_iter_count(iter);
-       int overwrite = 0;
-       get_block_t *get_block_func = NULL;
-       int dio_flags = 0;
-       loff_t final_size = offset + count;
-       int orphan = 0;
-       handle_t *handle;
+       int ret;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
  
-       if (final_size > inode->i_size || final_size > ei->i_disksize) {
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
-               ret = ext4_orphan_add(handle, inode);
-               if (ret) {
-                       ext4_journal_stop(handle);
-                       goto out;
-               }
-               orphan = 1;
-               ext4_update_i_disksize(inode, inode->i_size);
-               ext4_journal_stop(handle);
-       }
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
  
-       BUG_ON(iocb->private == NULL);
+       if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+               return -ERANGE;
  
        /*
-        * Make all waiters for direct IO properly wait also for extent
-        * conversion. This also disallows race between truncate() and
-        * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+        * Calculate the first and last logical blocks respectively.
         */
-       inode_dio_begin(inode);
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+       if (flags & IOMAP_WRITE)
+               ret = ext4_iomap_alloc(inode, &map, flags);
+       else
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+       if (ret < 0)
+               return ret;
  
-       /* If we do a overwrite dio, i_mutex locking can be released */
-       overwrite = *((int *)iocb->private);
+       ext4_set_iomap(inode, iomap, &map, offset, length);
  
-       if (overwrite)
-               inode_unlock(inode);
+       return 0;
+ }
  
+ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+                         ssize_t written, unsigned flags, struct iomap *iomap)
+ {
        /*
-        * For extent mapped files we could direct write to holes and fallocate.
-        *
-        * Allocated blocks to fill the hole are marked as unwritten to prevent
-        * parallel buffered read to expose the stale data before DIO complete
-        * the data IO.
-        *
-        * As to previously fallocated extents, ext4 get_block will just simply
-        * mark the buffer mapped but still keep the extents unwritten.
-        *
-        * For non AIO case, we will convert those unwritten extents to written
-        * after return back from blockdev_direct_IO. That way we save us from
-        * allocating io_end structure and also the overhead of offloading
-        * the extent convertion to a workqueue.
-        *
-        * For async DIO, the conversion needs to be deferred when the
-        * IO is completed. The ext4 end_io callback function will be
-        * called to take care of the conversion work.  Here for async
-        * case, we allocate an io_end structure to hook to the iocb.
+        * Check to see whether an error occurred while writing out the data to
+        * the allocated blocks. If so, return the magic error code so that we
+        * fallback to buffered I/O and attempt to complete the remainder of
+        * the I/O. Any blocks that may have been allocated in preparation for
+        * the direct I/O will be reused during buffered I/O.
         */
-       iocb->private = NULL;
-       if (overwrite)
-               get_block_func = ext4_dio_get_block_overwrite;
-       else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-                  round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-               get_block_func = ext4_dio_get_block;
-               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-       } else if (is_sync_kiocb(iocb)) {
-               get_block_func = ext4_dio_get_block_unwritten_sync;
-               dio_flags = DIO_LOCKING;
-       } else {
-               get_block_func = ext4_dio_get_block_unwritten_async;
-               dio_flags = DIO_LOCKING;
-       }
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-                                  get_block_func, ext4_end_io_dio, NULL,
-                                  dio_flags);
+       if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+               return -ENOTBLK;
  
-       if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-                                               EXT4_STATE_DIO_UNWRITTEN)) {
-               int err;
-               /*
-                * for non AIO case, since the IO is already
-                * completed, we could do the conversion right here
-                */
-               err = ext4_convert_unwritten_extents(NULL, inode,
-                                                    offset, ret);
-               if (err < 0)
-                       ret = err;
-               ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-       }
+       return 0;
+ }
  
-       inode_dio_end(inode);
-       /* take i_mutex locking again if we do a ovewrite dio */
-       if (overwrite)
-               inode_lock(inode);
+ const struct iomap_ops ext4_iomap_ops = {
+       .iomap_begin            = ext4_iomap_begin,
+       .iomap_end              = ext4_iomap_end,
+ };
  
-       if (ret < 0 && final_size > inode->i_size)
-               ext4_truncate_failed_write(inode);
+ static bool ext4_iomap_is_delalloc(struct inode *inode,
+                                  struct ext4_map_blocks *map)
+ {
+       struct extent_status es;
+       ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
  
-       /* Handle extending of i_size after direct IO write */
-       if (orphan) {
-               int err;
+       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+                                 map->m_lblk, end, &es);
  
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       /*
-                        * We wrote the data but cannot extend
-                        * i_size. Bail out. In async io case, we do
-                        * not return error here because we have
-                        * already submmitted the corresponding
-                        * bio. Returning error here makes the caller
-                        * think that this IO is done and failed
-                        * resulting in race with bio's completion
-                        * handler.
-                        */
-                       if (!ret)
-                               ret = PTR_ERR(handle);
-                       if (inode->i_nlink)
-                               ext4_orphan_del(NULL, inode);
+       if (!es.es_len || es.es_lblk > end)
+               return false;
  
-                       goto out;
-               }
-               if (inode->i_nlink)
-                       ext4_orphan_del(handle, inode);
-               if (ret > 0) {
-                       loff_t end = offset + ret;
-                       if (end > inode->i_size || end > ei->i_disksize) {
-                               ext4_update_i_disksize(inode, end);
-                               if (end > inode->i_size)
-                                       i_size_write(inode, end);
-                               /*
-                                * We're going to return a positive `ret'
-                                * here due to non-zero-length I/O, so there's
-                                * no way of reporting error returns from
-                                * ext4_mark_inode_dirty() to userspace.  So
-                                * ignore it.
-                                */
-                               ext4_mark_inode_dirty(handle, inode);
-                       }
-               }
-               err = ext4_journal_stop(handle);
-               if (ret == 0)
-                       ret = err;
+       if (es.es_lblk > map->m_lblk) {
+               map->m_len = es.es_lblk - map->m_lblk;
+               return false;
        }
- out:
-       return ret;
- }
  
- static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
- {
-       struct address_space *mapping = iocb->ki_filp->f_mapping;
-       struct inode *inode = mapping->host;
-       size_t count = iov_iter_count(iter);
-       ssize_t ret;
+       offset = map->m_lblk - es.es_lblk;
+       map->m_len = es.es_len - offset;
  
-       /*
-        * Shared inode_lock is enough for us - it protects against concurrent
-        * writes & truncates and since we take care of writing back page cache,
-        * we are protected against page writeback as well.
-        */
-       inode_lock_shared(inode);
-       ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-                                          iocb->ki_pos + count - 1);
-       if (ret)
-               goto out_unlock;
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-                                  iter, ext4_dio_get_block, NULL, NULL, 0);
- out_unlock:
-       inode_unlock_shared(inode);
-       return ret;
+       return true;
  }
  
- static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+                                  loff_t length, unsigned int flags,
+                                  struct iomap *iomap, struct iomap *srcmap)
  {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       size_t count = iov_iter_count(iter);
-       loff_t offset = iocb->ki_pos;
-       ssize_t ret;
+       int ret;
+       bool delalloc = false;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
  
- #ifdef CONFIG_FS_ENCRYPTION
-       if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
-               return 0;
- #endif
-       if (fsverity_active(inode))
-               return 0;
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
+       if (ext4_has_inline_data(inode)) {
+               ret = ext4_inline_data_iomap(inode, iomap);
+               if (ret != -EAGAIN) {
+                       if (ret == 0 && offset >= iomap->length)
+                               ret = -ENOENT;
+                       return ret;
+               }
+       }
  
        /*
-        * If we are doing data journalling we don't support O_DIRECT
+        * Calculate the first and last logical block respectively.
         */
-       if (ext4_should_journal_data(inode))
-               return 0;
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
  
-       /* Let buffer I/O handle the inline data case. */
-       if (ext4_has_inline_data(inode))
-               return 0;
+       ret = ext4_map_blocks(NULL, inode, &map, 0);
+       if (ret < 0)
+               return ret;
+       if (ret == 0)
+               delalloc = ext4_iomap_is_delalloc(inode, &map);
  
-       trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (iov_iter_rw(iter) == READ)
-               ret = ext4_direct_IO_read(iocb, iter);
-       else
-               ret = ext4_direct_IO_write(iocb, iter);
-       trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-       return ret;
+       ext4_set_iomap(inode, iomap, &map, offset, length);
+       if (delalloc && iomap->type == IOMAP_HOLE)
+               iomap->type = IOMAP_DELALLOC;
+       return 0;
  }
  
+ const struct iomap_ops ext4_iomap_report_ops = {
+       .iomap_begin = ext4_iomap_begin_report,
+ };
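Illustrative userspace view (not part of the diff): IOMAP_REPORT
requests served by ext4_iomap_begin_report() back interfaces such as
SEEK_HOLE/SEEK_DATA, with delalloc ranges reported as data:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            off_t data, hole;
            int fd;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;
            data = lseek(fd, 0, SEEK_DATA); /* start of first data region */
            hole = lseek(fd, 0, SEEK_HOLE); /* start of first hole, or EOF */
            printf("data at %lld, hole at %lld\n",
                   (long long)data, (long long)hole);
            close(fd);
            return 0;
    }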
  /*
   * Pages can be marked dirty completely asynchronously from ext4's journalling
   * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
@@@ -3910,7 -3574,7 +3574,7 @@@ static const struct address_space_opera
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
@@@ -3927,7 -3591,7 +3591,7 @@@ static const struct address_space_opera
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_journalled_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
  };
@@@ -3943,7 -3607,7 +3607,7 @@@ static const struct address_space_opera
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
@@@ -5450,11 -5114,15 +5114,15 @@@ static void ext4_wait_for_tail_page_com
  
        offset = inode->i_size & (PAGE_SIZE - 1);
        /*
-        * All buffers in the last page remain valid? Then there's nothing to
-        * do. We do the check mainly to optimize the common PAGE_SIZE ==
-        * blocksize case
+        * If the page is fully truncated, we don't need to wait for any commit
+        * (and we should not, as __ext4_journalled_invalidatepage() may strip
+        * all buffers from the page but keep the page dirty, which can then
+        * confuse e.g. concurrent ext4_writepage() seeing a dirty page without
+        * buffers). We also don't need to wait for any commit if all buffers in
+        * the page remain valid. This is most beneficial for the common case of
+        * blocksize == PAGE_SIZE.
         */
-       if (offset > PAGE_SIZE - i_blocksize(inode))
+       if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
                return;
        while (1) {
                page = find_lock_page(inode->i_mapping,
@@@ -5717,15 -5385,12 +5385,15 @@@ int ext4_getattr(const struct path *pat
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (flags & EXT4_NODUMP_FL)
                stat->attributes |= STATX_ATTR_NODUMP;
 +      if (flags & EXT4_VERITY_FL)
 +              stat->attributes |= STATX_ATTR_VERITY;
  
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_ENCRYPTED |
                                  STATX_ATTR_IMMUTABLE |
 -                                STATX_ATTR_NODUMP);
 +                                STATX_ATTR_NODUMP |
 +                                STATX_ATTR_VERITY);
  
        generic_fillattr(inode, stat);
        return 0;
@@@ -5915,8 -5580,23 +5583,23 @@@ static int __ext4_expand_extra_isize(st
  {
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
+       unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
        int error;
  
+       /* this was checked at iget time, but double-check for good measure */
+       if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+           (ei->i_extra_isize & 3)) {
+               EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+                                ei->i_extra_isize,
+                                EXT4_INODE_SIZE(inode->i_sb));
+               return -EFSCORRUPTED;
+       }
+       if ((new_extra_isize < ei->i_extra_isize) ||
+           (new_extra_isize < 4) ||
+           (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+               return -EINVAL; /* Should never happen */
        raw_inode = ext4_raw_inode(iloc);
  
        header = IHDR(inode, raw_inode);
@@@ -5968,9 -5648,8 +5651,8 @@@ static int ext4_try_to_expand_extra_isi
         * If this is felt to be critical, then e2fsck should be run to
         * force a large enough s_min_extra_isize.
         */
-       if (ext4_handle_valid(handle) &&
-           jbd2_journal_extend(handle,
-                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+       if (ext4_journal_extend(handle,
+                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
                return -ENOSPC;
  
        if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
diff --combined fs/ext4/super.c
@@@ -1172,9 -1172,9 +1172,9 @@@ void ext4_clear_inode(struct inode *ino
  {
        invalidate_inode_buffers(inode);
        clear_inode(inode);
-       dquot_drop(inode);
        ext4_discard_preallocations(inode);
        ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+       dquot_drop(inode);
        if (EXT4_I(inode)->jinode) {
                jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
                                               EXT4_I(inode)->jinode);
@@@ -1345,18 -1345,6 +1345,18 @@@ static bool ext4_dummy_context(struct i
        return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
  }
  
 +static bool ext4_has_stable_inodes(struct super_block *sb)
 +{
 +      return ext4_has_feature_stable_inodes(sb);
 +}
 +
 +static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
 +                                     int *ino_bits_ret, int *lblk_bits_ret)
 +{
 +      *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
 +      *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
 +}
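
fscrypt uses these two widths to decide whether inode numbers and logical block numbers both fit in 32 bits, so that the pair can share a 64-bit IV. A hedged sketch of that check (the caller shown is illustrative, not the fscrypt code itself):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* ext4 reports 8 * sizeof(__le32) == 32 for both quantities */
            int ino_bits  = 8 * (int)sizeof(uint32_t);
            int lblk_bits = 8 * (int)sizeof(uint32_t);

            if (ino_bits <= 32 && lblk_bits <= 32)
                    printf("64-bit (ino, lblk) IVs usable\n");
            return 0;
    }
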
 +
  static const struct fscrypt_operations ext4_cryptops = {
        .key_prefix             = "ext4:",
        .get_context            = ext4_get_context,
        .dummy_context          = ext4_dummy_context,
        .empty_dir              = ext4_empty_dir,
        .max_namelen            = EXT4_NAME_LEN,
 +      .has_stable_inodes      = ext4_has_stable_inodes,
 +      .get_ino_and_lblk_bits  = ext4_get_ino_and_lblk_bits,
  };
  #endif
  
@@@ -1388,7 -1374,6 +1388,6 @@@ static ssize_t ext4_quota_write(struct 
  static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
                             unsigned int flags);
  static int ext4_enable_quotas(struct super_block *sb);
- static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
  
  static struct dquot **ext4_get_dquots(struct inode *inode)
  {
@@@ -1406,7 -1391,7 +1405,7 @@@ static const struct dquot_operations ex
        .destroy_dquot          = dquot_destroy,
        .get_projid             = ext4_get_projid,
        .get_inode_usage        = ext4_get_inode_usage,
-       .get_next_id            = ext4_get_next_id,
+       .get_next_id            = dquot_get_next_id,
  };
  
  static const struct quotactl_ops ext4_qctl_operations = {
@@@ -2065,7 -2050,7 +2064,7 @@@ static int parse_options(char *options
                         unsigned int *journal_ioprio,
                         int is_remount)
  {
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
        char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
        substring_t args[MAX_OPT_ARGS];
        int token;
                }
        }
  #endif
-       if (test_opt(sb, DIOREAD_NOLOCK)) {
-               int blocksize =
-                       BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-               if (blocksize < PAGE_SIZE) {
-                       ext4_msg(sb, KERN_ERR, "can't mount with "
-                                "dioread_nolock if block size != PAGE_SIZE");
-                       return 0;
-               }
-       }
        return 1;
  }
  
@@@ -3569,12 -3544,15 +3558,15 @@@ static void ext4_clamp_want_extra_isize
  {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_super_block *es = sbi->s_es;
+       unsigned def_extra_isize = sizeof(struct ext4_inode) -
+                                               EXT4_GOOD_OLD_INODE_SIZE;
  
-       /* determine the minimum size of new large inodes, if present */
-       if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
-           sbi->s_want_extra_isize == 0) {
-               sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-                                                    EXT4_GOOD_OLD_INODE_SIZE;
+       if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+               sbi->s_want_extra_isize = 0;
+               return;
+       }
+       if (sbi->s_want_extra_isize < 4) {
+               sbi->s_want_extra_isize = def_extra_isize;
                if (ext4_has_feature_extra_isize(sb)) {
                        if (sbi->s_want_extra_isize <
                            le16_to_cpu(es->s_want_extra_isize))
                }
        }
        /* Check if enough inode space is available */
-       if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
-                                                       sbi->s_inode_size) {
-               sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-                                                      EXT4_GOOD_OLD_INODE_SIZE;
+       if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+           (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+                                                       sbi->s_inode_size)) {
+               sbi->s_want_extra_isize = def_extra_isize;
                ext4_msg(sb, KERN_INFO,
                         "required extra inode space not available");
        }
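
Taken together, the function now falls back to the default extra_isize whenever the candidate value cannot fit in the on-disk inode. A simplified standalone sketch of that decision, ignoring the superblock s_want_extra_isize hints and assuming 256-byte inodes and a 32-byte default (both assumptions for illustration):

    #include <stdio.h>

    #define GOOD_OLD_INODE_SIZE 128 /* EXT4_GOOD_OLD_INODE_SIZE */

    /* Returns the clamped want_extra_isize for a given inode size. */
    static int clamp_want_extra_isize(int inode_size, int want)
    {
            int def = 32; /* assumed sizeof(struct ext4_inode) - 128 */

            if (inode_size == GOOD_OLD_INODE_SIZE)
                    return 0;          /* no extra space at all */
            if (want < 4)
                    want = def;        /* too small to be valid */
            if (GOOD_OLD_INODE_SIZE + want > inode_size)
                    want = def;        /* does not fit: fall back */
            return want;
    }

    int main(void)
    {
            printf("%d\n", clamp_want_extra_isize(128, 64));  /* 0  */
            printf("%d\n", clamp_want_extra_isize(256, 200)); /* 32 */
            printf("%d\n", clamp_want_extra_isize(256, 32));  /* 32 */
            return 0;
    }
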
@@@ -4453,13 -4431,6 +4445,6 @@@ no_journal
                }
        }
  
-       if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
-           (blocksize != PAGE_SIZE)) {
-               ext4_msg(sb, KERN_ERR,
-                        "Unsupported blocksize for fs encryption");
-               goto failed_mount_wq;
-       }
        if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
                ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
                goto failed_mount_wq;
@@@ -6033,18 -6004,6 +6018,6 @@@ out
        }
        return len;
  }
- static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
- {
-       const struct quota_format_ops   *ops;
-       if (!sb_has_quota_loaded(sb, qid->type))
-               return -ESRCH;
-       ops = sb_dqopt(sb)->ops[qid->type];
-       if (!ops || !ops->get_next_id)
-               return -ENOSYS;
-       return dquot_get_next_id(sb, qid);
- }
  #endif
  
  static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
diff --combined fs/jbd2/transaction.c
@@@ -63,6 -63,28 +63,28 @@@ void jbd2_journal_free_transaction(tran
  }
  
  /*
+  * Base number of descriptor blocks we reserve for each transaction.
+  */
+ static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+ {
+       int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+       int tags_per_block;
+       /* Subtract UUID */
+       tag_space -= 16;
+       if (jbd2_journal_has_csum_v2or3(journal))
+               tag_space -= sizeof(struct jbd2_journal_block_tail);
+       /* Commit code leaves a slack space of 16 bytes at the end of the block */
+       tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+       /*
+        * Revoke descriptors are accounted separately, so we only need to
+        * reserve space for the commit block and the normal transaction
+        * descriptor blocks.
+        */
+       return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
+                               tags_per_block);
+ }
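
For illustration, the reservation above with concrete numbers; the sizes here (4096-byte journal blocks, a 12-byte journal_header_t, 8-byte tags, no csum tail) are assumptions for the sketch, not values asserted by this patch:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            int blocksize  = 4096; /* assumed journal block size */
            int header     = 12;   /* assumed sizeof(journal_header_t) */
            int tag_bytes  = 8;    /* assumed journal_tag_bytes() */
            int max_tx_buf = 8192; /* assumed j_max_transaction_buffers */

            int tag_space = blocksize - header - 16;           /* minus UUID */
            int tags_per_block = (tag_space - 16) / tag_bytes; /* 16B slack */

            /* one commit block plus enough descriptor blocks for the tags */
            printf("reserve %d blocks\n",
                   1 + DIV_ROUND_UP(max_tx_buf, tags_per_block));
            return 0;
    }

With these numbers that is 506 tags per descriptor block and 18 reserved blocks per transaction.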
+ /*
   * jbd2_get_transaction: obtain a new transaction_t object.
   *
   * Simply initialise a new transaction. Initialize it in
@@@ -88,7 -110,9 +110,9 @@@ static void jbd2_get_transaction(journa
        spin_lock_init(&transaction->t_handle_lock);
        atomic_set(&transaction->t_updates, 0);
        atomic_set(&transaction->t_outstanding_credits,
+                  jbd2_descriptor_blocks_per_trans(journal) +
                   atomic_read(&journal->j_reserved_credits));
+       atomic_set(&transaction->t_outstanding_revokes, 0);
        atomic_set(&transaction->t_handle_count, 0);
        INIT_LIST_HEAD(&transaction->t_inode_list);
        INIT_LIST_HEAD(&transaction->t_private_list);
@@@ -258,12 -282,13 +282,13 @@@ static int add_transaction_credits(jour
         * *before* starting to dirty potentially checkpointed buffers
         * in the new transaction.
         */
-       if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+       if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
                atomic_sub(total, &t->t_outstanding_credits);
                read_unlock(&journal->j_state_lock);
                jbd2_might_wait_for_commit(journal);
                write_lock(&journal->j_state_lock);
-               if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+               if (jbd2_log_space_left(journal) <
+                                       journal->j_max_transaction_buffers)
                        __jbd2_log_wait_for_space(journal);
                write_unlock(&journal->j_state_lock);
                return 1;
@@@ -299,12 -324,12 +324,12 @@@ static int start_this_handle(journal_t 
                             gfp_t gfp_mask)
  {
        transaction_t   *transaction, *new_transaction = NULL;
-       int             blocks = handle->h_buffer_credits;
+       int             blocks = handle->h_total_credits;
        int             rsv_blocks = 0;
        unsigned long ts = jiffies;
  
        if (handle->h_rsv_handle)
-               rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+               rsv_blocks = handle->h_rsv_handle->h_total_credits;
  
        /*
         * Limit the number of reserved credits to 1/2 of maximum transaction
@@@ -405,6 -430,7 +430,7 @@@ repeat
        update_t_max_wait(transaction, ts);
        handle->h_transaction = transaction;
        handle->h_requested_credits = blocks;
+       handle->h_revoke_credits_requested = handle->h_revoke_credits;
        handle->h_start_jiffies = jiffies;
        atomic_inc(&transaction->t_updates);
        atomic_inc(&transaction->t_handle_count);
@@@ -431,15 -457,15 +457,15 @@@ static handle_t *new_handle(int nblocks
        handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
        if (!handle)
                return NULL;
-       handle->h_buffer_credits = nblocks;
+       handle->h_total_credits = nblocks;
        handle->h_ref = 1;
  
        return handle;
  }
  
  handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
-                             gfp_t gfp_mask, unsigned int type,
-                             unsigned int line_no)
+                             int revoke_records, gfp_t gfp_mask,
+                             unsigned int type, unsigned int line_no)
  {
        handle_t *handle = journal_current_handle();
        int err;
                return handle;
        }
  
+       nblocks += DIV_ROUND_UP(revoke_records,
+                               journal->j_revoke_records_per_block);
        handle = new_handle(nblocks);
        if (!handle)
                return ERR_PTR(-ENOMEM);
                rsv_handle->h_journal = journal;
                handle->h_rsv_handle = rsv_handle;
        }
+       handle->h_revoke_credits = revoke_records;
  
        err = start_this_handle(journal, handle, gfp_mask);
        if (err < 0) {
@@@ -508,16 -537,21 +537,21 @@@ EXPORT_SYMBOL(jbd2__journal_start)
   */
  handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
  {
-       return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+       return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
  }
  EXPORT_SYMBOL(jbd2_journal_start);
  
- void jbd2_journal_free_reserved(handle_t *handle)
+ static void __jbd2_journal_unreserve_handle(handle_t *handle)
  {
        journal_t *journal = handle->h_journal;
  
        WARN_ON(!handle->h_reserved);
-       sub_reserved_credits(journal, handle->h_buffer_credits);
+       sub_reserved_credits(journal, handle->h_total_credits);
+ }
+ void jbd2_journal_free_reserved(handle_t *handle)
+ {
+       __jbd2_journal_unreserve_handle(handle);
        jbd2_free_handle(handle);
  }
  EXPORT_SYMBOL(jbd2_journal_free_reserved);
@@@ -571,7 -605,7 +605,7 @@@ int jbd2_journal_start_reserved(handle_
        handle->h_line_no = line_no;
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
-                               line_no, handle->h_buffer_credits);
+                               line_no, handle->h_total_credits);
        return 0;
  }
  EXPORT_SYMBOL(jbd2_journal_start_reserved);
   * int jbd2_journal_extend() - extend buffer credits.
   * @handle:  handle to 'extend'
   * @nblocks: nr blocks to try to extend by.
+  * @revoke_records: number of revoke records to try to extend by.
   *
   * Some transactions, such as large extends and truncates, can be done
   * atomically all at once or in several stages.  The operation requests
   * return code < 0 implies an error
   * return code > 0 implies normal transaction-full status.
   */
- int jbd2_journal_extend(handle_t *handle, int nblocks)
+ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
  {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
                goto error_out;
        }
  
+       nblocks += DIV_ROUND_UP(
+                       handle->h_revoke_credits_requested + revoke_records,
+                       journal->j_revoke_records_per_block) -
+               DIV_ROUND_UP(
+                       handle->h_revoke_credits_requested,
+                       journal->j_revoke_records_per_block);
        spin_lock(&transaction->t_handle_lock);
        wanted = atomic_add_return(nblocks,
                                   &transaction->t_outstanding_credits);
                goto unlock;
        }
  
-       if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
-           jbd2_log_space_left(journal)) {
-               jbd_debug(3, "denied handle %p %d blocks: "
-                         "insufficient log space\n", handle, nblocks);
-               atomic_sub(nblocks, &transaction->t_outstanding_credits);
-               goto unlock;
-       }
        trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
                                 transaction->t_tid,
                                 handle->h_type, handle->h_line_no,
-                                handle->h_buffer_credits,
+                                handle->h_total_credits,
                                 nblocks);
  
-       handle->h_buffer_credits += nblocks;
+       handle->h_total_credits += nblocks;
        handle->h_requested_credits += nblocks;
+       handle->h_revoke_credits += revoke_records;
+       handle->h_revoke_credits_requested += revoke_records;
        result = 0;
  
        jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@@ -655,11 -690,55 +690,55 @@@ error_out
        return result;
  }
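
The DIV_ROUND_UP difference above charges (or refunds) descriptor blocks only when the revoke-record total crosses a block boundary; stop_this_handle() below uses the same pattern in reverse. A standalone sketch of that delta, where the per-block value is an assumption for illustration:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* Extra descriptor blocks needed when growing the revoke reservation. */
    static int revoke_block_delta(int requested, int extra, int per_block)
    {
            return DIV_ROUND_UP(requested + extra, per_block) -
                   DIV_ROUND_UP(requested, per_block);
    }

    int main(void)
    {
            int per_block = 500; /* assumed j_revoke_records_per_block */

            /* 10 -> 20 records: still a single revoke descriptor block */
            printf("%d\n", revoke_block_delta(10, 10, per_block));  /* 0 */
            /* 490 -> 520 records: spills into a second block */
            printf("%d\n", revoke_block_delta(490, 30, per_block)); /* 1 */
            return 0;
    }
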
  
 -      rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+ static void stop_this_handle(handle_t *handle)
+ {
+       transaction_t *transaction = handle->h_transaction;
+       journal_t *journal = transaction->t_journal;
+       int revokes;
+       J_ASSERT(journal_current_handle() == handle);
+       J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+       current->journal_info = NULL;
+       /*
+        * Subtract necessary revoke descriptor blocks from handle credits. We
+        * take care to account only for revoke descriptor blocks the
+        * transaction will really need, as large sequences of transactions
+        * with small numbers of revokes are relatively common.
+        */
+       revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
+       if (revokes) {
+               int t_revokes, revoke_descriptors;
+               int rr_per_blk = journal->j_revoke_records_per_block;
+               WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
+                               > handle->h_total_credits);
+               t_revokes = atomic_add_return(revokes,
+                               &transaction->t_outstanding_revokes);
+               revoke_descriptors =
+                       DIV_ROUND_UP(t_revokes, rr_per_blk) -
+                       DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
+               handle->h_total_credits -= revoke_descriptors;
+       }
+       atomic_sub(handle->h_total_credits,
+                  &transaction->t_outstanding_credits);
+       if (handle->h_rsv_handle)
+               __jbd2_journal_unreserve_handle(handle->h_rsv_handle);
+       if (atomic_dec_and_test(&transaction->t_updates))
+               wake_up(&journal->j_wait_updates);
++      rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+       /*
+        * Scope of the GFP_NOFS context is over here and so we can restore the
+        * original alloc context.
+        */
+       memalloc_nofs_restore(handle->saved_alloc_context);
+ }
  
  /**
   * int jbd2_journal_restart() - restart a handle .
   * @handle:  handle to restart
   * @nblocks: nr credits requested
+  * @revoke_records: number of revoke record credits requested
   * @gfp_mask: memory allocation flags (for start_this_handle)
   *
   * Restart a handle for a multi-transaction filesystem
   * credits. We preserve reserved handle if there's any attached to the
   * passed in handle.
   */
- int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+                         gfp_t gfp_mask)
  {
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal;
        tid_t           tid;
-       int             need_to_start, ret;
+       int             need_to_start;
+       int             ret;
  
        /* If we've had an abort of any type, don't even think about
         * actually doing the restart! */
        if (is_handle_aborted(handle))
                return 0;
        journal = transaction->t_journal;
+       tid = transaction->t_tid;
  
        /*
         * First unlink the handle from its current transaction, and start the
         * commit on that.
         */
-       J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-       J_ASSERT(journal_current_handle() == handle);
-       read_lock(&journal->j_state_lock);
-       spin_lock(&transaction->t_handle_lock);
-       atomic_sub(handle->h_buffer_credits,
-                  &transaction->t_outstanding_credits);
-       if (handle->h_rsv_handle) {
-               sub_reserved_credits(journal,
-                                    handle->h_rsv_handle->h_buffer_credits);
-       }
-       if (atomic_dec_and_test(&transaction->t_updates))
-               wake_up(&journal->j_wait_updates);
-       tid = transaction->t_tid;
-       spin_unlock(&transaction->t_handle_lock);
+       jbd_debug(2, "restarting handle %p\n", handle);
+       stop_this_handle(handle);
        handle->h_transaction = NULL;
-       current->journal_info = NULL;
  
-       jbd_debug(2, "restarting handle %p\n", handle);
+       /*
+        * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+        * get rid of pointless j_state_lock traffic like this.
+        */
+       read_lock(&journal->j_state_lock);
        need_to_start = !tid_geq(journal->j_commit_request, tid);
        read_unlock(&journal->j_state_lock);
        if (need_to_start)
                jbd2_log_start_commit(journal, tid);
-       rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
-       handle->h_buffer_credits = nblocks;
-       /*
-        * Restore the original nofs context because the journal restart
-        * is basically the same thing as journal stop and start.
-        * start_this_handle will start a new nofs context.
-        */
-       memalloc_nofs_restore(handle->saved_alloc_context);
+       handle->h_total_credits = nblocks +
+               DIV_ROUND_UP(revoke_records,
+                            journal->j_revoke_records_per_block);
+       handle->h_revoke_credits = revoke_records;
        ret = start_this_handle(journal, handle, gfp_mask);
+       trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+                                ret ? 0 : handle->h_transaction->t_tid,
+                                handle->h_type, handle->h_line_no,
+                                handle->h_total_credits);
        return ret;
  }
  EXPORT_SYMBOL(jbd2__journal_restart);
  
  int jbd2_journal_restart(handle_t *handle, int nblocks)
  {
-       return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+       return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
  }
  EXPORT_SYMBOL(jbd2_journal_restart);
  
@@@ -879,7 -950,7 +950,7 @@@ repeat
  
        start_lock = jiffies;
        lock_buffer(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
  
        /* If it takes too long to lock the buffer, trace it */
        time_lock = jbd2_time_diff(start_lock, jiffies);
  
        error = -EROFS;
        if (is_handle_aborted(handle)) {
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                goto out;
        }
        error = 0;
         */
        if (buffer_shadow(bh)) {
                JBUFFER_TRACE(jh, "on shadow: sleep");
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }
                JBUFFER_TRACE(jh, "generate frozen data");
                if (!frozen_buffer) {
                        JBUFFER_TRACE(jh, "allocate memory for buffer");
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
                                                   GFP_NOFS | __GFP_NOFAIL);
                        goto repeat;
@@@ -1033,7 -1104,7 +1104,7 @@@ attach_next
        jh->b_next_transaction = transaction;
  
  done:
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  
        /*
         * If we are about to journal a buffer, then any revoke pending on it is
@@@ -1172,7 -1243,7 +1243,7 @@@ int jbd2_journal_get_create_access(hand
         * that case: the transaction must have deleted the buffer for it to be
         * reused here.
         */
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
                jh->b_transaction == NULL ||
                (jh->b_transaction == journal->j_committing_transaction &&
                jh->b_next_transaction = transaction;
                spin_unlock(&journal->j_list_lock);
        }
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  
        /*
         * akpm: I added this.  ext3_alloc_branch can pick up new indirect
@@@ -1275,13 -1346,13 +1346,13 @@@ repeat
                committed_data = jbd2_alloc(jh2bh(jh)->b_size,
                                            GFP_NOFS|__GFP_NOFAIL);
  
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        if (!jh->b_committed_data) {
                /* Copy out the current buffer contents into the
                 * preserved, committed copy. */
                JBUFFER_TRACE(jh, "generate b_committed data");
                if (!committed_data) {
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        goto repeat;
                }
  
                committed_data = NULL;
                memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
        }
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  out:
        jbd2_journal_put_journal_head(jh);
        if (unlikely(committed_data))
@@@ -1390,16 -1461,16 +1461,16 @@@ int jbd2_journal_dirty_metadata(handle_
         */
        if (jh->b_transaction != transaction &&
            jh->b_next_transaction != transaction) {
-               jbd_lock_bh_state(bh);
+               spin_lock(&jh->b_state_lock);
                J_ASSERT_JH(jh, jh->b_transaction == transaction ||
                                jh->b_next_transaction == transaction);
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
        }
        if (jh->b_modified == 1) {
                /* If it's in our transaction it must be in BJ_Metadata list. */
                if (jh->b_transaction == transaction &&
                    jh->b_jlist != BJ_Metadata) {
-                       jbd_lock_bh_state(bh);
+                       spin_lock(&jh->b_state_lock);
                        if (jh->b_transaction == transaction &&
                            jh->b_jlist != BJ_Metadata)
                                pr_err("JBD2: assertion failure: h_type=%u "
                                       jh->b_jlist);
                        J_ASSERT_JH(jh, jh->b_transaction != transaction ||
                                        jh->b_jlist == BJ_Metadata);
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                }
                goto out;
        }
  
        journal = transaction->t_journal;
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
  
        if (jh->b_modified == 0) {
                /*
                 * of the transaction. This needs to be done
                 * once a transaction -bzzz
                 */
-               if (handle->h_buffer_credits <= 0) {
+               if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
                        ret = -ENOSPC;
                        goto out_unlock_bh;
                }
                jh->b_modified = 1;
-               handle->h_buffer_credits--;
+               handle->h_total_credits--;
        }
  
        /*
        __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
        spin_unlock(&journal->j_list_lock);
  out_unlock_bh:
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
  out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
@@@ -1539,18 -1610,20 +1610,20 @@@ int jbd2_journal_forget (handle_t *hand
  
        BUFFER_TRACE(bh, "entry");
  
-       jbd_lock_bh_state(bh);
+       jh = jbd2_journal_grab_journal_head(bh);
+       if (!jh) {
+               __bforget(bh);
+               return 0;
+       }
  
-       if (!buffer_jbd(bh))
-               goto not_jbd;
-       jh = bh2jh(bh);
+       spin_lock(&jh->b_state_lock);
  
        /* Critical error: attempting to delete a bitmap buffer, maybe?
         * Don't do any jbd operations, and return an error. */
        if (!J_EXPECT_JH(jh, !jh->b_committed_data,
                         "inconsistent data on disk")) {
                err = -EIO;
-               goto not_jbd;
+               goto drop;
        }
  
        /* keep track of whether or not this transaction modified us */
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
                        __jbd2_journal_unfile_buffer(jh);
-                       if (!buffer_jbd(bh)) {
-                               spin_unlock(&journal->j_list_lock);
-                               goto not_jbd;
-                       }
+                       jbd2_journal_put_journal_head(jh);
                }
                spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction) {
                if (!jh->b_cp_transaction) {
                        JBUFFER_TRACE(jh, "belongs to none transaction");
                        spin_unlock(&journal->j_list_lock);
-                       goto not_jbd;
+                       goto drop;
                }
  
                /*
                if (!buffer_dirty(bh)) {
                        __jbd2_journal_remove_checkpoint(jh);
                        spin_unlock(&journal->j_list_lock);
-                       goto not_jbd;
+                       goto drop;
                }
  
                /*
                __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
                spin_unlock(&journal->j_list_lock);
        }
-       jbd_unlock_bh_state(bh);
-       __brelse(bh);
  drop:
+       __brelse(bh);
+       spin_unlock(&jh->b_state_lock);
+       jbd2_journal_put_journal_head(jh);
        if (drop_reserve) {
                /* no need to reserve log space for this block -bzzz */
-               handle->h_buffer_credits++;
+               handle->h_total_credits++;
        }
        return err;
- not_jbd:
-       jbd_unlock_bh_state(bh);
-       __bforget(bh);
-       goto drop;
  }
  
  /**
@@@ -1706,45 -1771,34 +1771,34 @@@ int jbd2_journal_stop(handle_t *handle
        tid_t tid;
        pid_t pid;
  
+       if (--handle->h_ref > 0) {
+               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+                                                handle->h_ref);
+               if (is_handle_aborted(handle))
+                       return -EIO;
+               return 0;
+       }
        if (!transaction) {
                /*
-                * Handle is already detached from the transaction so
-                * there is nothing to do other than decrease a refcount,
-                * or free the handle if refcount drops to zero
+                * Handle is already detached from the transaction so there is
+                * nothing to do other than free the handle.
                 */
-               if (--handle->h_ref > 0) {
-                       jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-                                                        handle->h_ref);
-                       return err;
-               } else {
-                       if (handle->h_rsv_handle)
-                               jbd2_free_handle(handle->h_rsv_handle);
-                       goto free_and_exit;
-               }
+               memalloc_nofs_restore(handle->saved_alloc_context);
+               goto free_and_exit;
        }
        journal = transaction->t_journal;
-       J_ASSERT(journal_current_handle() == handle);
+       tid = transaction->t_tid;
  
        if (is_handle_aborted(handle))
                err = -EIO;
-       else
-               J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-       if (--handle->h_ref > 0) {
-               jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
-                         handle->h_ref);
-               return err;
-       }
  
        jbd_debug(4, "Handle %p going down\n", handle);
        trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
-                               transaction->t_tid,
-                               handle->h_type, handle->h_line_no,
+                               tid, handle->h_type, handle->h_line_no,
                                jiffies - handle->h_start_jiffies,
                                handle->h_sync, handle->h_requested_credits,
                                (handle->h_requested_credits -
-                                handle->h_buffer_credits));
+                                handle->h_total_credits));
  
        /*
         * Implement synchronous transaction batching.  If the handle
  
        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;
-       current->journal_info = NULL;
-       atomic_sub(handle->h_buffer_credits,
-                  &transaction->t_outstanding_credits);
  
        /*
         * If the handle is marked SYNC, we need to set another commit
-        * going!  We also want to force a commit if the current
-        * transaction is occupying too much of the log, or if the
-        * transaction is too old now.
+        * going!  We also want to force a commit if the transaction is too
+        * old now.
         */
        if (handle->h_sync ||
-           (atomic_read(&transaction->t_outstanding_credits) >
-            journal->j_max_transaction_buffers) ||
            time_after_eq(jiffies, transaction->t_expires)) {
                /* Do this even for aborted journals: an abort still
                 * completes the commit thread, it just doesn't write
                jbd_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
-               jbd2_log_start_commit(journal, transaction->t_tid);
+               jbd2_log_start_commit(journal, tid);
  
                /*
                 * Special case: JBD2_SYNC synchronous updates require us
        }
  
        /*
-        * Once we drop t_updates, if it goes to zero the transaction
-        * could start committing on us and eventually disappear.  So
-        * once we do this, we must not dereference transaction
-        * pointer again.
+        * Once stop_this_handle() drops t_updates, the transaction could start
+        * committing on us and eventually disappear.  So we must not
+        * dereference transaction pointer again after calling
+        * stop_this_handle().
         */
-       tid = transaction->t_tid;
-       if (atomic_dec_and_test(&transaction->t_updates)) {
-               wake_up(&journal->j_wait_updates);
-               if (journal->j_barrier_count)
-                       wake_up(&journal->j_wait_transaction_locked);
-       }
-       rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+       stop_this_handle(handle);
  
        if (wait_for_commit)
                err = jbd2_log_wait_commit(journal, tid);
  
-       if (handle->h_rsv_handle)
-               jbd2_journal_free_reserved(handle->h_rsv_handle);
  free_and_exit:
-       /*
-        * Scope of the GFP_NOFS context is over here and so we can restore the
-        * original alloc context.
-        */
-       memalloc_nofs_restore(handle->saved_alloc_context);
+       if (handle->h_rsv_handle)
+               jbd2_free_handle(handle->h_rsv_handle);
        jbd2_free_handle(handle);
        return err;
  }
   *
   * j_list_lock is held.
   *
-  * jbd_lock_bh_state(jh2bh(jh)) is held.
+  * jh->b_state_lock is held.
   */
  
  static inline void
@@@ -1902,7 -1938,7 +1938,7 @@@ __blist_add_buffer(struct journal_head 
   *
   * Called with j_list_lock held, and the journal may not be locked.
   *
-  * jbd_lock_bh_state(jh2bh(jh)) is held.
+  * jh->b_state_lock is held.
   */
  
  static inline void
@@@ -1934,7 -1970,7 +1970,7 @@@ static void __jbd2_journal_temp_unlink_
        transaction_t *transaction;
        struct buffer_head *bh = jh2bh(jh);
  
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        transaction = jh->b_transaction;
        if (transaction)
                assert_spin_locked(&transaction->t_journal->j_list_lock);
  }
  
  /*
-  * Remove buffer from all transactions.
+  * Remove buffer from all transactions. The caller is responsible for dropping
+  * the jh reference that belonged to the transaction.
   *
   * Called with bh_state lock and j_list_lock
-  *
-  * jh and bh may be already freed when this function returns.
   */
  static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
  {
        __jbd2_journal_temp_unlink_buffer(jh);
        jh->b_transaction = NULL;
-       jbd2_journal_put_journal_head(jh);
  }
  
  void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
  
        /* Get reference so that buffer cannot be freed before we unlock it */
        get_bh(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_unfile_buffer(jh);
        spin_unlock(&journal->j_list_lock);
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
+       jbd2_journal_put_journal_head(jh);
        __brelse(bh);
  }
  
  /*
   * Called from jbd2_journal_try_to_free_buffers().
   *
-  * Called under jbd_lock_bh_state(bh)
+  * Called under jh->b_state_lock
   */
  static void
  __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
@@@ -2088,10 -2123,10 +2123,10 @@@ int jbd2_journal_try_to_free_buffers(jo
                if (!jh)
                        continue;
  
-               jbd_lock_bh_state(bh);
+               spin_lock(&jh->b_state_lock);
                __journal_try_to_free_buffer(journal, bh);
+               spin_unlock(&jh->b_state_lock);
                jbd2_journal_put_journal_head(jh);
-               jbd_unlock_bh_state(bh);
                if (buffer_jbd(bh))
                        goto busy;
        } while ((bh = bh->b_this_page) != head);
@@@ -2112,7 -2147,7 +2147,7 @@@ busy
   *
   * Called under j_list_lock.
   *
-  * Called under jbd_lock_bh_state(bh).
+  * Called under jh->b_state_lock.
   */
  static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
  {
        } else {
                JBUFFER_TRACE(jh, "on running transaction");
                __jbd2_journal_unfile_buffer(jh);
+               jbd2_journal_put_journal_head(jh);
        }
        return may_free;
  }
@@@ -2199,18 -2235,15 +2235,15 @@@ static int journal_unmap_buffer(journal
         * holding the page lock. --sct
         */
  
-       if (!buffer_jbd(bh))
+       jh = jbd2_journal_grab_journal_head(bh);
+       if (!jh)
                goto zap_buffer_unlocked;
  
        /* OK, we have data buffer in journaled mode */
        write_lock(&journal->j_state_lock);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
  
-       jh = jbd2_journal_grab_journal_head(bh);
-       if (!jh)
-               goto zap_buffer_no_jh;
        /*
         * We cannot remove the buffer from checkpoint lists until the
         * transaction adding inode to orphan list (let's call it T)
                 * for commit and try again.
                 */
                if (partial_page) {
-                       jbd2_journal_put_journal_head(jh);
                        spin_unlock(&journal->j_list_lock);
-                       jbd_unlock_bh_state(bh);
+                       spin_unlock(&jh->b_state_lock);
                        write_unlock(&journal->j_state_lock);
+                       jbd2_journal_put_journal_head(jh);
                        return -EBUSY;
                }
                /*
                set_buffer_freed(bh);
                if (journal->j_running_transaction && buffer_jbddirty(bh))
                        jh->b_next_transaction = journal->j_running_transaction;
-               jbd2_journal_put_journal_head(jh);
                spin_unlock(&journal->j_list_lock);
-               jbd_unlock_bh_state(bh);
+               spin_unlock(&jh->b_state_lock);
                write_unlock(&journal->j_state_lock);
+               jbd2_journal_put_journal_head(jh);
                return 0;
        } else {
                /* Good, the buffer belongs to the running transaction.
@@@ -2331,11 -2364,10 +2364,10 @@@ zap_buffer
         * here.
         */
        jh->b_modified = 0;
-       jbd2_journal_put_journal_head(jh);
- zap_buffer_no_jh:
        spin_unlock(&journal->j_list_lock);
-       jbd_unlock_bh_state(bh);
+       spin_unlock(&jh->b_state_lock);
        write_unlock(&journal->j_state_lock);
+       jbd2_journal_put_journal_head(jh);
  zap_buffer_unlocked:
        clear_buffer_dirty(bh);
        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@@ -2422,7 -2454,7 +2454,7 @@@ void __jbd2_journal_file_buffer(struct 
        int was_dirty = 0;
        struct buffer_head *bh = jh2bh(jh);
  
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        assert_spin_locked(&transaction->t_journal->j_list_lock);
  
        J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
  void jbd2_journal_file_buffer(struct journal_head *jh,
                                transaction_t *transaction, int jlist)
  {
-       jbd_lock_bh_state(jh2bh(jh));
+       spin_lock(&jh->b_state_lock);
        spin_lock(&transaction->t_journal->j_list_lock);
        __jbd2_journal_file_buffer(jh, transaction, jlist);
        spin_unlock(&transaction->t_journal->j_list_lock);
-       jbd_unlock_bh_state(jh2bh(jh));
+       spin_unlock(&jh->b_state_lock);
  }
  
  /*
   * buffer on that transaction's metadata list.
   *
   * Called under j_list_lock
-  * Called under jbd_lock_bh_state(jh2bh(jh))
+  * Called under jh->b_state_lock
   *
-  * jh and bh may be already free when this function returns
+  * When this function returns true, there's no next transaction to refile to
+  * and the caller has to drop the jh reference through
+  * jbd2_journal_put_journal_head().
   */
- void __jbd2_journal_refile_buffer(struct journal_head *jh)
+ bool __jbd2_journal_refile_buffer(struct journal_head *jh)
  {
        int was_dirty, jlist;
        struct buffer_head *bh = jh2bh(jh);
  
-       J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+       lockdep_assert_held(&jh->b_state_lock);
        if (jh->b_transaction)
                assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
  
        /* If the buffer is now unused, just drop it. */
        if (jh->b_next_transaction == NULL) {
                __jbd2_journal_unfile_buffer(jh);
-               return;
+               return true;
        }
  
        /*
  
        if (was_dirty)
                set_buffer_jbddirty(bh);
+       return false;
  }
  
  /*
   */
  void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
  {
-       struct buffer_head *bh = jh2bh(jh);
+       bool drop;
  
-       /* Get reference so that buffer cannot be freed before we unlock it */
-       get_bh(bh);
-       jbd_lock_bh_state(bh);
+       spin_lock(&jh->b_state_lock);
        spin_lock(&journal->j_list_lock);
-       __jbd2_journal_refile_buffer(jh);
-       jbd_unlock_bh_state(bh);
+       drop = __jbd2_journal_refile_buffer(jh);
+       spin_unlock(&jh->b_state_lock);
        spin_unlock(&journal->j_list_lock);
-       __brelse(bh);
+       if (drop)
+               jbd2_journal_put_journal_head(jh);
  }
  
  /*
diff --combined fs/ocfs2/journal.c
@@@ -217,8 -217,7 +217,8 @@@ void ocfs2_recovery_exit(struct ocfs2_s
        /* At this point, we know that no more recovery threads can be
         * launched, so wait for any recovery completion work to
         * complete. */
 -      flush_workqueue(osb->ocfs2_wq);
 +      if (osb->ocfs2_wq)
 +              flush_workqueue(osb->ocfs2_wq);
  
        /*
         * Now that recovery is shut down, and the osb is about to be
@@@ -420,14 -419,14 +420,14 @@@ int ocfs2_extend_trans(handle_t *handle
        if (!nblocks)
                return 0;
  
-       old_nblocks = handle->h_buffer_credits;
+       old_nblocks = jbd2_handle_buffer_credits(handle);
  
        trace_ocfs2_extend_trans(old_nblocks, nblocks);
  
  #ifdef CONFIG_OCFS2_DEBUG_FS
        status = 1;
  #else
-       status = jbd2_journal_extend(handle, nblocks);
+       status = jbd2_journal_extend(handle, nblocks, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@@ -461,13 -460,13 +461,13 @@@ int ocfs2_allocate_extend_trans(handle_
  
        BUG_ON(!handle);
  
-       old_nblks = handle->h_buffer_credits;
+       old_nblks = jbd2_handle_buffer_credits(handle);
        trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
  
        if (old_nblks < thresh)
                return 0;
  
-       status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+       status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --combined include/linux/jbd2.h
@@@ -313,7 -313,6 +313,6 @@@ enum jbd_state_bits 
        BH_Revoked,             /* Has been revoked from the log */
        BH_RevokeValid,         /* Revoked flag is valid */
        BH_JBDDirty,            /* Is dirty but journaled */
-       BH_State,               /* Pins most journal_head state */
        BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
        BH_Shadow,              /* IO on shadow buffer is running */
        BH_Verified,            /* Metadata block has been verified ok */
@@@ -342,26 -341,6 +341,6 @@@ static inline struct journal_head *bh2j
        return bh->b_private;
  }
  
- static inline void jbd_lock_bh_state(struct buffer_head *bh)
- {
-       bit_spin_lock(BH_State, &bh->b_state);
- }
- static inline int jbd_trylock_bh_state(struct buffer_head *bh)
- {
-       return bit_spin_trylock(BH_State, &bh->b_state);
- }
- static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
- {
-       return bit_spin_is_locked(BH_State, &bh->b_state);
- }
- static inline void jbd_unlock_bh_state(struct buffer_head *bh)
- {
-       bit_spin_unlock(BH_State, &bh->b_state);
- }
  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
  {
        bit_spin_lock(BH_JournalHead, &bh->b_state);
@@@ -477,7 -456,9 +456,9 @@@ struct jbd2_revoke_table_s
   * @h_transaction: Which compound transaction is this update a part of?
   * @h_journal: Which journal handle belongs to - used iff h_reserved set.
   * @h_rsv_handle: Handle reserved for finishing the logical operation.
-  * @h_buffer_credits: Number of remaining buffers we are allowed to dirty.
+  * @h_total_credits: Number of remaining buffers we are allowed to add to
+  *    journal. These are dirty buffers and revoke descriptor blocks.
+  * @h_revoke_credits: Number of remaining revoke records available for handle
   * @h_ref: Reference count on this handle.
   * @h_err: Field for caller's use to track errors through large fs operations.
   * @h_sync: Flag for sync-on-close.
   * @h_type: For handle statistics.
   * @h_line_no: For handle statistics.
   * @h_start_jiffies: Handle Start time.
-  * @h_requested_credits: Holds @h_buffer_credits after handle is started.
+  * @h_requested_credits: Holds @h_total_credits after handle is started.
+  * @h_revoke_credits_requested: Holds @h_revoke_credits after handle is started.
   * @saved_alloc_context: Saved context while transaction is open.
   **/
  
@@@ -504,7 -486,9 +486,9 @@@ struct jbd2_journal_handl
        };
  
        handle_t                *h_rsv_handle;
-       int                     h_buffer_credits;
+       int                     h_total_credits;
+       int                     h_revoke_credits;
+       int                     h_revoke_credits_requested;
        int                     h_ref;
        int                     h_err;
  
@@@ -556,9 -540,9 +540,9 @@@ struct transaction_chp_stats_s 
   *      ->jbd_lock_bh_journal_head()  (This is "innermost")
   *
   *    j_state_lock
-  *    ->jbd_lock_bh_state()
+  *    ->b_state_lock
   *
-  *    jbd_lock_bh_state()
+  *    b_state_lock
   *    ->j_list_lock
   *
   *    j_state_lock
@@@ -681,12 -665,25 +665,25 @@@ struct transaction_
        atomic_t                t_updates;
  
        /*
-        * Number of buffers reserved for use by all handles in this transaction
-        * handle but not yet modified. [none]
+        * Number of blocks reserved for this transaction in the journal.
+        * This includes all credits reserved when starting transaction
+        * handles as well as all journal descriptor blocks needed for this
+        * transaction. [none]
         */
        atomic_t                t_outstanding_credits;
  
        /*
+        * Number of revoke records for this transaction added by already
+        * stopped handles. [none]
+        */
+       atomic_t                t_outstanding_revokes;
+       /*
+        * How many handles used this transaction? [none]
+        */
+       atomic_t                t_handle_count;
+       /*
         * Forward and backward links for the circular list of all transactions
         * awaiting checkpoint. [j_list_lock]
         */
        ktime_t                 t_start_time;
  
        /*
-        * How many handles used this transaction? [none]
-        */
-       atomic_t                t_handle_count;
-       /*
         * This transaction is being forced and some process is
         * waiting for it to finish.
         */
@@@ -1025,6 -1017,13 +1017,13 @@@ struct journal_
        int                     j_max_transaction_buffers;
  
        /**
+        * @j_revoke_records_per_block:
+        *
+        * Number of revoke records that fit in one descriptor block.
+        */
+       int                     j_revoke_records_per_block;
+       /**
         * @j_commit_interval:
         *
         * What is the maximum transaction lifetime before we begin a commit?
  #define jbd2_might_wait_for_commit(j) \
        do { \
                rwsem_acquire(&j->j_trans_commit_map, 0, 0, _THIS_IP_); \
 -              rwsem_release(&j->j_trans_commit_map, 1, _THIS_IP_); \
 +              rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \
        } while (0)
  
  /* journal feature predicate functions */
@@@ -1257,7 -1256,7 +1256,7 @@@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3,              CSU
  
  /* Filing buffers */
  extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
- extern void __jbd2_journal_refile_buffer(struct journal_head *);
+ extern bool __jbd2_journal_refile_buffer(struct journal_head *);
  extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
  extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
  extern void __journal_free_buffer(struct journal_head *bh);
@@@ -1358,14 -1357,16 +1357,16 @@@ static inline handle_t *journal_current
  
  extern handle_t *jbd2_journal_start(journal_t *, int nblocks);
  extern handle_t *jbd2__journal_start(journal_t *, int blocks, int rsv_blocks,
-                                    gfp_t gfp_mask, unsigned int type,
-                                    unsigned int line_no);
+                                    int revoke_records, gfp_t gfp_mask,
+                                    unsigned int type, unsigned int line_no);
  extern int     jbd2_journal_restart(handle_t *, int nblocks);
- extern int     jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask);
+ extern int     jbd2__journal_restart(handle_t *, int nblocks,
+                                      int revoke_records, gfp_t gfp_mask);
  extern int     jbd2_journal_start_reserved(handle_t *handle,
                                unsigned int type, unsigned int line_no);
  extern void    jbd2_journal_free_reserved(handle_t *handle);
- extern int     jbd2_journal_extend (handle_t *, int nblocks);
+ extern int     jbd2_journal_extend(handle_t *handle, int nblocks,
+                                    int revoke_records);
  extern int     jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
  extern int     jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
  extern int     jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
@@@ -1561,37 -1562,18 +1562,18 @@@ static inline int jbd2_journal_has_csum
  }
  
  /*
-  * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for
-  * transaction control blocks.
-  */
- #define JBD2_CONTROL_BLOCKS_SHIFT 5
- /*
-  * Return the minimum number of blocks which must be free in the journal
-  * before a new transaction may be started.  Must be called under j_state_lock.
-  */
- static inline int jbd2_space_needed(journal_t *journal)
- {
-       int nblocks = journal->j_max_transaction_buffers;
-       return nblocks + (nblocks >> JBD2_CONTROL_BLOCKS_SHIFT);
- }
- /*
   * Return number of free blocks in the log. Must be called under j_state_lock.
   */
  static inline unsigned long jbd2_log_space_left(journal_t *journal)
  {
        /* Allow for rounding errors */
-       unsigned long free = journal->j_free - 32;
+       long free = journal->j_free - 32;
  
        if (journal->j_committing_transaction) {
-               unsigned long committing = atomic_read(&journal->
-                       j_committing_transaction->t_outstanding_credits);
-               /* Transaction + control blocks */
-               free -= committing + (committing >> JBD2_CONTROL_BLOCKS_SHIFT);
+               free -= atomic_read(&journal->
+                         j_committing_transaction->t_outstanding_credits);
        }
-       return free;
+       return max_t(long, free, 0);
  }
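
Since the committing transaction's full t_outstanding_credits is now subtracted (the old JBD2_CONTROL_BLOCKS_SHIFT fudge is gone), the intermediate value can legitimately go negative, hence the signed type and the final clamp. A minimal sketch with assumed numbers:

    #include <stdio.h>

    static long log_space_left(long j_free, long committing_credits)
    {
            long free = j_free - 32; /* allow for rounding errors */

            free -= committing_credits;
            return free > 0 ? free : 0; /* mirrors max_t(long, free, 0) */
    }

    int main(void)
    {
            /* assumed: 1000 free blocks, 1200 credits still committing */
            printf("%ld\n", log_space_left(1000, 1200)); /* prints 0 */
            return 0;
    }
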
  
  /*
@@@ -1645,6 -1627,20 +1627,20 @@@ static inline tid_t  jbd2_get_latest_tr
        return tid;
  }
  
+ static inline int jbd2_handle_buffer_credits(handle_t *handle)
+ {
+       journal_t *journal;
+       if (!handle->h_reserved)
+               journal = handle->h_transaction->t_journal;
+       else
+               journal = handle->h_journal;
+       return handle->h_total_credits -
+               DIV_ROUND_UP(handle->h_revoke_credits_requested,
+                            journal->j_revoke_records_per_block);
+ }
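
Callers that used to read h_buffer_credits directly now go through this accessor, which reports only the credits usable for dirty buffers. For illustration, the accessor's arithmetic with concrete numbers; all three values are assumptions, not taken from the patch:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            int h_total_credits            = 100; /* assumed */
            int h_revoke_credits_requested = 600; /* assumed */
            int revoke_records_per_block   = 500; /* assumed */

            /* buffer credits = total minus the revoke-descriptor share */
            int buffer_credits = h_total_credits -
                    DIV_ROUND_UP(h_revoke_credits_requested,
                                 revoke_records_per_block);
            printf("buffer credits: %d\n", buffer_credits); /* prints 98 */
            return 0;
    }
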
  #ifdef __KERNEL__
  
  #define buffer_trace_init(bh) do {} while (0)